In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5), "This notebook requires Python 3.5 or later"

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair rather than the raw version string:
# string comparison works character by character, so a version such as "0.9"
# would wrongly satisfy >= "0.20".
_sklearn_release = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_release is not None, \
    "Unrecognised scikit-learn version: " + sklearn.__version__
assert tuple(int(part) for part in _sklearn_release.groups()) >= (0, 20), \
    "This notebook requires scikit-learn 0.20 or later"

# Common imports
import numpy as np
import os
import pandas as pd
# For visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Lift pandas' display limits so frames are shown with every row and column
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Seed NumPy's global random generator so results repeat from run to run
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for the axis labels and for the x/y tick labels
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Figures are written to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
# NOTE(review): "decision_trees" looks unrelated to the SMS data loaded in
# this notebook - confirm the intended folder name.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the folder up front; exist_ok=True makes re-running this cell harmless
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str
        File extension, also passed to ``plt.savefig`` as ``format``.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# Common imports
import numpy as np
import os

### Loading the dataset from a given filepath:

In [2]:
# Fail fast if the dataset file is missing or unreadable. Nothing is read from
# the handle here, so it is closed straight away by the context manager
# instead of being left open until `h` is rebound.
with open("C:/Users/AVIK DAS/Desktop/CMI H.W. & Assignments/4th semester/AML/Assignment1/SMSSpamCollection","r") as h:
    pass

### Preprocessing the data:

Extracting each message and its label to create the full dataset (it is split into training, validation and test sets further below)

In [3]:
def messlabeldist(message):
    """Split one raw dataset line into ``[message_text, label]``.

    A line is expected to consist of the label, a tab character, the message
    text and (usually) a line terminator.

    Parameters
    ----------
    message : str
        One line of the tab-separated file, with or without its terminator.

    Returns
    -------
    list
        ``[text, label]`` - the message text first, then its label.

    Raises
    ------
    IndexError
        If the line contains no tab separator.
    """
    # Split on the first tab only, so tabs inside the message text are kept.
    parts = message.split("\t", 1)
    label = parts[0]
    # Remove just the line terminator. Slicing off the last character
    # unconditionally would truncate a final line with no trailing newline.
    text = parts[1].rstrip("\r\n")
    return [text, label]

In [4]:
# Parse the raw file line by line into a two-column DataFrame whose index is
# the 1-based position of each parsed record.
column_names = ["message", "label"]
Full_data = []
index_names = []
i = 0
# `with` guarantees the handle is closed even if parsing a line raises.
# The encoding is given explicitly so decoding does not depend on the platform
# default. NOTE(review): assumes the file is UTF-8 encoded - confirm for this copy.
with open("C:/Users/AVIK DAS/Desktop/CMI H.W. & Assignments/4th semester/AML/Assignment1/SMSSpamCollection","r", encoding="utf-8") as h:
    for line in h:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing inside messlabeldist.
            continue
        i += 1
        index_names.append(i)
        d = messlabeldist(line)
        Full_data.append(d)
df = pd.DataFrame(data = Full_data,
                  index = index_names,
                  columns = column_names)

In [5]:
# Spot-check the parsed data: display 15 randomly drawn rows of df
df.sample(15)

Unnamed: 0,message,label
3691,You still coming tonight?,ham
3528,"""HEY BABE! FAR 2 SPUN-OUT 2 SPK AT DA MO... DE...",ham
725,Ya even those cookies have jelly on them,ham
3371,Sorry i've not gone to that place. I.ll do so ...,ham
469,When are you going to ride your bike?,ham
5413,"Daddy, shu shu is looking 4 u... U wan me 2 te...",ham
4363,"Don't Think About ""What u Have Got"" Think Abou...",ham
4242,The LAY MAN! Just to let you know you are miss...,ham
5443,Thank you. do you generally date the brothas?,ham
5310,What you did in leave.,ham


### Train-Test-Validation splitting:

In [6]:
# train_test_split performs both cuts below
from sklearn.model_selection import train_test_split

# First cut: 85% of df is kept for training + validation, 15% becomes the test set
train_X, test_X, train_Y, test_Y = train_test_split(
    df.iloc[:, :1],   # first column of df
    df.iloc[:, 1:],   # remaining column(s) of df
    train_size=0.85,
    test_size=0.15,
    random_state=42,
)

# Second cut: 15% of that remainder is held out as the validation set
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_X,
    train_Y,
    train_size=0.85,
    test_size=0.15,
    random_state=42,
)

In [7]:
# Row counts of the training, test and validation feature frames, in that order
tuple(map(len, (train_X, test_X, valid_X)))

(4026, 837, 711)

In [8]:
# saving the dataframe
column_names = ["message", "label"]


def _export_split(X, Y, csv_path):
    """Pair messages with labels, build a 0-indexed frame and write it to CSV.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding a ``message`` column.
    Y : pandas.DataFrame
        Frame holding a ``label`` column, row-aligned with ``X``.
    csv_path : str
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    tuple
        ``(index_names, rows, frame)``: the 0..n-1 index list, the list of
        ``[message, label]`` rows and the DataFrame that was written.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """
    if len(X) != len(Y):
        raise ValueError("features and labels must have the same number of rows")
    # Walk both columns once. Calling list(X.message)[i] for each row, as was
    # done before, rebuilds the whole list per row and is quadratic.
    rows = [[message, label] for message, label in zip(X["message"], Y["label"])]
    index_names = list(range(len(rows)))
    frame = pd.DataFrame(data = rows,
                         index = index_names,
                         columns = column_names)
    frame.to_csv(csv_path)
    return index_names, rows, frame


train_index_names, train_data, train_df = _export_split(train_X, train_Y, 'train.csv')
test_index_names, test_data, test_df = _export_split(test_X, test_Y, 'test.csv')
valid_index_names, valid_data, valid_df = _export_split(valid_X, valid_Y, 'validation.csv')

In [9]:
# Summarise the training labels: row count, number of distinct labels,
# the most frequent label and how often it occurs
train_Y.describe()

Unnamed: 0,label
count,4026
unique,2
top,ham
freq,3508


In [10]:
# Summarise the test labels: row count, number of distinct labels,
# the most frequent label and how often it occurs
test_Y.describe()

Unnamed: 0,label
count,837
unique,2
top,ham
freq,712


In [11]:
# Summarise the validation labels: row count, number of distinct labels,
# the most frequent label and how often it occurs
valid_Y.describe()

Unnamed: 0,label
count,711
unique,2
top,ham
freq,607
