In [1]:
import numpy as np
import pandas as pd

### Read the SMS data

In [2]:
# Load the tab-separated SMS corpus. The column names are supplied explicitly
# through `names`, so the first line of the file is read as data, not a header.
sms_data = pd.read_csv(
    'sms.tsv',
    sep="\t",
    names=["label", "message"],
)
# Preview the first rows to confirm both columns were parsed.
sms_data.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Encode the string labels as integers for modelling: ham -> 0, spam -> 1.
# Series.map produces NaN for any label that is missing from the dictionary.
label_to_target = {'ham': 0, 'spam': 1}
sms_data['target'] = sms_data['label'].map(label_to_target)

In [4]:
# Show the first rows again to check the numeric `target` column.
sms_data.head(5)

Unnamed: 0,label,message,target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
# Summarise the class balance: absolute counts per label and their share (in
# percent) of all rows.
label_counts = sms_data['label'].value_counts()
label_percentages = label_counts / len(sms_data['label']) * 100

print("SMS data shape: ", sms_data.shape)
print("\nSMS data label value counts:\n", label_counts)
print("\nPercentage of label value counts:\n", label_percentages)

SMS data shape:  (5572, 3)

SMS data label value counts:
 ham     4825
spam     747
Name: label, dtype: int64

Percentage of label value counts:
 ham     86.593683
spam    13.406317
Name: label, dtype: float64


In [6]:
# Features are the raw message text; the target is the numeric ham/spam code.
X, y = sms_data['message'], sms_data['target']

### Create training and testing data

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. Stratifying on y keeps the ham/spam
# ratio the same in both splits, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the size of every split.
for part_name, part in zip(("X_train", "X_test", "y_train", "y_test"), split_parts):
    print(f"{part_name} shape: ", part.shape)

print("\nType of X_train: ", type(X_train))

X_train shape:  (4457,)
X_test shape:  (1115,)
y_train shape:  (4457,)
y_test shape:  (1115,)

Type of X_train:  <class 'pandas.core.series.Series'>


In [8]:
# Peek at a few raw training messages.
X_train.head(5)

184                              He will, you guys close?
2171    CAN I PLEASE COME UP NOW IMIN TOWN.DONTMATTER ...
5422              Ok k..sry i knw 2 siva..tats y i askd..
4113                            I'll see, but prolly yeah
4588    I'll see if I can swing by in a bit, got some ...
Name: message, dtype: object

### CountVectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the training messages only, so the
# test split does not influence which tokens become features.
bag_sms = CountVectorizer().fit(X_train)
# fit() returns the vectorizer itself; echo it to display its settings.
bag_sms

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
# Encode both splits as document-term count matrices using the vocabulary
# learned from the training messages (one row per message, one column per token).
X_train_dtm = bag_sms.transform(X_train)
X_test_dtm = bag_sms.transform(X_test)

# Count the tokens through the fitted `vocabulary_` mapping rather than
# `get_feature_names()`: that method was deprecated in scikit-learn 1.0 and
# removed in 1.2, while `vocabulary_` exists in every release.
print("Total number of tokens: ", len(bag_sms.vocabulary_))
print("X_train_dtm.shape: ", X_train_dtm.shape)
print("X_test_dtm.shape: ", X_test_dtm.shape)
print("\nX_train_dtm.type: ", type(X_train_dtm))

Total number of tokens:  7668
X_train_dtm.shape:  (4457, 7668)
X_test_dtm.shape:  (1115, 7668)

X_train_dtm.type:  <class 'scipy.sparse.csr.csr_matrix'>


In [11]:
# Converting to pandas DataFrame for better understanding.
# Column labels are the vocabulary terms ordered by their column index. They are
# derived from `vocabulary_` (term -> column index) because
# `get_feature_names()` was removed in scikit-learn 1.2.
bag_feature_names = sorted(bag_sms.vocabulary_, key=bag_sms.vocabulary_.get)
# NOTE: toarray() materialises the whole sparse matrix as a dense array; this is
# fine for a corpus of this size but should be avoided for larger ones.
pd_bag_sms = pd.DataFrame(data=X_train_dtm.toarray(), columns=bag_feature_names)
pd_bag_sms.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,02,0207,...,zed,zeros,zhong,zindgi,zoe,zogtorius,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Generate TF-IDF features

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the inverse-document-frequency weights from the training counts only.
# Settings: smoothed IDF weighting, with L2 normalisation of the output.
tfidf_sms = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True).fit(X_train_dtm)
# fit() returns the transformer itself; echo it to display its settings.
tfidf_sms

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [13]:
# Re-weight the count matrices of both splits (train first, then test) with the
# fitted TF-IDF transformer.
X_train_tfidf, X_test_tfidf = (
    tfidf_sms.transform(counts) for counts in (X_train_dtm, X_test_dtm)
)

In [14]:
# Report the shape of each TF-IDF matrix, then the container type.
for matrix_name, matrix in (("X_train_tfidf", X_train_tfidf), ("X_test_tfidf", X_test_tfidf)):
    print(f"{matrix_name}.shape: ", matrix.shape)
print("\nX_train_tfidf.type: ", type(X_train_tfidf))

X_train_tfidf.shape:  (4457, 7668)
X_test_tfidf.shape:  (1115, 7668)

X_train_tfidf.type:  <class 'scipy.sparse.csr.csr_matrix'>


In [15]:
# Converting to pandas DataFrame for better understanding.
# The TF-IDF matrix keeps the column layout of the count matrix, so the column
# labels are the vectorizer's vocabulary terms ordered by column index. They are
# derived from `vocabulary_` (term -> column index) because
# `get_feature_names()` was removed in scikit-learn 1.2.
tfidf_feature_names = sorted(bag_sms.vocabulary_, key=bag_sms.vocabulary_.get)
# NOTE: toarray() materialises the whole sparse matrix as a dense array; this is
# fine for a corpus of this size but should be avoided for larger ones.
pd_tfidf_sms = pd.DataFrame(data=X_train_tfidf.toarray(), columns=tfidf_feature_names)
pd_tfidf_sms.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,02,0207,...,zed,zeros,zhong,zindgi,zoe,zogtorius,zyada,èn,ú1,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Pickling

In [17]:
# Use the public `pickle` module. It already delegates to the C accelerator
# (`_pickle`) when that is available, whereas importing `_pickle` directly
# depends on a private CPython implementation detail.
import pickle

# Persist the feature matrices, the raw splits and the full frame for later
# notebooks. The objects are written back-to-back into ONE file, so a reader
# must call pickle.load() on the same open file handle once per object, in
# exactly this order.
artifacts = (
    X_train_dtm,
    X_test_dtm,
    X_train_tfidf,
    X_test_tfidf,
    X_train,
    X_test,
    y_train,
    y_test,
    sms_data,
)

with open('sms_train_test.dat', 'wb') as fp:
    for artifact in artifacts:
        pickle.dump(artifact, fp)