In [20]:
import pandas as pd
from tqdm import tqdm
#text preprocessing
import string
import re
import nltk

In [2]:
# Load the SMS corpus: tab-separated, no header row, so the two columns are
# named explicitly as 'label' and 'text'.
# NOTE(review): `error_bad_lines` was replaced by `on_bad_lines` in newer
# pandas releases -- confirm against the pandas version this notebook targets.
data = pd.read_csv(
    './SMSCollection.csv',
    sep='\t',
    header=None,
    names=['label', 'text'],
    encoding='latin-1',
    error_bad_lines=False,
)
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Report the class balance: total rows, then rows labelled 'spam' and 'ham'.
print("Out of {} rows ,{} are spam and {} are ham".format(
    len(data),
    int((data['label'] == 'spam').sum()),
    int((data['label'] == 'ham').sum()),
))

Out of 5572 rows ,747 are spam and 4825 are ham


In [4]:
stopword=nltk.corpus.stopwords.words("english")
st=nltk.SnowballStemmer("english")
# Frozen-set copy of the stop-word list so each membership test is a hash
# lookup instead of a linear scan of the list.
_stopword_set=frozenset(stopword)
# Compiled once; the raw string avoids the invalid "\W" escape-sequence warning.
_token_splitter=re.compile(r'\W+')

#function to perform basic cleaning tasks
def clean(text):
    """Turn one raw message into a list of stemmed tokens.

    Steps, in order:
      1. drop every character found in ``string.punctuation`` and lower-case
         the remaining characters;
      2. split on runs of non-word characters;
      3. discard empty strings (produced by leading/trailing separators) and
         English stop words;
      4. stem each surviving token with the Snowball stemmer ``st``.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        Stemmed tokens; empty when nothing survives the filtering.
    """
    no_punct="".join(char.lower() for char in text if char not in string.punctuation)
    tokens=_token_splitter.split(no_punct)
    # `token and ...` filters out the '' entries re.split emits at the edges.
    return [st.stem(token) for token in tokens if token and token not in _stopword_set]

In [5]:
#using tfidf for vectorizing data
from sklearn.feature_extraction.text import TfidfVectorizer
# Build a TF-IDF vectorizer that delegates all tokenisation to `clean`, then
# learn the vocabulary and transform the 'text' column in one step.
tfidf=TfidfVectorizer(analyzer=clean)
x_tfidf=tfidf.fit_transform(data['text'])
# Disabled debugging aid: prints the learned vocabulary.
#print(tfidf.get_feature_names())

In [6]:
# The vectorizer returns a sparse matrix; densify it into a DataFrame whose
# columns carry the vocabulary terms reported by the fitted vectorizer.
x_tfidfdf=pd.DataFrame(x_tfidf.toarray(),columns=tfidf.get_feature_names())

In [7]:
#feature engineering
def _non_space_length(message):
    """Return the number of characters in `message` that are not a space (' ')."""
    return len(message)-message.count(" ")

# Message length, ignoring spaces, as an engineered feature.
data['text_len']=data['text'].apply(_non_space_length)
data.head()

Unnamed: 0,label,text,text_len
0,ham,"Go until jurong point, crazy.. Available only ...",92
1,ham,Ok lar... Joking wif u oni...,24
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128
3,ham,U dun say so early hor... U c already then say...,39
4,ham,"Nah I don't think he goes to usf, he lives aro...",49


In [8]:
#create features for a % of the text which is punctuation

def count_punct(text):
    """Return the percentage of non-space characters in `text` that are punctuation.

    Parameters
    ----------
    text : str
        The message to inspect.

    Returns
    -------
    float
        ``100 * punctuation_count / non_space_length``, with the ratio rounded
        to three decimals before scaling. Returns ``0.0`` when `text` holds no
        non-space characters (empty or all spaces), where the ratio is
        undefined and the division would otherwise raise ZeroDivisionError.
    """
    non_space_len=len(text)-text.count(" ")
    if non_space_len==0:
        return 0.0
    count=sum(1 for char in text if char in string.punctuation)
    return round(count/non_space_len,3)*100
# Punctuation percentage per message; the same values are stored under both
# the 'text_feature' and 'punct%' column names.
data['text_feature']=data['text'].apply(count_punct)
data['punct%']=data['text_feature']
data.head()

Unnamed: 0,label,text,text_len,text_feature,punct%
0,ham,"Go until jurong point, crazy.. Available only ...",92,9.8,9.8
1,ham,Ok lar... Joking wif u oni...,24,25.0,25.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7,4.7
3,ham,U dun say so early hor... U c already then say...,39,15.4,15.4
4,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1,4.1


In [9]:
import numpy as np
from matplotlib import pyplot
# Compare the distribution of non-space message length for spam vs ham.
# 40 evenly spaced bin edges covering lengths 0..200.
bins=np.linspace(0,200,40)
# density=True scales each histogram to unit area so the two classes can be
# compared despite their different sizes. It replaces the `normed` keyword,
# which Matplotlib deprecated in 2.1 and removed in 3.1.
pyplot.hist(data[data['label']=='spam']['text_len'],bins,alpha=0.7,density=True,label='spam')
pyplot.hist(data[data['label']=='ham']['text_len'],bins,alpha=0.7,density=True,label='ham')
pyplot.xlabel('text_len (non-space characters)')
pyplot.ylabel('density')
pyplot.legend(loc='upper right')
pyplot.show()

The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  alternative="'density'", removal="3.1")


<Figure size 640x480 with 1 Axes>

In [10]:
from sklearn.ensemble import RandomForestClassifier
#cross validation and rfc
from sklearn.model_selection import KFold,cross_val_score
# Design matrix: the two engineered columns followed by one integer-labelled
# column per TF-IDF term, joined side by side on the row index.
tfidf_dense=pd.DataFrame(x_tfidf.toarray())
xfeatures=pd.concat([data[['text_len','punct%']],tfidf_dense],axis=1)

In [11]:
# Accuracy of a default-parameter random forest under 5-fold cross-validation
# on the full feature matrix; both the forest and the CV loop use all cores.
kfold=KFold(n_splits=5)
rf=RandomForestClassifier(n_jobs=-1)
cross_val_score(estimator=rf,X=xfeatures,y=data['label'],scoring='accuracy',cv=kfold,n_jobs=-1)

array([0.96950673, 0.97578475, 0.97037702, 0.96499102, 0.96947935])

In [12]:
#explore random forest through the holdout set
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set. No random_state is passed, so the
# partition (and every metric computed from it) differs from run to run.
xtrain,xtest,ytrain,ytest=train_test_split(xfeatures,data['label'],test_size=0.2)

In [13]:
# Random forest of 50 trees, each limited to depth 20, trained on a single
# core (n_jobs=1) using the training portion of the hold-out split.
rf=RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=1)
rf_model=rf.fit(xtrain,ytrain)

In [14]:
# Ten most important features, highest importance first. Sorting on the
# importance alone (key=...) stops ties from falling through to a comparison
# of the column labels, which mix strings ('text_len', 'punct%') with integer
# TF-IDF column indices and would raise TypeError when compared.
sorted(zip(rf_model.feature_importances_,xtrain.columns),key=lambda pair:pair[0],reverse=True)[:10]

[(0.056702808477674414, 'text_len'),
 (0.03297231197547871, 1816),
 (0.032970737797076585, 3152),
 (0.03272132749631371, 8144),
 (0.02870729984804416, 2047),
 (0.025914729846010825, 7385),
 (0.02075591505118663, 6774),
 (0.020428015305010018, 5755),
 (0.020188753487757206, 7498),
 (0.015587028414584885, 4826)]

In [15]:
# Predict the held-out rows and score them with 'spam' as the positive class.
ypred=rf_model.predict(xtest)
precision,recall,fscore,support=score(ytest,ypred,pos_label='spam',average='binary')
# With average='binary' the support slot comes back as None (see the output).
print(precision,recall,fscore,support)

1.0 0.6918238993710691 0.8178438661710037 None


In [16]:
#random forest with grid search
# Draw a fresh, unseeded 80/20 split; this rebinds the global
# xtrain/xtest/ytrain/ytest that train_rf reads.
xtrain,xtest,ytrain,ytest=train_test_split(xfeatures,data['label'],test_size=0.2)

In [17]:
def train_rf(n_est,depth):
    """Fit one random forest and print its hold-out precision, recall and accuracy.

    Parameters
    ----------
    n_est : int
        Passed to the classifier as ``n_estimators``.
    depth : int or None
        Passed to the classifier as ``max_depth``.

    Reads ``xtrain``, ``ytrain``, ``xtest`` and ``ytest`` from the notebook's
    global namespace. Prints one summary line and returns None.
    """
    forest=RandomForestClassifier(n_estimators=n_est,max_depth=depth,n_jobs=-1)
    predictions=forest.fit(xtrain,ytrain).predict(xtest)
    # Only precision and recall are reported; f-score and support are unused.
    prec,rec,_,_=score(ytest,predictions,pos_label='spam',average='binary')
    accuracy=(ytest==predictions).sum()/len(predictions)
    template='est {},depth {}---,precision {},recall {},accuracy {}'
    print(template.format(n_est,depth,round(prec,3),round(rec,3),round(accuracy,3)))
from tqdm import tqdm
# Manual grid search: 3 forest sizes x 4 depth limits (None = no limit), each
# combination fitted and scored once on the current hold-out split.
# The tqdm bar advances once per n_est value, not once per fit.
for n_est in tqdm([10,50,100]):
    for depth in [10,20,30,None]:
        train_rf(n_est,depth)

  0%|                                                    | 0/3 [00:00<?, ?it/s]

est 10,depth 10---,precision 1.0,recall 0.341,accuracy 0.918
est 10,depth 20---,precision 0.99,recall 0.703,accuracy 0.962
est 10,depth 30---,precision 0.989,recall 0.674,accuracy 0.959
est 10,depth None---,precision 0.991,recall 0.761,accuracy 0.97


 33%|██████████████▋                             | 1/3 [00:04<00:09,  4.75s/it]

est 50,depth 10---,precision 1.0,recall 0.203,accuracy 0.901
est 50,depth 20---,precision 1.0,recall 0.623,accuracy 0.953
est 50,depth 30---,precision 0.99,recall 0.688,accuracy 0.961
est 50,depth None---,precision 1.0,recall 0.768,accuracy 0.971


 67%|█████████████████████████████▎              | 2/3 [00:13<00:06,  6.02s/it]

est 100,depth 10---,precision 1.0,recall 0.326,accuracy 0.917
est 100,depth 20---,precision 1.0,recall 0.63,accuracy 0.954
est 100,depth 30---,precision 1.0,recall 0.688,accuracy 0.961
est 100,depth None---,precision 1.0,recall 0.826,accuracy 0.978


100%|████████████████████████████████████████████| 3/3 [00:30<00:00,  9.39s/it]


In [18]:
#random forest evaluation with gridsearchCV(cross validation+grid search)
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts built with the same `clean` analyzer used for TF-IDF.
count_vect=CountVectorizer(analyzer=clean)
x_count=count_vect.fit_transform(data['text'])

# Two design matrices that share the engineered columns and differ only in
# the vectorised text block: TF-IDF weights vs raw term counts.
# (A row-wise concat without axis=1 used to precede this and was immediately
# overwritten; it is dropped. x_count_feat now uses x_count, not x_tfidf.)
x_tfidf_feat=pd.concat([data['text_len'],data['punct%'],pd.DataFrame(x_tfidf.toarray())],axis=1)
x_count_feat=pd.concat([data['text_len'],data['punct%'],pd.DataFrame(x_count.toarray())],axis=1)

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over forest size and depth limit with 5-fold CV.
rf=RandomForestClassifier()
param=dict(n_estimators=[10,150,300],max_depth=[30,60,90,None])

gs=GridSearchCV(estimator=rf,param_grid=param,cv=5,n_jobs=-1)
# TODO: repeat this fit with x_count_feat.
gs_fit=gs.fit(x_tfidf_feat,data['label'])
# Five best parameter combinations by mean cross-validated score.
(
    pd.DataFrame(gs_fit.cv_results_)
    .sort_values('mean_test_score',ascending=False)
    .head(5)
)

In [25]:
#gradient boosting grid search
from sklearn.ensemble import GradientBoostingClassifier
#build own grid search
#random forest with grid search
# Draw a fresh, unseeded 80/20 split; this rebinds the global
# xtrain/xtest/ytrain/ytest that train_gb reads.
xtrain,xtest,ytrain,ytest=train_test_split(xfeatures,data['label'],test_size=0.2)

In [None]:
def train_gb(est,depth,lr):
    """Fit one gradient-boosting classifier and print its hold-out metrics.

    Parameters
    ----------
    est : int
        Passed to the classifier as ``n_estimators``.
    depth : int
        Passed to the classifier as ``max_depth``.
    lr : float
        Passed to the classifier as ``learning_rate``.

    Reads ``xtrain``, ``ytrain``, ``xtest`` and ``ytest`` from the notebook's
    global namespace. Prints one summary line (precision and recall for the
    'spam' class, plus overall accuracy) and returns None.
    """
    # Use the `est` argument itself; relying on a global that happens to be
    # named `n_est` breaks as soon as the caller's loop variable is renamed.
    gb=GradientBoostingClassifier(n_estimators=est,max_depth=depth,learning_rate=lr)
    gb_model=gb.fit(xtrain,ytrain)
    ypred=gb_model.predict(xtest)
    precision,recall,fscore,support=score(ytest,ypred,pos_label='spam',average='binary')
    accuracy=(ytest==ypred).sum()/len(ypred)
    print('est {},depth {},lr {},---,precision {},recall {},accuracy {}'.format(
        est,depth,lr,round(precision,3),round(recall,3),round(accuracy,3)))

from tqdm import tqdm
# Manual grid search: 3 ensemble sizes x 4 depths x 3 learning rates, each
# combination fitted and scored once on the current hold-out split.
# The tqdm bar advances once per n_est value; in the recorded run each of
# those outer iterations took several hours.
for n_est in tqdm([50,100,150]):
    for depth in [3,7,11,15]:
        for lr in[0.01,0.1,1]:
            train_gb(n_est,depth,lr)

  'precision', 'predicted', average, warn_for)


est 50,depth 3,lr 0.01,---,precision 0.0,recall 0.0,accuracy 0.875
est 50,depth 3,lr 0.1,---,precision 0.871,recall 0.727,accuracy 0.952
est 50,depth 3,lr 1,---,precision 0.842,recall 0.806,accuracy 0.957


  'precision', 'predicted', average, warn_for)


est 50,depth 7,lr 0.01,---,precision 0.0,recall 0.0,accuracy 0.875
est 50,depth 7,lr 0.1,---,precision 0.866,recall 0.791,accuracy 0.959
est 50,depth 7,lr 1,---,precision 0.806,recall 0.835,accuracy 0.954


  'precision', 'predicted', average, warn_for)


est 50,depth 11,lr 0.01,---,precision 0.0,recall 0.0,accuracy 0.875
est 50,depth 11,lr 0.1,---,precision 0.852,recall 0.827,accuracy 0.961
est 50,depth 11,lr 1,---,precision 0.812,recall 0.842,accuracy 0.956


  'precision', 'predicted', average, warn_for)


est 50,depth 15,lr 0.01,---,precision 0.0,recall 0.0,accuracy 0.875
est 50,depth 15,lr 0.1,---,precision 0.853,recall 0.835,accuracy 0.961
est 50,depth 15,lr 1,---,precision 0.811,recall 0.835,accuracy 0.955


 33%|████████████▎                        | 1/3 [4:39:17<9:18:34, 16757.18s/it]

est 100,depth 3,lr 0.01,---,precision 0.945,recall 0.374,accuracy 0.919
est 100,depth 3,lr 0.1,---,precision 0.884,recall 0.77,accuracy 0.959
est 100,depth 3,lr 1,---,precision 0.85,recall 0.813,accuracy 0.959
est 100,depth 7,lr 0.01,---,precision 0.863,recall 0.633,accuracy 0.942
est 100,depth 7,lr 0.1,---,precision 0.873,recall 0.842,accuracy 0.965
est 100,depth 7,lr 1,---,precision 0.832,recall 0.82,accuracy 0.957
est 100,depth 11,lr 0.01,---,precision 0.845,recall 0.705,accuracy 0.947
est 100,depth 11,lr 0.1,---,precision 0.871,recall 0.827,accuracy 0.963
est 100,depth 11,lr 1,---,precision 0.818,recall 0.842,accuracy 0.957
est 100,depth 15,lr 0.01,---,precision 0.835,recall 0.763,accuracy 0.952
est 100,depth 15,lr 0.1,---,precision 0.86,recall 0.842,accuracy 0.963
est 100,depth 15,lr 1,---,precision 0.814,recall 0.849,accuracy 0.957


 67%|████████████████████████            | 2/3 [15:38:23<6:33:13, 23593.96s/it]

est 150,depth 3,lr 0.01,---,precision 0.848,recall 0.561,accuracy 0.933
est 150,depth 3,lr 0.1,---,precision 0.883,recall 0.763,accuracy 0.958
est 150,depth 3,lr 1,---,precision 0.844,recall 0.82,accuracy 0.959
est 150,depth 7,lr 0.01,---,precision 0.862,recall 0.676,accuracy 0.946
est 150,depth 7,lr 0.1,---,precision 0.872,recall 0.835,accuracy 0.964
est 150,depth 7,lr 1,---,precision 0.827,recall 0.827,accuracy 0.957
est 150,depth 11,lr 0.01,---,precision 0.843,recall 0.734,accuracy 0.95
est 150,depth 11,lr 0.1,---,precision 0.853,recall 0.835,accuracy 0.961
est 150,depth 11,lr 1,---,precision 0.825,recall 0.849,accuracy 0.959


In [None]:
#gradient boosting with gridsearch CV
# Exhaustive search over ensemble size and tree depth at a fixed learning
# rate, scored with 5-fold CV.
gb=GradientBoostingClassifier()
param=dict(n_estimators=[100,150],max_depth=[7,11,15],learning_rate=[0.1])

gs=GridSearchCV(estimator=gb,param_grid=param,cv=5,n_jobs=-1)
# TODO: repeat this fit with x_count_feat.
gs_fit=gs.fit(x_tfidf_feat,data['label'])
# Five best parameter combinations by mean cross-validated score.
(
    pd.DataFrame(gs_fit.cv_results_)
    .sort_values('mean_test_score',ascending=False)
    .head(5)
)