In [1]:
import string
import pandas as pd
import numpy as np

from collections import Counter
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Python script for confusion matrix creation. 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

np.set_printoptions(precision=5)
%matplotlib inline
pd.options.display.max_columns=1000

In [2]:
# Load data
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['spam', 'text'])

# set categorical values of spam to 0 or 1
df['spam'] = df['spam'] == 'spam' # makes True/False instead of "spam" and "ham"
df['spam'] = df['spam'].astype(int)  # number values instead of boolean value

# Get rid of the punctuation
translator = str.maketrans('', '', string.punctuation)
df.text = df.text.apply(lambda x: x.translate(translator))

# Adding new feature 'length'
L = []
for i in df.text:
    L.append(len(i))
df['length'] = L
df.head()


#Create sub DataFrame
sub_df = df[['text', 'length']]

# Split train test
X_train,  X_test, y_train, y_test =  train_test_split(sub_df, df.spam.values, test_size=0.2, random_state=42)


Our first simple model will predict whether message is a spam or ham, using feature 'lenght' and 'num_words',  
which we create now:

In [3]:
# Use TweetTokenizer 
tknzr = TweetTokenizer()
X_train['text'] = X_train.text.apply(tknzr.tokenize)
X_test['text'] = X_test.text.apply(tknzr.tokenize)
X_train['text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


1978    [Reply, to, win, £, 100, weekly, Where, will, ...
3989    [Hello, Sort, of, out, in, town, already, That...
3935    [How, come, guoyang, go, n, tell, her, Then, u...
4078    [Hey, sathya, till, now, we, dint, meet, not, ...
4086    [Orange, brings, you, ringtones, from, all, ti...
Name: text, dtype: object

In [4]:
# Adding a new feature 'num_words' - how many words are in a message
X_train['num_words'] = X_train.text.apply(len)
X_test['num_words'] = X_test.text.apply(len)
X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,text,length,num_words
1978,"[Reply, to, win, £, 100, weekly, Where, will, ...",101,22
3989,"[Hello, Sort, of, out, in, town, already, That...",98,21
3935,"[How, come, guoyang, go, n, tell, her, Then, u...",46,11
4078,"[Hey, sathya, till, now, we, dint, meet, not, ...",95,20
4086,"[Orange, brings, you, ringtones, from, all, ti...",149,28


In [5]:
X_train = X_train[['length', 'num_words']].values
X_test = X_test[['length', 'num_words']].values
X_train

array([[101,  22],
       [ 98,  21],
       [ 46,  11],
       ...,
       [ 37,   5],
       [ 26,   5],
       [ 39,   8]], dtype=int64)

In [6]:
# We will used MinMaxScaler, which scales values in a way that our new values will be within itnerval <0,1>.
# ATTENTION! With train set we use .fit_transform method(), with test set only .transform()!!!
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)
X_train_sc



array([[0.11274, 0.12865],
       [0.10936, 0.12281],
       [0.05073, 0.06433],
       ...,
       [0.04059, 0.02924],
       [0.02818, 0.02924],
       [0.04284, 0.04678]])

### 1) Logistic Regression

In [7]:
# Logistic Regression model
clf = LogisticRegression(random_state=0, class_weight='balanced', solver='sag')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

Model score for training set: 0.7938074938299304
Model score for testing set: 0.7901345291479821


In [8]:
y_pred = clf.predict(X_test_sc)

In [9]:
results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Confusion Matrix :
 [[750 216]
 [ 18 131]]
Accuracy Score : 0.7901345291479821
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.38      0.88      0.53       149

   micro avg       0.79      0.79      0.79      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.79      0.82      1115



###### Try to change some of hyperparameters
Now change solver:

In [10]:
clf = LogisticRegression(random_state=0, class_weight='balanced', solver='newton-cg')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.7958267893201705
Model score for testing set: 0.7901345291479821
Confusion Matrix :
 [[751 215]
 [ 19 130]]
Accuracy Score : 0.7901345291479821
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.38      0.87      0.53       149

   micro avg       0.79      0.79      0.79      1115
   macro avg       0.68      0.82      0.70      1115
weighted avg       0.90      0.79      0.82      1115



In [11]:
clf = LogisticRegression(random_state=0, class_weight='balanced', solver='lbfgs')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.7958267893201705
Model score for testing set: 0.7901345291479821
Confusion Matrix :
 [[751 215]
 [ 19 130]]
Accuracy Score : 0.7901345291479821
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.38      0.87      0.53       149

   micro avg       0.79      0.79      0.79      1115
   macro avg       0.68      0.82      0.70      1115
weighted avg       0.90      0.79      0.82      1115



No significant change when changing solver. Try l1 instead of l2 and therefore different solver.

In [12]:
clf = LogisticRegression(penalty='l1',random_state=0, class_weight='balanced', solver='liblinear')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.8422705855956921
Model score for testing set: 0.8475336322869955
Confusion Matrix :
 [[817 149]
 [ 21 128]]
Accuracy Score : 0.8475336322869955
Report :                precision    recall  f1-score   support

           0       0.97      0.85      0.91       966
           1       0.46      0.86      0.60       149

   micro avg       0.85      0.85      0.85      1115
   macro avg       0.72      0.85      0.75      1115
weighted avg       0.91      0.85      0.87      1115





Changing C1 - inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

In [13]:
clf = LogisticRegression(penalty='l1',random_state=0, class_weight='balanced', solver='liblinear', C=0.1)

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.8021090419564729
Model score for testing set: 0.7973094170403587
Confusion Matrix :
 [[758 208]
 [ 18 131]]
Accuracy Score : 0.7973094170403587
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.39      0.88      0.54       149

   micro avg       0.80      0.80      0.80      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.80      0.83      1115



In [14]:
clf = LogisticRegression(penalty='l1',random_state=0, class_weight='balanced', solver='liblinear', C=0.5)

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.8357639667938075
Model score for testing set: 0.8448430493273542
Confusion Matrix :
 [[810 156]
 [ 17 132]]
Accuracy Score : 0.8448430493273542
Report :                precision    recall  f1-score   support

           0       0.98      0.84      0.90       966
           1       0.46      0.89      0.60       149

   micro avg       0.84      0.84      0.84      1115
   macro avg       0.72      0.86      0.75      1115
weighted avg       0.91      0.84      0.86      1115





### 2) SVM

In [15]:
clf = SVC(random_state=0, class_weight='balanced')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))



Model score for training set: 0.7870765088624635
Model score for testing set: 0.7883408071748879
Confusion Matrix :
 [[745 221]
 [ 15 134]]
Accuracy Score : 0.7883408071748879
Report :                precision    recall  f1-score   support

           0       0.98      0.77      0.86       966
           1       0.38      0.90      0.53       149

   micro avg       0.79      0.79      0.79      1115
   macro avg       0.68      0.84      0.70      1115
weighted avg       0.90      0.79      0.82      1115



In [16]:
clf = SVC(random_state=0, kernel='sigmoid', class_weight='balanced')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))



Model score for training set: 0.7839353825443123
Model score for testing set: 0.7820627802690583
Confusion Matrix :
 [[738 228]
 [ 15 134]]
Accuracy Score : 0.7820627802690583
Report :                precision    recall  f1-score   support

           0       0.98      0.76      0.86       966
           1       0.37      0.90      0.52       149

   micro avg       0.78      0.78      0.78      1115
   macro avg       0.68      0.83      0.69      1115
weighted avg       0.90      0.78      0.81      1115



In [17]:
clf = SVC(random_state=0, kernel='sigmoid', coef0=0.1, class_weight='balanced')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))



Model score for training set: 0.7825891855508189
Model score for testing set: 0.7820627802690583
Confusion Matrix :
 [[738 228]
 [ 15 134]]
Accuracy Score : 0.7820627802690583
Report :                precision    recall  f1-score   support

           0       0.98      0.76      0.86       966
           1       0.37      0.90      0.52       149

   micro avg       0.78      0.78      0.78      1115
   macro avg       0.68      0.83      0.69      1115
weighted avg       0.90      0.78      0.81      1115



### 3) Random Forest

In [18]:
clf = RandomForestClassifier(random_state=0, class_weight='balanced')

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.9313439533318375
Model score for testing set: 0.8798206278026905
Confusion Matrix :
 [[878  88]
 [ 46 103]]
Accuracy Score : 0.8798206278026905
Report :                precision    recall  f1-score   support

           0       0.95      0.91      0.93       966
           1       0.54      0.69      0.61       149

   micro avg       0.88      0.88      0.88      1115
   macro avg       0.74      0.80      0.77      1115
weighted avg       0.90      0.88      0.89      1115





In [19]:
clf = RandomForestClassifier(random_state=0, class_weight='balanced', n_estimators=2)

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.9234911375364595
Model score for testing set: 0.8923766816143498
Confusion Matrix :
 [[906  60]
 [ 60  89]]
Accuracy Score : 0.8923766816143498
Report :                precision    recall  f1-score   support

           0       0.94      0.94      0.94       966
           1       0.60      0.60      0.60       149

   micro avg       0.89      0.89      0.89      1115
   macro avg       0.77      0.77      0.77      1115
weighted avg       0.89      0.89      0.89      1115



In [20]:
clf = RandomForestClassifier(random_state=0, class_weight='balanced', bootstrap=False)

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))



Model score for training set: 0.9216962082118016
Model score for testing set: 0.8771300448430494
Confusion Matrix :
 [[868  98]
 [ 39 110]]
Accuracy Score : 0.8771300448430494
Report :                precision    recall  f1-score   support

           0       0.96      0.90      0.93       966
           1       0.53      0.74      0.62       149

   micro avg       0.88      0.88      0.88      1115
   macro avg       0.74      0.82      0.77      1115
weighted avg       0.90      0.88      0.89      1115



In [21]:
clf = RandomForestClassifier(random_state=0, class_weight='balanced', bootstrap=False, n_estimators=2)

model = clf.fit(X_train_sc, y_train)

print("Model score for training set:", model.score(X_train_sc, y_train))
print("Model score for testing set:", model.score(X_test_sc, y_test))

y_pred = clf.predict(X_test_sc)

results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :\n', results)
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ', classification_report(y_test, y_pred))

Model score for training set: 0.9216962082118016
Model score for testing set: 0.8753363228699551
Confusion Matrix :
 [[871  95]
 [ 44 105]]
Accuracy Score : 0.8753363228699551
Report :                precision    recall  f1-score   support

           0       0.95      0.90      0.93       966
           1       0.53      0.70      0.60       149

   micro avg       0.88      0.88      0.88      1115
   macro avg       0.74      0.80      0.76      1115
weighted avg       0.89      0.88      0.88      1115



In [22]:
# Load data
logreg = pd.read_csv('length_numw_logreg.txt')
logreg

Unnamed: 0,model,penalty,solver,C1,score_train,score_test,precision_0,precision_1,recall_0,recall_1,f1_score_0,f1_score_1
0,Logistic regression,l2,'sag',1.0,0.7938,0.7901,0.98,0.38,0.78,0.88,0.87,0.53
1,Logistic regression,l2,'newton-cg',1.0,0.7958,0.7901,0.98,0.38,0.78,0.87,0.87,0.53
2,Logistic regression,l2,'lbfgs',1.0,0.7958,0.7901,0.98,0.38,0.78,0.87,0.87,0.53
3,Logistic regression,l1,'liblinear',1.0,0.8423,0.8475,0.97,0.46,0.85,0.86,0.91,0.6
4,Logistic regression,l1,'liblinear',0.1,0.8021,0.7973,0.98,0.39,0.78,0.88,0.87,0.54
5,Logistic regression,l1,'liblinear',0.5,0.8357,0.8448,0.98,0.46,0.84,0.89,0.9,0.6


In [23]:
# Load data
svm = pd.read_csv('length_numw_SVM.txt')
svm

Unnamed: 0,model,kernel,coef0,score_train,score_test,precision_0,precision_1,recall_0,recall_1,f1_score_0,f1_score_1
0,SVC,'rbf',0.0,0.787,0.7883,0.98,0.38,0.77,0.9,0.86,0.53
1,SVC,'sigmoid',0.0,0.7839,0.782,0.98,0.37,0.76,0.9,0.86,0.52
2,SVC,'sigmoid',0.1,0.7826,0.7821,0.98,0.37,0.76,0.9,0.86,0.52


In [24]:
# Load data
ranfor = pd.read_csv('length_numw_ranfor.txt')
ranfor

Unnamed: 0,model,n_estimators,bootstrap,score_train,score_test,precision_0,precision_1,recall_0,recall_1,f1_score_0,f1_score_1
0,Random Forest,10,True,0.9313,0.8798,0.95,0.54,0.91,0.69,0.93,0.61
1,Random Forest,2,True,0.9235,0.8924,0.94,0.6,0.94,0.6,0.94,0.6
2,Random Forest,10,False,0.9217,0.8771,0.96,0.53,0.9,0.74,0.93,0.62
