In [1]:
import string
import pandas as pd
import numpy as np

from collections import Counter
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Metrics used to evaluate the classifiers: confusion matrix, accuracy and per-class report.
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

np.set_printoptions(precision=5)
%matplotlib inline
pd.options.display.max_columns=1000

In [2]:
# Load the tab-separated SMS corpus; the first column holds the label
# ("spam" or "ham"), the second the raw message text.
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['spam', 'text'])

# Encode the label as an integer: 1 for "spam", 0 otherwise.
df['spam'] = (df['spam'] == 'spam').astype(int)

# Strip ASCII punctuation from every message (vectorised string operation).
translator = str.maketrans('', '', string.punctuation)
df['text'] = df['text'].str.translate(translator)

# New feature 'length': number of characters left after punctuation removal.
df['length'] = df['text'].str.len()
df.head()

Unnamed: 0,spam,text,length
0,0,Go until jurong point crazy Available only in ...,102
1,0,Ok lar Joking wif u oni,23
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,149
3,0,U dun say so early hor U c already then say,43
4,0,Nah I dont think he goes to usf he lives aroun...,59


Our first simple model predicts whether a message is spam or ham using only the feature 'length'.

In [3]:
# Keep only the target ('spam') and the single predictor ('length').
df_len = df.loc[:, ['spam', 'length']]
df_len.head()

Unnamed: 0,spam,length
0,0,102
1,0,23
2,1,149
3,0,43
4,0,59


In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_len['length'].values,
    df_len['spam'].values,
    test_size=0.2,
    random_state=42,
)

We don't need to scale the feature, since we have only one. However, later models use more than this single feature, so as an exercise we scale it here as well.

In [5]:
# We use MinMaxScaler, which rescales values so that the training data lies within the interval [0, 1].
# ATTENTION: the scaler is fitted on the training set only (.fit_transform());
# the test set is only transformed (.transform()) with the training min/max.
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train[:, np.newaxis])
X_test_sc = scaler.transform(X_test[:, np.newaxis])



### 1) Logistic Regression

In [6]:
# Logistic regression on the scaled length feature, fitted with the 'sag' solver
# and balanced class weights.
clf = LogisticRegression(solver='sag', class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

Model score for training set: 0.8021090419564729
Model score for testing set: 0.7973094170403587


In [7]:
# Predicted labels for the held-out test set (`model` is the fitted `clf`).
y_pred = model.predict(X_test_sc)

In [8]:
# Evaluate the test-set predictions: confusion matrix, overall accuracy
# and the per-class precision/recall/F1 report.
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Confusion Matrix :
 [[758 208]
 [ 18 131]]
Accuracy Score : 0.7973094170403587
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.39      0.88      0.54       149

   micro avg       0.80      0.80      0.80      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.80      0.83      1115



###### Try changing some of the hyperparameters
First, change the solver:

In [24]:
# Same logistic regression, now fitted with the 'newton-cg' solver.
clf = LogisticRegression(solver='newton-cg', class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8021090419564729
Model score for testing set: 0.7973094170403587
Confusion Matrix :
 [[758 208]
 [ 18 131]]
Accuracy Score : 0.7973094170403587
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.39      0.88      0.54       149

   micro avg       0.80      0.80      0.80      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.80      0.83      1115



In [25]:
# Same logistic regression, now fitted with the 'lbfgs' solver.
clf = LogisticRegression(solver='lbfgs', class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8021090419564729
Model score for testing set: 0.7973094170403587
Confusion Matrix :
 [[758 208]
 [ 18 131]]
Accuracy Score : 0.7973094170403587
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.39      0.88      0.54       149

   micro avg       0.80      0.80      0.80      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.80      0.83      1115



No significant change when changing the solver. Next, try an L1 penalty instead of L2; this needs a solver that supports L1, so we switch to liblinear.

In [26]:
# Logistic regression with an L1 penalty, fitted with the 'liblinear' solver.
clf = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8041283374467131
Model score for testing set: 0.7982062780269058
Confusion Matrix :
 [[760 206]
 [ 19 130]]
Accuracy Score : 0.7982062780269058
Report :                precision    recall  f1-score   support

           0       0.98      0.79      0.87       966
           1       0.39      0.87      0.54       149

   micro avg       0.80      0.80      0.80      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.80      0.83      1115



Changing C — the inverse of regularization strength; it must be a positive float. As in support vector machines, smaller values specify stronger regularization.

In [27]:
# L1-penalised logistic regression with stronger regularisation (C=0.1).
clf = LogisticRegression(
    penalty='l1',
    C=0.1,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8021090419564729
Model score for testing set: 0.7973094170403587
Confusion Matrix :
 [[758 208]
 [ 18 131]]
Accuracy Score : 0.7973094170403587
Report :                precision    recall  f1-score   support

           0       0.98      0.78      0.87       966
           1       0.39      0.88      0.54       149

   micro avg       0.80      0.80      0.80      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.80      0.83      1115



In [28]:
# L1-penalised logistic regression with C=0.5.
clf = LogisticRegression(
    penalty='l1',
    C=0.5,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8041283374467131
Model score for testing set: 0.7982062780269058
Confusion Matrix :
 [[760 206]
 [ 19 130]]
Accuracy Score : 0.7982062780269058
Report :                precision    recall  f1-score   support

           0       0.98      0.79      0.87       966
           1       0.39      0.87      0.54       149

   micro avg       0.80      0.80      0.80      1115
   macro avg       0.68      0.83      0.70      1115
weighted avg       0.90      0.80      0.83      1115



### 2) SVM

In [30]:
# Support vector classifier with its default kernel and balanced class weights.
clf = SVC(class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)



Model score for training set: 0.7897689028494503
Model score for testing set: 0.7910313901345292
Confusion Matrix :
 [[747 219]
 [ 14 135]]
Accuracy Score : 0.7910313901345292
Report :                precision    recall  f1-score   support

           0       0.98      0.77      0.87       966
           1       0.38      0.91      0.54       149

   micro avg       0.79      0.79      0.79      1115
   macro avg       0.68      0.84      0.70      1115
weighted avg       0.90      0.79      0.82      1115



In [31]:
# Support vector classifier with a sigmoid kernel.
clf = SVC(kernel='sigmoid', class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)



Model score for training set: 0.7917881983396904
Model score for testing set: 0.7910313901345292
Confusion Matrix :
 [[747 219]
 [ 14 135]]
Accuracy Score : 0.7910313901345292
Report :                precision    recall  f1-score   support

           0       0.98      0.77      0.87       966
           1       0.38      0.91      0.54       149

   micro avg       0.79      0.79      0.79      1115
   macro avg       0.68      0.84      0.70      1115
weighted avg       0.90      0.79      0.82      1115



In [32]:
# Sigmoid-kernel SVC with the kernel's independent term set to coef0=0.1.
clf = SVC(kernel='sigmoid', coef0=0.1, class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)



Model score for training set: 0.7917881983396904
Model score for testing set: 0.7910313901345292
Confusion Matrix :
 [[747 219]
 [ 14 135]]
Accuracy Score : 0.7910313901345292
Report :                precision    recall  f1-score   support

           0       0.98      0.77      0.87       966
           1       0.38      0.91      0.54       149

   micro avg       0.79      0.79      0.79      1115
   macro avg       0.68      0.84      0.70      1115
weighted avg       0.90      0.79      0.82      1115



### 3) Random Forest

In [35]:
# Random forest with the default number of trees and balanced class weights.
clf = RandomForestClassifier(class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.849899035225488
Model score for testing set: 0.8493273542600897
Confusion Matrix :
 [[828 138]
 [ 30 119]]
Accuracy Score : 0.8493273542600897
Report :                precision    recall  f1-score   support

           0       0.97      0.86      0.91       966
           1       0.46      0.80      0.59       149

   micro avg       0.85      0.85      0.85      1115
   macro avg       0.71      0.83      0.75      1115
weighted avg       0.90      0.85      0.86      1115





In [36]:
# Random forest restricted to two trees.
clf = RandomForestClassifier(n_estimators=2, class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8335203051379851
Model score for testing set: 0.8358744394618834
Confusion Matrix :
 [[817 149]
 [ 34 115]]
Accuracy Score : 0.8358744394618834
Report :                precision    recall  f1-score   support

           0       0.96      0.85      0.90       966
           1       0.44      0.77      0.56       149

   micro avg       0.84      0.84      0.84      1115
   macro avg       0.70      0.81      0.73      1115
weighted avg       0.89      0.84      0.85      1115



In [38]:
# Random forest without bootstrap resampling (bootstrap=False).
clf = RandomForestClassifier(bootstrap=False, class_weight='balanced', random_state=0)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8472066412385012
Model score for testing set: 0.8430493273542601
Confusion Matrix :
 [[820 146]
 [ 29 120]]
Accuracy Score : 0.8430493273542601
Report :                precision    recall  f1-score   support

           0       0.97      0.85      0.90       966
           1       0.45      0.81      0.58       149

   micro avg       0.84      0.84      0.84      1115
   macro avg       0.71      0.83      0.74      1115
weighted avg       0.90      0.84      0.86      1115





In [11]:
# Random forest with two trees and no bootstrap resampling.
clf = RandomForestClassifier(
    n_estimators=2,
    bootstrap=False,
    class_weight='balanced',
    random_state=0,
)
model = clf.fit(X_train_sc, y_train)

# Mean accuracy on each split.
train_score = model.score(X_train_sc, y_train)
test_score = model.score(X_test_sc, y_test)
print("Model score for training set:", train_score)
print("Model score for testing set:", test_score)

# Test-set predictions and the metrics derived from them.
y_pred = model.predict(X_test_sc)
results = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Confusion Matrix :\n', results)
print('Accuracy Score :', test_accuracy)
print('Report : ', report)

Model score for training set: 0.8472066412385012
Model score for testing set: 0.8430493273542601
Confusion Matrix :
 [[820 146]
 [ 29 120]]
Accuracy Score : 0.8430493273542601
Report :                precision    recall  f1-score   support

           0       0.97      0.85      0.90       966
           1       0.45      0.81      0.58       149

   micro avg       0.84      0.84      0.84      1115
   macro avg       0.71      0.83      0.74      1115
weighted avg       0.90      0.84      0.86      1115

