# Assignment 1 : SPAM classification 

## Importing the importing the necessary libraries and models

In [1]:
import sys

import numpy as np

import pandas as pd

import sklearn

import spacy

from sklearn.model_selection import train_test_split, GridSearchCV 

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, confusion_matrix


## Loading the data set

In [2]:
df = pd.read_csv('SMSSpamCollection.txt',  sep="\t", header=None, names=["label", "SMS"])

In [3]:
df.head()

Unnamed: 0,label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Load model to return language object

In [4]:
nlp = spacy.load("en_core_web_sm")

## Calling nlp on our sms texts to return a processed Doc for each one


In [5]:
#df['doc'] = [nlp(text) for text in df.SMS]
df['doc'] = list(nlp.pipe(df.SMS))

## Adding a column for lemmatization and stopword filtering

In [6]:
def stopwordlessLemma(tokens):
        return [token.lemma_ for token in tokens if not token.is_stop]

df['stopwordlessLemma'] = df['doc'].apply(stopwordlessLemma)

## Adding a column for POS tagging and stopword filtering

In [7]:
def addPOS(tokens):
    return [token.pos_ for token in tokens if not token.is_stop]

df['POS'] = df['doc'].apply(addPOS)


## preprocessed data:

In [8]:
df.head()

Unnamed: 0,label,SMS,doc,stopwordlessLemma,POS
0,ham,"Go until jurong point, crazy.. Available only ...","(Go, until, jurong, point, ,, crazy, .., Avail...","[jurong, point, ,, crazy, .., available, bugis...","[PROPN, NOUN, PUNCT, ADJ, PUNCT, ADJ, PROPN, P..."
1,ham,Ok lar... Joking wif u oni...,"(Ok, lar, ..., Joking, wif, u, oni, ...)","[ok, lar, ..., joking, wif, u, oni, ...]","[INTJ, NOUN, PUNCT, NOUN, NOUN, NOUN, NOUN, PU..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"(Free, entry, in, 2, a, wkly, comp, to, win, F...","[free, entry, 2, wkly, comp, win, FA, Cup, fin...","[ADJ, NOUN, NUM, ADJ, NOUN, VERB, PROPN, PROPN..."
3,ham,U dun say so early hor... U c already then say...,"(U, dun, say, so, early, hor, ..., U, c, alrea...","[u, dun, early, hor, ..., U, c, ...]","[PRON, NOUN, ADJ, NOUN, PUNCT, PROPN, PROPN, P..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","(Nah, I, do, n't, think, he, goes, to, usf, ,,...","[nah, think, go, usf, ,, live]","[ADV, VERB, VERB, PROPN, PUNCT, VERB]"


 #### we are going to train and test the models with the data "stopwordlessLemma", so:
 ##### 1. we select the right column:

In [9]:
#data = [' '.join(sms) for sms in df["stopwordlessLemma"]]

def dummy_fun(doc):
    return doc


In [10]:
data = df["stopwordlessLemma"]
label = df["label"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.3, stratify = label)
#"stratify = y" parameter will help us to have the same ratio of samp/ham in the train and the test sets(disjoint sets)

##### 2. we vectorise the sequence of lemmas for each sms, using Tfi-df Vectorizer:

In [12]:
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun)



In [13]:
X = vectorizer.fit_transform(X_train)

print(X.shape)

(3900, 7619)


##### 3. now we have the X matrix and y vector, so we can split the data into train and test sets:

## Training the models:
### Random forest: 
##### to tune the hyper parameters, we'are using grid search to find the best combination to have the greatest accuracy score

In [14]:
rfc=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [720, 800],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [9, 10, 11, 12],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X, y_train)
CV_rfc.best_params_


{'criterion': 'gini',
 'max_depth': 12,
 'max_features': 'auto',
 'n_estimators': 800}

##### now we use the test set to see if the model is good enough, and as you can see the accuracy was 90%

In [15]:
pred=CV_rfc.predict(vectorizer.transform(X_test))

print("Test predict accuracy score: {:.2f}".format(accuracy_score(y_test,pred)),"\n")

cm = confusion_matrix(y_test, pred)
print('Confusion matrix')

print(cm)

Test predict accuracy score: 0.91 

Confusion matrix
[[1448    0]
 [ 157   67]]


### Logistic regression
##### we'are using grid search again to tune the parameters

In [16]:
# Creating the model:
lr = LogisticRegression() 

grid = {
    'C':np.logspace(0.01,1,100), 
    'penalty': ['l1', 'l2','elasticnet']
}

# Creating GridSearchCV model:
lr_cv = GridSearchCV(lr, grid, cv=10) 
# Training the model:
lr_cv.fit(X, y_train)

print('best paremeters for logistic regression: ', lr_cv.best_params_)
print('best score for logistic regression after grid search cv:', lr_cv.best_score_)
lr_cv.best_params_["C"]

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' pena

best paremeters for logistic regression:  {'C': 9.33254300796991, 'penalty': 'l2'}
best score for logistic regression after grid search cv: 0.9764102564102565


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.



9.33254300796991

In [17]:
lr_tuned = LogisticRegression(C=lr_cv.best_params_["C"], penalty=lr_cv.best_params_["penalty"])
lr_tuned.fit(X, y_train)

LogisticRegression(C=9.33254300796991, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

##### for this model the accuracy  is 98%

In [18]:
pred=lr_tuned.predict(vectorizer.transform(X_test))

print("Test predict accuracy score: {:.2f}".format(accuracy_score(y_test,pred)),"\n")

cm = confusion_matrix(y_test, pred)
print('Confusion matrix')

print(cm)

Test predict accuracy score: 0.98 

Confusion matrix
[[1441    7]
 [  29  195]]


### Naive Bayes
##### there's no parameters to tune for Naive Bayes

In [19]:
NB = GaussianNB()
NB.fit(X.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [20]:
pred=NB.predict(vectorizer.transform(X_test).toarray())

print("Test predict accuracy score: {:.2f}".format(accuracy_score(y_test,pred)),"\n")

cm = confusion_matrix(y_test, pred)
print('Confusion matrix')

print(cm)

Test predict accuracy score: 0.88 

Confusion matrix
[[1273  175]
 [  21  203]]
