# Bhavya Sharma &nbsp;&nbsp;&nbsp;&nbsp;                                               Carnegie Mellon University

# The following project is a classification task to classify SMS as spam or not spam. The data was taken from UC Irvine Machine Learning repository https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [174]:
import nltk
from collections import Counter
import pandas as pd
import string
import numpy as np
import sklearn
import math
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

## Importing the dataset

In [129]:
spam_df = pd.read_csv("spam.csv", encoding = 'latin-1')

In [125]:
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### We have two major columns - 'v1' has binary values - ham (not spam) and spam. Text consists of the original SMS that was collected from various soruces. Unnamed columns are created due to overflow of text to next cell in CSV file. Lets begin by properly renaming the columns

In [130]:
spam_df.columns = ['label','text','ud1','ud2','ud3']
spam_df.head()

Unnamed: 0,label,text,ud1,ud2,ud3
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Lets check if the columns ud1, ud2 and ud3 are all null values

In [106]:
ud1 = list(spam_df.loc[:,'ud1'])
len(set(ud1))

#There are 44 unique values

44

In [109]:
set(ud1)

{' But at d end my love compromised me for everything:-(\\".. Gud mornin:-)"',
 ' Dont Come Near My Body..!! Bcoz My Hands May Not Come 2 Wipe Ur Tears Off That Time..!Gud ni8"',
 ' ENJOYIN INDIANS AT THE MO..yeP. SaLL gOoD HehE ;> hows bout u shexy? Pete Xx\\""',
 ' GOD said',
 ' Gud night...."',
 ' HAD A COOL NYTHO',
 ' HOPE UR OK... WILL GIVE U A BUZ WEDLUNCH. GO OUTSOMEWHERE 4 ADRINK IN TOWN..CUD GO 2WATERSHD 4 A BIT? PPL FROMWRK WILL BTHERE. LOVE PETEXXX.\\""',
 ' HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE JEN XXX\\""',
 " I don't mind",
 ' I\'ll come up"',
 ' PO Box 1146 MK45 2WT (2/3)"',
 ' PO Box 5249',
 ' SHE SHUDVETOLD U. DID URGRAN KNOW?NEWAY',
 ' \\"It is d wonderful fruit that a tree gives when it is being hurt by a stone.. Good night......"',
 ' always give response 2 who cares 4 U\\"... Gud night..swt dreams..take care"',
 ' b\'coz nobody will fight for u. Only u &amp; u have to fight for ur self &amp; win the battle. -VIVEKANAND- G 9t.. SD.."',
 ' bt not his girlfrnd.

In [111]:
ud2 = list(spam_df.loc[:,'ud2'])
ud3 = list(spam_df.loc[:,'ud3'])
print('Unique values in ud2:'+str(len(set(ud2))))
print('Unique values in ud3:'+str(len(set(ud3))))

Unique values in ud2:11
Unique values in ud3:6


### We can merge all these columns with the original text column to have only 2 columns for our classification problem

In [131]:
spam_df = spam_df.replace(np.nan, '', regex=True)
spam_df['text'] = spam_df[['text', 'ud1','ud2','ud3']].apply(lambda x: ''.join(x), axis=1)
spam_df = spam_df.drop(columns=['ud1','ud2','ud3'])
spam_df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Lets convert label column to 0 and 1, where 1 denotes spam and 0 denots ham

In [132]:
spam_df = spam_df.assign(label = pd.Series(np.where(spam_df.label.values == 'spam', 1, 0),
          spam_df.index))
spam_df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## We will use nltk library for processing the text and perform classification

In [133]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhavya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bhavya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bhavya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [134]:
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')


### The following function will lower the case, and handles the punctuations in the text of the dataframe followed by which it generates tokens for each text

In [136]:
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    text = text.lower()
    text = text.replace("'s","")
    text = text.replace("'","")
    for i in string.punctuation:
        text = text.replace(i," ")
    tokens = nltk.word_tokenize(text)
    for i in range (0,len(tokens)):
        tokens[i] = lemmatizer.lemmatize(tokens[i])
    return tokens
    pass

In [138]:
# Lets test our function:
print(process(spam_df.loc[0,'text']))
print(process(spam_df.loc[100,'text']))

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
['okay', 'name', 'ur', 'price', 'a', 'long', 'a', 'it', 'legal', 'wen', 'can', 'i', 'pick', 'them', 'up', 'y', 'u', 'ave', 'x', 'am', 'xx']


### In order to process all the rows of text and adding them back into dataframe, we can create the following function

In [139]:
def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    df1 = df.copy()
    df1 = df1.reset_index(drop=True)
    text = (df1.loc[:,'text'])
    processed = []
    for i in text:
        processed.append(process(i))
    df1 = df1.assign(text = processed)
    return df1


In [140]:
processed_spam = process_all(spam_df)
print(processed_spam.head())


   label                                               text
0      0  [go, until, jurong, point, crazy, available, o...
1      0                     [ok, lar, joking, wif, u, oni]
2      1  [free, entry, in, 2, a, wkly, comp, to, win, f...
3      0  [u, dun, say, so, early, hor, u, c, already, t...
4      0  [nah, i, dont, think, he, go, to, usf, he, lif...


### We need to create a TF-IDF vector for our text. We also need to remove the rare words and stop words. For the purpose of this classification, it can be assumed that the words that appear only once are rare and do not add much information to our problem

### Lets begin with getting a list of rare words in our text column

In [141]:
def get_rare_words(processed_spam):
    
    text = list(processed_spam.loc[:,'text'])
    check = [j for i in text for j in i] # create list of lists
    counter = Counter(check)
    a = []
    for i in counter.elements():
        if(counter[i]==1):
            a.append(i)
    return sorted(a)



In [142]:
rare_words = get_rare_words(processed_spam)
print(len(rare_words)) 

4175


## Classification

### For the purpose of classification, we will try Naives Bayes, Logisitic Regression, and Support Vector Machine (Linear Kernel) and select the the classifier which gives us the best metrics. We will also try our data on default model (classifies according to frequency of labels) to set a standard for our selected model.

### We will also use k-fold cross-validation to ensure that our models do not overfit. The cross-validation function will call each classifer and generate a dataframe of 'predicted' and 'actual' responses.

### We will create a confusion matrix for each classifer to assess its performance using the predicted and actual values

### Lets begin with creating a function to generate a sparse matrix of features for each sms (excluding the rarewords and stop words) and a numpy array for the labels. The classifiers will train on the matrix and predict using the TF-IDF object

In [247]:
def create_features(train, rare_words):
    text = train.loc[:,'text']
    text_string = []
    for i in range(0, len(text)):
        text_string.append(" ".join(str(x) for x in text[i]))
    tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
    stopwords=nltk.corpus.stopwords.words('english')
    tfidf.stop_words = stopwords + rare_words
    matrix = tfidf.fit_transform(text_string)
    y = np.array(train.loc[:,'label'])
    return(tfidf,matrix,y)
    


### The Classifier functions:

In [249]:
# Logistic Regression
def get_pred_logreg(train,test,rare_words):
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    (tfidf, X, y) = create_features(train, rare_words)
    lm = LogisticRegression()
    lm.fit(X,y)
    
    query = test.loc[:,'text']# get test input data
    #transform the test data
    query_string = []
    for i in range(0, len(query)):
        query_string.append(" ".join(str(x) for x in query[i]))
    query_matrix = tfidf.transform(query_string)
    predicted_response = lm.predict(query_matrix)
    
    return pd.DataFrame({'Predicted':predicted_response, 'Actual':test.loc[:,'label']})


In [303]:
#Support Vector Machine
def get_pred_svm(train,test,rare_words):
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    (tfidf, X, y) = create_features(train, rare_words)
    svm_model = svm.SVC(kernel = 'linear')
    svm_model.fit(X,y)
    
    query = test.loc[:,'text']# get test input data
    #transform the test data
    query_string = []
    for i in range(0, len(query)):
        query_string.append(" ".join(str(x) for x in query[i]))
    query_matrix = tfidf.transform(query_string)
    predicted_response = svm_model.predict(query_matrix)
    
    return pd.DataFrame({'Predicted':predicted_response, 'Actual':test.loc[:,'label']})


In [261]:
# Naive Bayes
def get_pred_nb(train,test,rare_words):
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    (tfidf, X, y) = create_features(train, rare_words)
    gnb = GaussianNB()
    X = X.toarray()
    gnb.fit(X,y)
    
    query = test.loc[:,'text']# get test input data
    #transform the test data
    query_string = []
    for i in range(0, len(query)):
        query_string.append(" ".join(str(x) for x in query[i]))
    query_matrix = tfidf.transform(query_string)
    query_matrix = query_matrix.toarray()
    predicted_response = gnb.predict(query_matrix)
    
    return pd.DataFrame({'Predicted':predicted_response, 'Actual':test.loc[:,'label']})


In [252]:
# Default Classifier
def get_pred_default(train,test):
    y = train.loc[:,'label']
    
    #Get the frequently occuring category in training data
    arity_name, arity_freq = np.unique(y, return_counts = True)
    for i in range(0,len(arity_freq)):
        if(arity_freq[i] == max(arity_freq)):
            pred_def = i
    predicted_response = [pred_def]*len(test)
    return pd.DataFrame({'Predicted':predicted_response, 'Actual':test.loc[:,'label']})

## The k-fold cross-validation function:

In [282]:
# Cross-validation function

def do_cv_class(df, num_folds, model_name):
    
    # Randomize the dataframe
    df = df.sample(frac=1)
    
    #creation of  k folds
    quotient = len(df)//num_folds
    q_array = np.array([quotient])
    
    #create array of size k with quotient as elements
    q_list = np.repeat(q_array, [num_folds], axis=0) 
    
    #distribute the remainder amongst first few folds
    remainder = len(df)%num_folds
    for i in range (0,remainder):
        q_list[i] = q_list[i]+1
    
    # create a list of starting indexes for the k-folds
    k_index=[]
    k_index.append(q_list[0]) 
    for i in range(1,len(q_list)):
        k_index.append(q_list[i] + k_index[i-1])

    # create test and train data for each fold and get the dataframe
    final_df = pd.DataFrame()
    for i in range(0,len(k_index)):
        if(i == 0):
            test = df.iloc[:k_index[i],:]
            train = df.drop(df.index[:k_index[i]])
            if (model_name == 'logreg'):
                output_df = get_pred_logreg(train,test,rare_words)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            elif(model_name == 'svm'):
                output_df = get_pred_svm(train,test,rare_words)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            elif(model_name == 'nb'):
                output_df = get_pred_nb(train,test,rare_words)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            elif(model_name == 'default'):
                output_df = get_pred_default(train,test)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            final_df = final_df.append(output_df)
        else:
            test = df.iloc[k_index[i-1]:k_index[i],:]
            train = df.drop(df.index[k_index[i-1]:k_index[i]])
            if (model_name == 'logreg'):
                output_df = get_pred_logreg(train,test,rare_words)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            elif(model_name == 'svm'):
                output_df = get_pred_svm(train,test,rare_words)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            elif(model_name == 'nb'):
                output_df = get_pred_nb(train,test,rare_words)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            elif(model_name == 'default'):
                output_df = get_pred_default(train,test)
                fold_list = [i]*len(output_df)
                output_df = output_df.assign(folds = fold_list)
            final_df = final_df.append(output_df)

    return final_df # k-size dataframe with predicted and actual values of k-folds


## Function to print confusion matrix and calculate performance metrics

In [273]:
def print_cont_table(pred, cutoff=0.5):

    # converting pandas dataframe to list
    # If using list comment below
    pred_data = pred.iloc[:, 0:2].copy()
    data_output = pd.np.array(pred_data)
    # If using list comment above

    predicted_output = [float(i[0]) for i in data_output] # first column of list are the Predicted values
    actual_output = [int(i[1]) for i in data_output] # Second column of list are the Actual values
    # returns list = [[FALSE, TRUE][IF CONDITION] FOR a row in LIST]
    predicted_output_manipulate = [[0, 1][x > cutoff] for x in predicted_output] # can also be performed by map(lambda x: [0, 1][x > cutoff], predicted_output)

    n11_TP = sum([[0, 1][predicted_output_manipulate[i] == 1 and actual_output[i] == 1] for i in range(len(predicted_output))])
    n00_TN = sum([[0, 1][predicted_output_manipulate[i] == 0 and actual_output[i] == 0] for i in range(len(predicted_output))])
    n10_FN = sum([[0, 1][predicted_output_manipulate[i] == 0 and actual_output[i] == 1] for i in range(len(predicted_output))])
    n01_FP = sum([[0, 1][predicted_output_manipulate[i] == 1 and actual_output[i] == 0] for i in range(len(predicted_output))])
    Pos = n11_TP + n10_FN
    Neg = n01_FP + n00_TN
    PPos = n11_TP + n01_FP
    PNeg = n10_FN + n00_TN
    print("           |  PPos \t PNeg \t | Sums")
    print("-------------------------------------")
    print("actual pos |  %d \t %d \t | %d" % (n11_TP, n10_FN, Pos))
    print("actual neg |  %d \t %d \t | %d" % (n01_FP, n00_TN, Neg)) 
    print("-------------------------------------") 
    print("Sums       |  %d \t %d \t | %d" % (PPos, PNeg, (Pos+Neg))) 
    return None

def get_metrics(pred, cutoff=0.5):
    ### your implementation goes here
    pred_data = pred.iloc[:, 0:2].copy()
    data_output = pd.np.array(pred_data)
    # If using list comment above

    predicted_output = [float(i[0]) for i in data_output] # first column of list are the Predicted values
    actual_output = [int(i[1]) for i in data_output] # Second column of list are the Actual values
    # returns list = [[FALSE, TRUE][IF CONDITION] FOR a row in LIST]
    predicted_output_manipulate = [[0, 1][x > cutoff] for x in predicted_output] # can also be performed by map(lambda x: [0, 1][x > cutoff], predicted_output)

    n11_TP = sum([[0, 1][predicted_output_manipulate[i] == 1 and actual_output[i] == 1] for i in range(len(predicted_output))])
    n00_TN = sum([[0, 1][predicted_output_manipulate[i] == 0 and actual_output[i] == 0] for i in range(len(predicted_output))])
    n10_FN = sum([[0, 1][predicted_output_manipulate[i] == 0 and actual_output[i] == 1] for i in range(len(predicted_output))])
    n01_FP = sum([[0, 1][predicted_output_manipulate[i] == 1 and actual_output[i] == 0] for i in range(len(predicted_output))])
    Pos = n11_TP + n10_FN
    Neg = n01_FP + n00_TN
    PPos = n11_TP + n01_FP
    PNeg = n10_FN + n00_TN
    
    if(Neg == 0):
        Accuracy = float(n11_TP+n00_TN)/float(Pos+Neg)
        Recall = float(n11_TP)/float(Pos)
        Precision = float(n11_TP)/float(n11_TP+n01_FP)
        TP_rate = float(n11_TP)/float(n11_TP+n10_FN)
        FP_rate = 'N/A'
    elif(Pos == 0):
        TP_rate = 'N/A'
        FP_rate = float(n01_FP)/float(n00_TN+n01_FP)
        Accuracy = float(n11_TP+n00_TN)/float(Pos+Neg)
        Precision = float(n11_TP)/float(n11_TP+n01_FP)
        Recall = 'N/A'
    else:
        TP_rate = float(n11_TP)/float(n11_TP+n10_FN)
        FP_rate = float(n01_FP)/float(n00_TN+n01_FP)
        Accuracy = float(n11_TP+n00_TN)/float(Pos+Neg)
        Precision = float(n11_TP)/float(n11_TP+n01_FP)
        Recall = float(n11_TP)/float(Pos)

    return pd.DataFrame({'True Positive Rate':TP_rate, 'False Positive Rate':FP_rate, 'Accuracy':Accuracy, 'Precision':Precision, 'Recall':Recall}, index = [0])



## Evaluating Models using Cross-validation (k = 10)

In [274]:
#DEFAULT/BASE MODEL
tmp = do_cv_class(processed_spam,10,'default') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  0 	 0 	 | 0
actual neg |  747 	 4825 	 | 5572
-------------------------------------
Sums       |  747 	 4825 	 | 5572
   Accuracy  False Positive Rate  Precision Recall True Positive Rate
0  0.865937             0.134063        0.0    N/A                N/A


## Since the number of not-spam labels are significantly higher than spam labels in our data, the accuracy of the base model is 86%. Hence, on classifying all SMS as not-spam, we will still be 86% accurate about our classification. Thus, our chosen model needs be highly accurate to beat the base model AND it must have lower false-positive rate to prevent not-spams from getting classified as spam messages. (Assuming the cost of getting not-spams is higher than preventing spams)


In [268]:
#LOGISTIC REGRESSION

tmp = do_cv_class(processed_spam,10,'logreg') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  574 	 10 	 | 584
actual neg |  173 	 4815 	 | 4988
-------------------------------------
Sums       |  747 	 4825 	 | 5572
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0  0.967157             0.034683   0.768407  0.982877            0.982877


In [270]:
#NAIVES BAYES
tmp = do_cv_class(processed_spam,10,'nb') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  662 	 554 	 | 1216
actual neg |  85 	 4271 	 | 4356
-------------------------------------
Sums       |  747 	 4825 	 | 5572
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0  0.885319             0.019513   0.886212  0.544408            0.544408


In [304]:
#Linear SVM
tmp = do_cv_class(processed_spam,10,'svm') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  674 	 7 	 | 681
actual neg |  73 	 4818 	 | 4891
-------------------------------------
Sums       |  747 	 4825 	 | 5572
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0  0.985642             0.014925   0.902276  0.989721            0.989721


## Linear SVM gives us the highest accuracy and lowest false-positive rate and hence is the model of our choice

### Evaluation on a validation set (taken out from the original dataset)

In [311]:
df = processed_spam.copy()
df = df.sample(frac=1)
train_set = df.iloc[:4000]
train_set.head()


Unnamed: 0,label,text
4663,0,"[mum, not, going, robinson, already]"
3134,0,"[wat, make, some, people, dearer, is, not, jus..."
61,0,"[ha, ha, ha, good, joke, girl, are, situation,..."
5243,0,"[of, course, dont, tease, me, you, know, i, si..."
1950,0,"[oh, ic, i, thought, you, meant, mary, jane]"


In [312]:
validation_set = df.iloc[4000:]
validation_set.head()

Unnamed: 0,label,text
3222,0,"[well, that, must, be, a, pain, to, catch]"
1392,0,"[thk, shld, b, can, ya, i, wana, go, 4, lesson..."
4779,0,"[sen, told, that, he, is, going, to, join, his..."
4500,0,"[so, wat, da, decision]"
5122,0,"[not, enufcredeit, tocall, shall, ileave, uni,..."


In [314]:
#DEFAULT/BASE MODEL
tmp = do_cv_class(train_set,10,'default') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  0 	 0 	 | 0
actual neg |  539 	 3461 	 | 4000
-------------------------------------
Sums       |  539 	 3461 	 | 4000
   Accuracy  False Positive Rate  Precision Recall True Positive Rate
0   0.86525              0.13475        0.0    N/A                N/A


In [319]:
#performance of Base model on validation set
valid = get_pred_default(train_set,validation_set)
print_cont_table(valid)
print(get_metrics(valid))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  0 	 0 	 | 0
actual neg |  208 	 1364 	 | 1572
-------------------------------------
Sums       |  208 	 1364 	 | 1572
   Accuracy  False Positive Rate  Precision Recall True Positive Rate
0  0.867684             0.132316        0.0    N/A                N/A


In [315]:
#LOGISTIC REGRESSION

tmp = do_cv_class(train_set,10,'logreg') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  379 	 8 	 | 387
actual neg |  160 	 3453 	 | 3613
-------------------------------------
Sums       |  539 	 3461 	 | 4000
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0     0.958             0.044285   0.703154  0.979328            0.979328


In [321]:
#performance of Logreg model on validation set
valid = get_pred_logreg(train_set,validation_set,rare_words)
print_cont_table(valid)
print(get_metrics(valid))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  168 	 2 	 | 170
actual neg |  40 	 1362 	 | 1402
-------------------------------------
Sums       |  208 	 1364 	 | 1572
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0  0.973282             0.028531   0.807692  0.988235            0.988235


In [322]:
#NAIVES BAYES
tmp = do_cv_class(train_set,10,'nb') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))

           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  465 	 387 	 | 852
actual neg |  74 	 3074 	 | 3148
-------------------------------------
Sums       |  539 	 3461 	 | 4000
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0   0.88475             0.023507   0.862709  0.545775            0.545775


In [323]:
#performance of Naives Bayes on validation set
valid = get_pred_nb(train_set,validation_set,rare_words)
print_cont_table(valid)
print(get_metrics(valid))

           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  194 	 147 	 | 341
actual neg |  14 	 1217 	 | 1231
-------------------------------------
Sums       |  208 	 1364 	 | 1572
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0  0.897583             0.011373   0.932692  0.568915            0.568915


In [324]:
#Linear SVM
tmp = do_cv_class(train_set,10,'svm') # returns pandas dataframe
print_cont_table(tmp.iloc[:, 0:2])
print(get_metrics(tmp.iloc[:, 0:2]))


           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  475 	 9 	 | 484
actual neg |  64 	 3452 	 | 3516
-------------------------------------
Sums       |  539 	 3461 	 | 4000
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0   0.98175             0.018203   0.881262  0.981405            0.981405


In [325]:
#performance of Linear SVM on validation set
valid = get_pred_svm(train_set,validation_set,rare_words)
print_cont_table(valid)
print(get_metrics(valid))

           |  PPos 	 PNeg 	 | Sums
-------------------------------------
actual pos |  192 	 3 	 | 195
actual neg |  16 	 1361 	 | 1377
-------------------------------------
Sums       |  208 	 1364 	 | 1572
   Accuracy  False Positive Rate  Precision    Recall  True Positive Rate
0  0.987913             0.011619   0.923077  0.984615            0.984615


## Linear SVM hence maintains the highest accuracy and lowest False Positive Rate on both k-fold cross validation and 2-step k-fold cross-validaiton approach. SVM hence is our chosen model