In [None]:
!unzip '/content/assignment3_train.zip'
!unzip '/content/assignment3_test.zip'

In [2]:
import os
import pandas as pd
from pandas import DataFrame

In [3]:
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides numerical (e.g. overflow) and deprecation
# warnings raised by later cells.
warnings.filterwarnings("ignore")

In [4]:
def files_preprocess(path):
    '''
    Load a labelled e-mail corpus from disk into a pandas DataFrame.

    The corpus is expected to be laid out as ``path/ham/*`` and
    ``path/spam/*``; every file inside those two folders becomes one row.

    Parameters
    ----------
    path : str
        Root directory of the corpus. A trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``msg`` (raw file text, decoded as ISO-8859-1) and ``label``
        (ham - 1; spam - 0), in the order returned by ``os.listdir``.
    '''
    label_for_dir = {'spam': 0, 'ham': 1}

    file_data = []
    file_label = []

    for dir_name in os.listdir(path):
        dir_path = os.path.join(path, dir_name)
        # Anything that is not a ham/spam folder (stray files, extra
        # directories) is skipped instead of crashing or silently inheriting
        # the label of the previously read message.
        if dir_name not in label_for_dir or not os.path.isdir(dir_path):
            continue

        for file_name in os.listdir(dir_path):
            file_path = os.path.join(dir_path, file_name)
            # The context manager closes the handle even if read() fails.
            with open(file_path, 'r', encoding='ISO-8859-1') as handle:
                file_data.append(handle.read())
            file_label.append(label_for_dir[dir_name])

    return pd.DataFrame({'msg': file_data, 'label': file_label}, columns=['msg', 'label'])

In [5]:
# Load the training split into a DataFrame (columns: msg, label) and display it.
train_data = files_preprocess('/content/train/')
train_data

Unnamed: 0,msg,label
0,Subject: additional information : access reque...,1
1,Subject: re : feedback monitor error - meter 9...,1
2,"Subject: re : coastal oil & gas usa , l . p . ...",1
3,"Subject: 98 - 2601\nhi daren ,\ni ' m attempti...",1
4,Subject: 75 th anniversary celebration\nthank ...,1
...,...,...
458,Subject: for 9 or\nemail loading . . . %\nwise...,0
459,"Subject: re : account summary\nthu , 03 feb 20...",0
460,Subject: bro - im finished\nunbelievably cheap...,0
461,Subject: best software prices . incorrectness ...,0


In [6]:
# Load the test split into a DataFrame (columns: msg, label) and display it.
test_data = files_preprocess('/content/test/')
test_data

Unnamed: 0,msg,label
0,Subject: re : noms / actual flow for 3 / 29 / ...,1
1,"Subject: flagstaff on 11 / 10\ndaren ,\na smal...",1
2,Subject: lst rev may 2000 hpl nom\nfyi\n- - - ...,1
3,"Subject: budget help ! !\nsalaries , salaries ...",1
4,Subject: spot deals\ni have created and entere...,1
...,...,...
473,Subject: forget working out\njust thought you ...,0
474,Subject: smoke 9885\ncgfsaw 91 cmdaawlolmrlbw ...,0
475,Subject: arturo hydrate eyesight humorous depa...,0
476,Subject: \nthe only fix to penis growth\nlimit...,0


True label counts (1 = ham, 0 = spam)

In [7]:
# Number of training messages per class (ham - 1; spam - 0).
train_data['label'].value_counts()

1    340
0    123
Name: label, dtype: int64

In [8]:
# Number of test messages per class (ham - 1; spam - 0).
test_data['label'].value_counts()

1    348
0    130
Name: label, dtype: int64

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
def clean_message(message):
    '''
    Normalise a raw message: strip and lower-case it, tokenize it, keep only
    the purely alphabetic tokens and join them back with single spaces.
    '''
    lowered = message.strip().lower()
    alphabetic_tokens = filter(str.isalpha, word_tokenize(lowered))
    return ' '.join(alphabetic_tokens)

In [11]:
def clean_message_with_stopwords(message):
    '''
    Clean a message like ``clean_message`` and additionally drop English stop
    words. Despite the name, stop words are removed, not kept.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        Lower-cased alphabetic tokens that are not English stop words, joined
        with single spaces.
    '''
    # Build the stop-word set once per call: evaluating stopwords.words()
    # inside the filter would rebuild the whole list for every token, and a
    # set makes each membership test O(1).
    english_stop_words = set(stopwords.words('english'))

    message_tokens = word_tokenize(message.strip().lower())
    message_tokens = [
        token for token in message_tokens
        if token.isalpha() and token not in english_stop_words
    ]

    return ' '.join(message_tokens)

Message and its different cleaned forms

In [12]:
# Raw text of the training message at index 0.
train_data['msg'][0]

'Subject: additional information : access request ( djfr - 4 ngpa 6 )\nadditional information for security resource request djfr - 4 ngpa 6 has been\nprovided by security .\nto view the request , double click your left mouse button on the notes\ndocument link below .'

In [13]:
# Same message after clean_message: lower-cased, alphabetic tokens only.
clean_message(train_data['msg'][0])

'subject additional information access request djfr ngpa additional information for security resource request djfr ngpa has been provided by security to view the request double click your left mouse button on the notes document link below'

In [14]:
# Same message with English stop words removed as well.
clean_message_with_stopwords(train_data['msg'][0])

'subject additional information access request djfr ngpa additional information security resource request djfr ngpa provided security view request double click left mouse button notes document link'

In [15]:
# Store both cleaned variants of every training message as new columns:
# clean_msg keeps stop words, clean_msg_no_sw drops them.
train_data['clean_msg'] = train_data['msg'].apply(clean_message)
train_data['clean_msg_no_sw'] = train_data['msg'].apply(clean_message_with_stopwords)

In [16]:
# Apply the same two cleaning variants to the test messages.
test_data['clean_msg'] = test_data['msg'].apply(clean_message)
test_data['clean_msg_no_sw'] = test_data['msg'].apply(clean_message_with_stopwords)

In [17]:
import numpy as np

In [18]:
def sigmoid(z):
  '''
  Numerically stable logistic function 1 / (1 + exp(-z)).

  Accepts a scalar or a numpy array and returns a value of the same shape.
  exp() is only ever evaluated on non-positive numbers, so inputs with a
  large magnitude cannot overflow.
  '''
  z = np.asarray(z)
  decay = np.exp(-np.abs(z))  # in [0, 1], never overflows
  # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same function.
  result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
  # Indexing with () turns a 0-d array back into a scalar and leaves
  # higher-dimensional arrays untouched.
  return result[()]

In [19]:
def get_vocab(docs):
  '''
  Count how often each whitespace-separated word occurs across all docs.
  Returns a dict mapping word -> total number of occurrences.
  '''
  counts = {}
  for document in docs:
    for word in document.split():
      counts[word] = counts.get(word, 0) + 1
  return counts

In [20]:
# Vocabulary (word -> count) built from the cleaned training messages only.
vocab =  get_vocab(train_data['clean_msg'])

In [21]:
def get_features(msg, vocabulary=None):
  '''
  Encode a cleaned message as a binary bag-of-words vector.

  Parameters
  ----------
  msg : str
      Whitespace-separated tokens of one message.
  vocabulary : iterable of str, optional
      Words that define the feature positions, in iteration order.
      Defaults to the notebook-level ``vocab``.

  Returns
  -------
  numpy.ndarray
      Float array with one entry per vocabulary word: 1.0 if the word occurs
      in ``msg``, otherwise 0.0.
  '''
  if vocabulary is None:
    vocabulary = vocab
  # A set gives O(1) membership tests, and the vector is allocated once
  # instead of being re-copied by np.append for every vocabulary word.
  present = set(msg.split())
  return np.array([1.0 if word in present else 0.0 for word in vocabulary], dtype=float)

One sample feature vector:

In [23]:
# Binary bag-of-words vector of the training message at index 0.
get_features(train_data['clean_msg'][0])

array([1., 1., 1., ..., 0., 0., 0.])

In [24]:
# Encode every cleaned training message against the vocabulary.
train_data['features'] = train_data['clean_msg'].apply(get_features)

In [25]:
# Test messages are encoded with the same (training) vocabulary, so words that
# never occurred in training contribute nothing to the vector.
test_data['features'] = test_data['clean_msg'].apply(get_features)

In [26]:
def trainLR(docs,labels,learning_rate,iterations):
  '''
  Train a logistic-regression model with per-sample stochastic gradient descent.

  Parameters
  ----------
  docs : sequence of 1-D arrays
      One feature vector per document (e.g. a pandas Series of arrays).
  labels : sequence of {0, 1}
      Target of each document, aligned with ``docs`` by position.
  learning_rate : float
      Step size of every update.
  iterations : int
      Number of full passes over the documents.

  Returns
  -------
  (weights, bias)
      ``weights`` is a float array with one entry per feature, ``bias`` a scalar.

  Raises
  ------
  ValueError
      If ``docs`` and ``labels`` differ in length.
  '''
  # Materialise both inputs positionally so the result does not depend on how
  # a pandas index happens to be labelled (e.g. after filtering or shuffling).
  samples = [np.asarray(doc, dtype=float) for doc in docs]
  targets = np.asarray(labels, dtype=float)
  if len(samples) != len(targets):
    raise ValueError('docs and labels must have the same length')

  # Size the weight vector from the data itself; only an empty training set
  # has to fall back to the notebook-level vocabulary.
  n_features = samples[0].shape[0] if samples else len(vocab)
  weights = np.zeros(n_features)
  bias = 0.0

  for _ in range(iterations):
    for features, target in zip(samples, targets):
      # Prediction error of the current model on this single sample.
      error = sigmoid(np.dot(features, weights) + bias) - target

      # Gradient step of the log-loss w.r.t. weights and bias.
      weights -= learning_rate * (error * features)
      bias -= learning_rate * error

  return weights,bias

In [27]:
def testLR(docs,weights,bias):
    '''
    testing the docs based on learned weights and bias from training
    '''
    pred = np.array([])
    decision_boundary = 0.5
    
    for doc in docs:
      doc_pred = sigmoid(np.dot(doc,weights) + bias)
      
      if doc_pred >= decision_boundary:
        pred = np.append(pred,1)
      else:
        pred = np.append(pred,0)
    
    return pred


In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [34]:
# SGD hyper-parameters for the run on the features that keep stop words.
learning_rate, iterations = 0.001, 1000

# Fit on the training split, then score the held-out test split.
weights, bias = trainLR(
    train_data['features'], train_data['label'], learning_rate, iterations
)
y_true = np.array(test_data['label'])
y_pred = testLR(test_data['features'], weights, bias)

# Per-class report, overall accuracy and confusion matrix, one after another.
print(
    classification_report(y_true, y_pred),
    accuracy_score(y_true, y_pred),
    confusion_matrix(y_true, y_pred),
    sep='\n',
)


              precision    recall  f1-score   support

           0       0.93      0.93      0.93       130
           1       0.97      0.97      0.97       348

    accuracy                           0.96       478
   macro avg       0.95      0.95      0.95       478
weighted avg       0.96      0.96      0.96       478

0.9623430962343096
[[121   9]
 [  9 339]]


Filtering out stop words

In [30]:
# Encode the stop-word-free training messages against the same `vocab`.
# NOTE(review): `vocab` was built from `clean_msg`, so the stop-word positions
# remain in the vector and are 0 for every message -- confirm this is intended.
train_data['features_no_sw'] = train_data['clean_msg_no_sw'].apply(get_features)

In [31]:
# Encode the stop-word-free test messages against the same `vocab`.
test_data['features_no_sw'] = test_data['clean_msg_no_sw'].apply(get_features)

In [35]:
# Same SGD hyper-parameters, now for the stop-word-free features.
learning_rate, iterations = 0.001, 1000

# Fit on the training split, then score the held-out test split.
weights, bias = trainLR(
    train_data['features_no_sw'], train_data['label'], learning_rate, iterations
)
y_true = np.array(test_data['label'])
y_pred = testLR(test_data['features_no_sw'], weights, bias)

# Per-class report, overall accuracy and confusion matrix, one after another.
print(
    classification_report(y_true, y_pred),
    accuracy_score(y_true, y_pred),
    confusion_matrix(y_true, y_pred),
    sep='\n',
)


              precision    recall  f1-score   support

           0       0.92      0.87      0.89       130
           1       0.95      0.97      0.96       348

    accuracy                           0.94       478
   macro avg       0.94      0.92      0.93       478
weighted avg       0.94      0.94      0.94       478

0.9435146443514645
[[113  17]
 [ 10 338]]
