# Basic 

## Download Dataset and Package

In [1]:
! curl http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron1.tar.gz -o enron1.tar.gz
! tar -xf enron1.tar.gz enron1

print("Number of non-spam emails in the dataset:")
! ls -1 enron1/ham/*.txt | wc -l
print("Number of spam emails in the dataset:")
! ls -1 enron1/spam/*.txt | wc -l 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1760k  100 1760k    0     0   110k      0  0:00:15  0:00:15 --:--:--  227k
Number of non-spam emails in the dataset:
    3672
Number of spam emails in the dataset:
    1500


In [2]:
# Download the NLTK tokenizer models
! pip3 install nltk

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import nltk
nltk.download("stopwords")
nltk.download("names")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ben8169/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /Users/ben8169/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ben8169/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load dataset

In [4]:
import glob, os

# init
"""
emails: a set of email
labels: a set of label representing whetere the gien email is spam or ham
  - spam: 1
  - ham: 0
"""

emails, labels = [], []
parition = 0

# load spam dataset
file_path = 'enron1/spam'

for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f:
      emails.append(f.read())
      labels.append(1)

file_path = 'enron1/ham'
for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f:
      emails.append(f.read())
      labels.append(0)

print('# of emails = {}\n# of labels = {}'.format(len(emails), len(labels)))


# of emails = 5172
# of labels = 5172


## Data Preprocessing 

In [5]:
def letters_only(word):
    # Q1. Remove numbers and punctuations [0.5 points]
    return word.isalpha()

from nltk.corpus import names
# Q2. Remove name entity [0.5 points]
all_names = set(names.words())

# lemmaization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# put all together to clean texts
def clean_text(doc):
    cleaned_doc = [lemmatizer.lemmatize(word.lower())
                        for word in doc.split()
                        if word.isalpha() and word not in all_names]
    # Q3. For all words in doc, apply lowercase to words, remove number, punctuation, and name entity [2 points]

    return ' '.join(cleaned_doc)

cleaned_emails = [clean_text(doc) for doc in emails]

## Data Preparation

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=486)

cv = CountVectorizer(stop_words='english', max_features=500)

# Q4. Get counter vector for X_train and X_test [1 point]
term_docs_train = cv.fit_transform(X_train)
term_docs_test = cv.transform(X_test)

# Testing Naive Bayes

In [7]:
import numpy as np
from naivebayes import NaiveBayesClassifier
from sklearn.metrics import accuracy_score

gnb = NaiveBayesClassifier()

# Q5. Train and predict Naive Bayes model [1 point]

gnb.fit(term_docs_train,Y_train)
gnb_pred = gnb.predict(term_docs_test)
gnb_accuracy = accuracy_score(Y_test, gnb_pred)
print("Accuracy of the model is: {:.2f}".format(gnb_accuracy))

NotImplementedError: adding a nonzero scalar to a sparse array is not supported

# Testing SVM

In [8]:
import numpy as np
from svm import SVMClassifier
from sklearn.metrics import accuracy_score

svc = SVMClassifier()

# Q6. Train and predict SVM model [1 point]

svc.fit(term_docs_train,Y_train)
svc_pred = svc.predict(term_docs_test)
svc_accuracy = accuracy_score(Y_test, svc_pred)
print("Accuracy of the model is: {:.2f}".format(svc_accuracy))

ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

# Testing Logistic Regression

In [9]:
import numpy as np
from logisticregression import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()

# Q7. Train and predict Logistic Regression model [1 point]

lr.initialize_w(term_docs_train)
lr.fit(term_docs_train,Y_train)
lr_pred = lr.predict(term_docs_test)
lr_accuracy = accuracy_score(Y_test, lr_pred)
print("Accuracy of the model is: {:.2f}".format(lr_accuracy))

ValueError: matmul: Input operand 0 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

# Testing Confusion Matrix

In [10]:
from evaluation import my_evaluation_metrics

y_true = np.array([1,0,0,1,0,1,1,1,1,0,0,0,0,0,1,1,1,0,0,1])
y_pred = np.array([0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,1,0,1,1,0])

em = my_evaluation_metrics()

print("The confusion matrix:")
print(em.my_confusion_matrix(y_true, y_pred))

The confusion matrix:
[[5 5]
 [7 3]]


# Testing TF-IDF

In [11]:
from evaluation import my_evaluation_metrics

documents = ['car car car train train train train', 'car car car train train train cookie',
             'car train coffee coffee coffee coffee coffee coffee coffee cookie cookie cookie cookie cookie cookie']

em = my_evaluation_metrics()
tf_idf = em.my_tf_idf(documents)
tf_idf

array([[-0.86304622, -1.15072829,  0.        ,  0.        ],
       [-0.86304622, -0.86304622,  0.        ,  0.        ],
       [-0.28768207, -0.28768207,  0.        ,  2.83825576]])