<a href="https://colab.research.google.com/github/a1icia-m/misc-small-projects/blob/main/nlp/spam_email_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the spam-email dataset into a DataFrame and preview its last rows.
# NOTE(review): '/spam_email.csv' is an absolute path at the filesystem root —
# presumably where the file was uploaded in Colab; confirm before running elsewhere.
import pandas as pd
df = pd.read_csv('/spam_email.csv')
df.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [None]:
# Class balance of the label column: 0 = not spam (1 presumably marks spam — TODO confirm).
df['CATEGORY'].value_counts()

Unnamed: 0_level_0,count
CATEGORY,Unnamed: 1_level_1
0,3900
1,1896


In [None]:
import nltk
# Fetch the NLTK corpora used further down: 'wordnet' is needed by
# WordNetLemmatizer and 'stopwords' by nltk.corpus.stopwords.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Tokenize before removing stopwords so that the later steps operate on individual words.
# RegexpTokenizer lets you supply a custom pattern; word_tokenize is another common
# choice (it keeps punctuation as separate tokens).
tokenizer = nltk.RegexpTokenizer(r"\w+")  # \w+ keeps runs of word characters (letters, digits, underscore); punctuation and whitespace are dropped
test_message = "Hey,, Gggg feet it going? <HTML. x random peepeepoopoo"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized
# Each string in the result is one token; punctuation and spaces are gone.

['Hey', 'Gggg', 'feet', 'it', 'going', 'HTML', 'x', 'random', 'peepeepoopoo']

In [None]:
# Normalise case so that e.g. "Hey" and "hey" become the same token.
test_message_lowercase = list(map(str.lower, test_message_tokenized))
test_message_lowercase
# Lowercasing is just one of many possible preprocessing steps.

['hey', 'gggg', 'feet', 'it', 'going', 'html', 'x', 'random', 'peepeepoopoo']

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

test_message_lemmatized = [lemmatizer.lemmatize(t) for t in test_message_lowercase]
test_message_lemmatized
# Lemmatization maps a word to its base form (here "feet" -> "foot"), so that
# different surface forms of the same word are counted together.
# "it" is still present after this step: it is a stopword, i.e. a very common
# word that carries little signal for classification.

['hey', 'gggg', 'foot', 'it', 'going', 'html', 'x', 'random', 'peepeepoopoo']

In [None]:
from nltk.corpus import stopwords
# NLTK's built-in English stopword list; extend it if you need custom stopwords.
stop_words = stopwords.words('english')

# Drop every stopword. What remains are still tokens, just processed ones.
test_message_useful_tokens = list(
    filter(lambda tok: tok not in stop_words, test_message_lemmatized)
)
test_message_useful_tokens

['hey', 'gggg', 'foot', 'going', 'html', 'x', 'random', 'peepeepoopoo']

In [None]:
def message_to_tokens(s):
  """Turn a raw message string into a list of cleaned tokens.

  Steps: split on runs of word characters, lowercase, drop stopwords,
  lemmatize, then drop any lemma that is itself a stopword.

  Stopwords are removed *before* lemmatizing as well as after. The WordNet
  lemmatizer rewrites some stopwords into forms that are not in the stopword
  list (e.g. "was" -> "wa"), so filtering only afterwards lets them leak
  into the vocabulary.

  Relies on the notebook-level `tokenizer`, `lemmatizer` and `stop_words`.

  Args:
    s: the message text.

  Returns:
    list[str]: the processed tokens in their original order (duplicates kept).
  """
  # One set per call gives O(1) membership tests instead of scanning the
  # stopword list once for every token.
  stop_set = set(stop_words)
  lowercase = (t.lower() for t in tokenizer.tokenize(s))
  content_tokens = [t for t in lowercase if t not in stop_set]
  lemmatized = (lemmatizer.lemmatize(t) for t in content_tokens)
  return [t for t in lemmatized if t not in stop_set]

message_to_tokens(test_message)

['hey', 'gggg', 'foot', 'going', 'html', 'x', 'random', 'peepeepoopoo']

In [None]:
# Shuffle the rows once (fixed seed), then cut the frame into two contiguous
# pieces: the first 80% of rows for training, the remainder for testing.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

split_index = int(len(df) * .8)
train_df = df.iloc[:split_index].reset_index(drop=True)
test_df = df.iloc[split_index:].reset_index(drop=True)

train_df, test_df

(      CATEGORY                                            MESSAGE  \
 0            1  \n\n<HTML><FONT  BACK="#ffffff" style="BACKGRO...   
 1            1  <html><body bgColor="#CCCCCC" topmargin=1 onMo...   
 2            0  Quoting Paul Linehan (plinehan@yahoo.com):\n\n...   
 3            0  <a href=http://www.aaronsw.com/weblog/>\n\nAar...   
 4            0  Oh yeah, the link for more info:\n\n\n\nhttp:/...   
 ...        ...                                                ...   
 4631         0  Gregory Alan Bolcer:\n\n>I'm not sure since I ...   
 4632         1  New Account For: zzzz@spamassassin.taint.org\n...   
 4633         0  >>>>> "O" == Owen Byrne <owen@permafrost.net> ...   
 4634         0  This is an automated response to a message you...   
 4635         0  http://www.ouchytheclown.com/welcome.html\n\n\...   
 
                                    FILE_NAME  
 0     00118.141d803810acd9d4fc23db103dddfcd9  
 1     00463.0bc4e08af0529dd773d9f10f922547db  
 2     00358.8

In [None]:
# Frequency table over the training messages: token -> number of occurrences.
token_counter = {}

for message in train_df['MESSAGE']:
  message_tokens = message_to_tokens(message)

  for token in message_tokens:
    # Missing tokens start from 0, so first sight and repeat sight share one line.
    token_counter[token] = token_counter.get(token, 0) + 1

len(token_counter)

86439

In [None]:
def keep_token(token, thershold):
  """Return True when `token` occurs at least `thershold` times in `token_counter`.

  Tokens that are absent from the notebook-level `token_counter` are never kept.
  """
  return token in token_counter and token_counter[token] >= thershold

keep_token('hello', 100)

True

In [None]:
# Feature vocabulary: every counted token that passes the 5275-occurrence cut-off.
features = {token for token in token_counter if keep_token(token, 5275)}
print(features)
len(features)

{'http', 'p', '20', 'font', 'align', '3d', 'face', 'center', 'b', 'br', '0', 'nbsp', 'www', '2', 'color', 'com', 'width', 'list', 'tr', 'size', 'td', '1', 'arial'}


23

In [None]:
# Turn the feature set into a list with a fixed (sorted) order so that every
# feature keeps the same column index on every run; iterating a set of strings
# directly has no guaranteed order from one kernel start to the next.
features = sorted(features)
features

{'0',
 '1',
 '2',
 '20',
 '3d',
 'align',
 'arial',
 'b',
 'br',
 'center',
 'color',
 'com',
 'face',
 'font',
 'http',
 'list',
 'nbsp',
 'p',
 'size',
 'td',
 'tr',
 'width',
 'www'}

In [None]:
# Give each feature a column position in the count vector, following the
# iteration order of `features`.
token_to_index_mapping = {token: position for position, token in enumerate(features)}
token_to_index_mapping

{'http': 0,
 'p': 1,
 '20': 2,
 'font': 3,
 'align': 4,
 '3d': 5,
 'face': 6,
 'center': 7,
 'b': 8,
 'br': 9,
 '0': 10,
 'nbsp': 11,
 'www': 12,
 '2': 13,
 'color': 14,
 'com': 15,
 'width': 16,
 'list': 17,
 'tr': 18,
 'size': 19,
 'td': 20,
 '1': 21,
 'arial': 22}

In [None]:
message_to_tokens("3d b <br> .com bad font font com randoms")

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [None]:
import numpy as np
def message_to_count_vector(message):
  """Count how often each feature token occurs in `message`.

  The message is first run through `message_to_tokens`. The result is a 1-D
  array with one slot per entry in `features`; a token's slot is looked up in
  `token_to_index_mapping`. Tokens that are not features are ignored.
  """
  counts = np.zeros(len(features))
  for tok in message_to_tokens(message):
    if tok not in features:
      continue
    counts[token_to_index_mapping[tok]] += 1
  return counts

message_to_count_vector("3d b <br> .com bad font font com randoms")

array([0., 0., 0., 2., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 2., 0.,
       0., 0., 0., 0., 0., 0.])

In [None]:
message_to_count_vector(train_df['MESSAGE'].iloc[5])

array([3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0.])

In [None]:
train_df.iloc[5]

Unnamed: 0,5
CATEGORY,1
MESSAGE,Low-Cost Term-Life Insurance!\n\nSAVE up to 70...
FILE_NAME,00303.7d749e4a46ceb169ea1af5b9e5ab39a9


In [None]:
def df_to_x_y(dff):
  """Convert a labelled message frame into model inputs.

  Args:
    dff: DataFrame with a "MESSAGE" column and a "CATEGORY" column.

  Returns:
    (x, y): `x` is an int array built from one count vector per message, in
    the row order of `dff`; `y` is the "CATEGORY" column as an int array.
  """
  labels = dff["CATEGORY"].to_numpy().astype(int)
  rows = [message_to_count_vector(text) for text in dff["MESSAGE"]]
  return np.array(rows).astype(int), labels


In [None]:
x_train, y_train = df_to_x_y(train_df)
x_test, y_test = df_to_x_y(test_df)

x_train.shape, y_train.shape, x_test.shape, y_test.shape
# x_train is the input matrix: one row per message, each row being that message's count vector.
# y_train holds the matching labels, i.e. whether each message is spam or not spam.

((4636, 23), (4636,), (1160, 23), (1160,))

In [None]:
from sklearn.preprocessing import MinMaxScaler #normalizes data so theyre all 0<=x<=1 where 0 is min, 1 is max

# Learn each column's min and max from the training split only, then apply the
# same scaling to both splits.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_train

array([[0.025     , 0.        , 0.        , ..., 0.        , 0.        ,
        0.003663  ],
       [0.00833333, 0.04065041, 0.        , ..., 0.01346801, 0.00363636,
        0.02564103],
       [0.00416667, 0.00406504, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00416667, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00416667, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0125    , 0.00406504, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
#logitic regression model!!
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression()
lr.fit(x_train, y_train)
# Evaluate the fitted model on the held-out test split.
print(classification_report(y_test, lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.77      1.00      0.87       788
           1       0.99      0.37      0.54       372

    accuracy                           0.80      1160
   macro avg       0.88      0.68      0.70      1160
weighted avg       0.84      0.80      0.76      1160



In [None]:
#random forest model
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported metrics are reproducible from run to run;
# without a seed the bootstrap samples and feature subsets change every time.
# The value 1 matches the seed already used for the shuffle.
rf = RandomForestClassifier(random_state=1).fit(x_train, y_train)
print(classification_report(y_test, rf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       788
           1       0.88      0.76      0.82       372

    accuracy                           0.89      1160
   macro avg       0.89      0.86      0.87      1160
weighted avg       0.89      0.89      0.89      1160

