In [1]:
import pandas as pd 
import numpy as np 
import kaggle

In [2]:
kaggle.api.dataset_download_files('chandramoulinaidu/spam-classification-for-basic-nlp','csv_files',unzip=True)

In [3]:
df=pd.read_csv('csv_files/Spam Email raw text for NLP.csv')

In [4]:
df

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0


In [5]:
import nltk

tokenizer=nltk.RegexpTokenizer(r'\w+')
lemmatizer=nltk.stem.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')

In [6]:
def text_to_tokens(text):
    tokenized=tokenizer.tokenize(text)
    lower_tokens=[t.lower() for t in tokenized]
    lemmatized=[lemmatizer.lemmatize(t) for t in lower_tokens]
    tokens=[t for t in lemmatized if t not in stopwords]
    return tokens

In [7]:
df=df.sample(frac=1)

In [8]:
split_index=int(len(df)*0.8)

train_df=df.iloc[:split_index,:]
test_df=df.iloc[split_index:,:]

In [9]:
token_count={}

for text in train_df['MESSAGE']:
    text_as_tokens=text_to_tokens(text)
    for token in text_as_tokens:
        if token not in token_count:
            token_count[token]=1
        else:
            token_count[token]+=1

len(token_count)

90037

In [10]:
features=[]
for token,count in token_count.items():
    if count>1000:
        features.append(token)

In [11]:
features

['content',
 'type',
 'text',
 'html',
 '1',
 'message',
 'www',
 'com',
 'service',
 'site',
 'way',
 'internet',
 '20',
 'time',
 'business',
 '000',
 'people',
 'link',
 'http',
 'list',
 'go',
 'form',
 'u',
 'head',
 'title',
 '3d',
 '22',
 'body',
 'bgcolor',
 'table',
 'width',
 'border',
 'cellspacing',
 'cellpadding',
 'tr',
 'td',
 'height',
 'align',
 'font',
 'face',
 'arial',
 'helvetica',
 'sans',
 'serif',
 'size',
 'b',
 'color',
 'ie',
 'valign',
 'colspan',
 'p',
 'href',
 'img',
 'src',
 'gif',
 '2',
 'nbsp',
 'br',
 'ffffff',
 '000000',
 'center',
 'div',
 'style',
 '0',
 'left',
 '7',
 'top',
 '8',
 '100',
 'first',
 'want',
 'wa',
 'right',
 'name',
 'year',
 'like',
 'system',
 '6',
 'make',
 'company',
 'money',
 'one',
 'doe',
 'use',
 '50',
 'free',
 'home',
 '5',
 'click',
 'see',
 'could',
 'may',
 'verdana',
 'email',
 'mailing',
 'remove',
 'spam',
 'information',
 '2002',
 'get',
 'would',
 'file',
 'new',
 'linux',
 'user',
 'group',
 'ilug',
 'mailman',

In [12]:
features_dict={token:i for i,token in enumerate(features)}
features_dict

{'content': 0,
 'type': 1,
 'text': 2,
 'html': 3,
 '1': 4,
 'message': 5,
 'www': 6,
 'com': 7,
 'service': 8,
 'site': 9,
 'way': 10,
 'internet': 11,
 '20': 12,
 'time': 13,
 'business': 14,
 '000': 15,
 'people': 16,
 'link': 17,
 'http': 18,
 'list': 19,
 'go': 20,
 'form': 21,
 'u': 22,
 'head': 23,
 'title': 24,
 '3d': 25,
 '22': 26,
 'body': 27,
 'bgcolor': 28,
 'table': 29,
 'width': 30,
 'border': 31,
 'cellspacing': 32,
 'cellpadding': 33,
 'tr': 34,
 'td': 35,
 'height': 36,
 'align': 37,
 'font': 38,
 'face': 39,
 'arial': 40,
 'helvetica': 41,
 'sans': 42,
 'serif': 43,
 'size': 44,
 'b': 45,
 'color': 46,
 'ie': 47,
 'valign': 48,
 'colspan': 49,
 'p': 50,
 'href': 51,
 'img': 52,
 'src': 53,
 'gif': 54,
 '2': 55,
 'nbsp': 56,
 'br': 57,
 'ffffff': 58,
 '000000': 59,
 'center': 60,
 'div': 61,
 'style': 62,
 '0': 63,
 'left': 64,
 '7': 65,
 'top': 66,
 '8': 67,
 '100': 68,
 'first': 69,
 'want': 70,
 'wa': 71,
 'right': 72,
 'name': 73,
 'year': 74,
 'like': 75,
 'system

In [13]:
def text_to_count_vector(text):
    count_vector=np.zeros(len(features))

    text_as_tokens=text_to_tokens(text)
    for token in text_as_tokens:
        if token not in features:
            continue
        index=features_dict[token]
        count_vector[index]+=1
    return count_vector


In [14]:
all_features=[]
for text in train_df['MESSAGE']:
    all_features.append(text_to_count_vector(text))

In [15]:
def df_to_X_y(df):
  y = df['CATEGORY'].to_numpy().astype(int)

  count_vectors = []

  for text in df['MESSAGE']:
    count_vector = text_to_count_vector(text)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [16]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4636, 167), (4636,), (1160, 167), (1160,))

In [17]:
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
model=XGBClassifier()

In [18]:
model.fit(X_train,y_train)

In [19]:
train_prediction=model.predict(X_train)
test_prediction=model.predict(X_test)

In [20]:
train_accuracy=metrics.accuracy_score(train_prediction,y_train)
test_accuracy=metrics.accuracy_score(test_prediction,y_test)
print(train_accuracy,test_accuracy)

0.998274374460742 0.9741379310344828


In [21]:
print(classification_report(test_prediction,y_test))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       773
           1       0.96      0.96      0.96       387

    accuracy                           0.97      1160
   macro avg       0.97      0.97      0.97      1160
weighted avg       0.97      0.97      0.97      1160

