In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, recall_score

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
data = pd.read_csv('/content/gdrive/MyDrive/text-mining/spam_Emails_data.csv')
data['target'] = np.where(data['label'] == 'Spam', 1, 0)
data

Unnamed: 0,label,text,target
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...,1
1,Ham,got ice thought look az original message ice o...,0
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...,1
3,Spam,start increasing your odds of success & live s...,1
4,Ham,author jra date escapenumber escapenumber esca...,0
...,...,...,...
193847,Ham,on escapenumber escapenumber escapenumber rob ...,0
193848,Spam,we have everything you need escapelong cialesc...,1
193849,Ham,hi quick question say i have a date variable i...,0
193850,Spam,thank you for your loan request which we recie...,1


In [None]:
spam_percent = len(data[data['target'] == 1]) / len(data) * 100
print("Spam percentage %s " %round(spam_percent,2))

Spam percentage 47.3 


In [3]:
data = data.sample(n = 20000, random_state=42)

In [4]:
data

Unnamed: 0,label,text,target
136112,Spam,a if you don't want this type of e mail pleas...,1
75977,Ham,a full list of news articles will be distribut...,0
53962,Spam,now online with profiles close to escapenumber...,1
120475,Spam,va pv rjgex py cuohox qqo jq poeedj cbqlrvncao...,1
17679,Spam,does size matter' escapenumber of women said t...,1
...,...,...,...
50992,Ham,chris provide thanks original message jackson ...,0
185785,Spam,opt in email special offer nbsp unsubscribe me...,1
1557,Spam,access all the popular software imaginable for...,1
57001,Ham,virus detected enron email environment offendi...,0


In [None]:
spam_percent = len(data[data['target'] == 1]) / len(data) * 100
print("Spam percentage %s " %spam_percent)

Spam percentage 47.115 


In [None]:
data.isnull().sum()

Unnamed: 0,0
label,0
text,0
target,0


In [None]:
data.dropna(inplace=True)

In [None]:
x, y = data['text'], data['target']
x_train, x_test, y_train, y_test = train_test_split(data['text'], data['target'], random_state=42)

In [None]:
vectorizer = TfidfVectorizer()
x_train_transformed = vectorizer.fit_transform(x_train)
x_test_transformed = vectorizer.transform(x_test)

In [None]:
vectorizer = CountVectorizer()
x_train_transformed = vectorizer.fit_transform(x_train)
x_test_transformed = vectorizer.transform(x_test)

# Decision tree

In [None]:
clf = DecisionTreeClassifier().fit(x_train_transformed, y_train)
y_pred = clf.predict(x_test_transformed)

print("Accuracy: %s" % accuracy_score(y_test,y_pred))
print("F1 score: %s" % f1_score(y_test,y_pred))
print("Recall score: %s" % recall_score(y_test,y_pred))

Accuracy: 0.9102
F1 score: 0.9030446987691644
Recall score: 0.9016817593790427


# SVC

In [None]:
clf = SVC().fit(x_train_transformed, y_train)
y_pred = clf.predict(x_test_transformed)

print("Accuracy: %s" % accuracy_score(y_test,y_pred))
print("F1 score: %s" % f1_score(y_test,y_pred))
print("Recall score: %s" % recall_score(y_test,y_pred))

Accuracy: 0.97
F1 score: 0.9680034129692833
Recall score: 0.9784389823199655


# Random Forest

In [None]:
clf = RandomForestClassifier().fit(x_train_transformed, y_train)
y_pred = clf.predict(x_test_transformed)

print("Accuracy: %s" % accuracy_score(y_test,y_pred))
print("F1 score: %s" % f1_score(y_test,y_pred))
print("Recall score: %s" % recall_score(y_test,y_pred))

Accuracy: 0.9566
F1 score: 0.9524227143170357
Recall score: 0.9366106080206986


# Cross Validation

In [None]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)
vectorizer = TfidfVectorizer()

fold = 0
dtc_performances = []
svc_performances = []
rfc_performances = []

for train_index, test_index in skf.split(x.values, y.values):
  fold += 1
  print('Fold:', fold , '-'*20)
  x_train, x_test = x.iloc[train_index], x.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  X_train_transformed = vectorizer.fit_transform(x_train)
  X_test_transformed = vectorizer.transform(x_test)


  clf = DecisionTreeClassifier().fit(X_train_transformed, y_train)
  y_pred = clf.predict(X_test_transformed)

  print("   Decision Tree")
  print("      Accuracy: %s" % accuracy_score(y_test,y_pred))
  print("      F1 score: %s" % f1_score(y_test,y_pred))
  dtc_performances.append(f1_score(y_test,y_pred))


  clf = SVC().fit(X_train_transformed, y_train)
  y_pred = clf.predict(X_test_transformed)

  print("   SVM")
  print("      Accuracy: %s" % accuracy_score(y_test,y_pred))
  print("      F1 score: %s" % f1_score(y_test,y_pred))
  svc_performances.append(f1_score(y_test,y_pred))


  clf = RandomForestClassifier().fit(X_train_transformed, y_train)
  y_pred = clf.predict(X_test_transformed)

  print("   RF")
  print("      Accuracy: %s" % accuracy_score(y_test,y_pred))
  print("      F1 score: %s" % f1_score(y_test,y_pred))
  rfc_performances.append(f1_score(y_test,y_pred))


Fold: 1 --------------------
   Decision Tree
      Accuracy: 0.90775
      F1 score: 0.9013105108317732
   SVM
      Accuracy: 0.968
      F1 score: 0.9664218258132214
   RF
      Accuracy: 0.95825
      F1 score: 0.9549986526542711
Fold: 2 --------------------
   Decision Tree
      Accuracy: 0.90825
      F1 score: 0.9026266914300876
   SVM
      Accuracy: 0.9735
      F1 score: 0.9721492380451918
   RF
      Accuracy: 0.958
      F1 score: 0.9546925566343042
Fold: 3 --------------------
   Decision Tree
      Accuracy: 0.9055
      F1 score: 0.8983870967741936
   SVM
      Accuracy: 0.97425
      F1 score: 0.9729445757814552
   RF
      Accuracy: 0.95925
      F1 score: 0.9559578492299379
Fold: 4 --------------------
   Decision Tree
      Accuracy: 0.9155
      F1 score: 0.9110057925223802
   SVM
      Accuracy: 0.97275
      F1 score: 0.971413585103593
   RF
      Accuracy: 0.96025
      F1 score: 0.957588690317418
Fold: 5 --------------------
   Decision Tree
      Accuracy: 0.9

In [None]:
print(f"Stratified K-Fold with {n_splits} splits gives the following")

print(f"Decision tree: {np.mean(dtc_performances):.2f} +- {np.std(dtc_performances):.3f}")
print(f"SVM tree:      {np.mean(svc_performances):.2f} +- {np.std(svc_performances):.3f}")
print(f"RF tree:       {np.mean(rfc_performances):.2f} +- {np.std(rfc_performances):.3f}")

Stratified K-Fold with 5 splits gives the following
Decision tree: 0.90 +- 0.005
SVM tree:      0.97 +- 0.003
RF tree:       0.95 +- 0.002
