## Preprocessing

In [46]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # optional, for new versions


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aly98\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aly98\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aly98\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [15]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [16]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [17]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [18]:
df.rename(columns={'v1':'target', 'v2':'text'}, inplace=True)

In [28]:
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
df.duplicated().sum()

np.int64(403)

In [24]:
df.shape

(5572, 2)

In [25]:
df.drop_duplicates(inplace=True)
df.shape

(5169, 2)

In [26]:
df.isnull().sum()

target    0
text      0
dtype: int64

In [27]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df.target)

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Feature Engineering

In [29]:
from nltk.stem.porter import PorterStemmer
import string

ps = PorterStemmer()

In [47]:
# Lowercase transformation and text preprocessing function

def transform_text(text): 

    # make text to lower
    text = text.lower()
    
    text = nltk.word_tokenize(text)
    y = []

    # Remove special character
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:]
    y.clear()

    # Loop through the tokens and remove stopwords and puctuation
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    text = y.copy()
    y.clear()


    # transform each word to each root word using stemming technique

    for i in text:
        y.append(ps.stem(i))
    
    # join all the word into single string
    return  " ".join(y)


In [48]:

transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')


'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [50]:
df['text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


In [53]:
# Convert 
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_features = 600)

In [57]:
X = tfid.fit_transform(df['text']).toarray()
y = df.target.values


In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

## Model Training

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [64]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)    
xgb  = XGBClassifier(n_estimators = 50, random_state = 2)

## Model Selection

In [75]:
clf = {
    "SVC":svc,
    "KNN":knc,
    "Naive Baye":mnb,
    "Decision Tree":dtc,
    "Logistic Reg": lrc,
    "Random Forest":rfc,
    "AdaBoost Clf":abc,
    "Bagging Clf":bc,
    "Extra Tree Clf":etc,
    "Gradient Boosting":gbdt,
    "XGBoost":xgb

}

In [76]:
def train_classifier(clf, X_train, y_train):
    model = clf.fit(X_train, y_train)
    return model

In [77]:
from sklearn.metrics import accuracy_score, precision_score
accuracy = []
precision = []

for name, clfs in clf.items():
    classifier = train_classifier(clfs, X_train, y_train)
    print()
    print("* "*5+name+" *"*5)
    print(round(accuracy_score(y_test, classifier.predict(X_test)),2))
    print(round(precision_score(y_test, classifier.predict(X_test)),2))



* * * * * SVC * * * * *
0.97
0.96

* * * * * KNN * * * * *
0.93
1.0

* * * * * Naive Baye * * * * *
0.97
0.96

* * * * * Decision Tree * * * * *
0.94
0.88

* * * * * Logistic Reg * * * * *
0.96
0.95

* * * * * Random Forest * * * * *
0.97
0.94

* * * * * AdaBoost Clf * * * * *
0.92
0.87

* * * * * Bagging Clf * * * * *
0.97
0.9

* * * * * Extra Tree Clf * * * * *
0.97
0.95

* * * * * Gradient Boosting * * * * *
0.95
0.92

* * * * * XGBoost * * * * *
0.97
0.94
