In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['v1'] = encoder.fit_transform(df['v1'])

In [13]:
df.duplicated().sum()

np.int64(403)

In [14]:
len(df)

5572

In [16]:
df = df.drop_duplicates(keep='first')

In [20]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [32]:
import nltk

def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)

    y = []

    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

In [33]:
transform_text("Although it's raining, we decided to go to the park; however, we brought umbrellas, just in case it pours!")

'although it rain we decid to go to the park howev we brought umbrella just in case it pour'

In [35]:
df['transformed_text'] = df['v2'].apply(transform_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['transformed_text'] = df['v2'].apply(transform_text)


In [39]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis='columns')

In [40]:
df.head()

Unnamed: 0,v1,v2,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazi avail onli in bugi...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri in 2 a wkli comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so earli hor u c alreadi then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i do think he goe to usf he live around he...


In [5]:
Y = df['v1']
X = df['v2']

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


tfid = TfidfVectorizer(max_features=500)

In [48]:
X = tfid.fit_transform(df['transformed_text']).toarray()
Y = df['v1'].values

In [49]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(5169, 500))

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [64]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
lrc = LogisticRegression()
dtc = DecisionTreeClassifier(max_depth=5)
mnc = MultinomialNB()
xgc = XGBClassifier(n_estimators=50, random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2, )
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbc = GradientBoostingClassifier(n_estimators=50, random_state=2)


In [65]:
clfs = {
    "SVC":svc,
    'knc':knc,
    "lrc":lrc,
    "dtc":dtc,
    "mnc":mnc,
    "xgc":xgc,
    "rfc":rfc,
    "abc":abc,
    "bc":bc,
    "etc":etc,
    "gbc":gbc
}

In [66]:
from sklearn.metrics import accuracy_score, precision_score

def trainer(clfs, X_train, X_test, Y_train, Y_test):
    clfs.fit(X_train, Y_train)
    Y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(Y_pred, Y_test)
    precision = precision_score(Y_pred, Y_test)
    return accuracy, precision



In [58]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(4652, 500))

In [67]:
accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    current_accuracy, current_precision = trainer(clf, X_train, X_test, Y_train, Y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.97678916827853
Precision:  0.8918918918918919

For:  knc
Accuracy:  0.9400386847195358
Precision:  0.5945945945945946

For:  lrc
Accuracy:  0.97678916827853
Precision:  0.8648648648648649

For:  dtc
Accuracy:  0.9400386847195358
Precision:  0.6756756756756757

For:  mnc
Accuracy:  0.9729206963249516
Precision:  0.8378378378378378

For:  xgc
Accuracy:  0.965183752417795
Precision:  0.8108108108108109

For:  rfc
Accuracy:  0.9709864603481625
Precision:  0.8378378378378378

For:  abc
Accuracy:  0.9342359767891683
Precision:  0.6216216216216216

For:  bc
Accuracy:  0.9574468085106383
Precision:  0.8378378378378378

For:  etc
Accuracy:  0.9748549323017408
Precision:  0.8378378378378378

For:  gbc
Accuracy:  0.9516441005802708
Precision:  0.6756756756756757
