In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer

In [2]:
data = "C:\\Users\\Aditya Shakya\\OneDrive\\Desktop\\datasets\\email_spam.csv"

In [3]:
df = pd.read_csv(data)

In [4]:
df.shape

(5572, 2)

In [5]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [7]:
df['Category'] = df['Category'].apply(lambda x : 1 if x=='ham' else 0)


In [8]:
df['Category']

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64

In [9]:
df['length'] = df['Message'].apply(lambda x : len(x))

In [10]:
df.head()

Unnamed: 0,Category,Message,length
0,1,"Go until jurong point, crazy.. Available only ...",111
1,1,Ok lar... Joking wif u oni...,29
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,1,U dun say so early hor... U c already then say...,49
4,1,"Nah I don't think he goes to usf, he lives aro...",61


In [11]:
ham = df[df['Category']==1]
spam = df[df['Category']==0]

In [12]:
spam.shape

(747, 3)

In [13]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['Message'])
X = X.toarray()

In [14]:
y = df['Category'].values

### Random Forest

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4457, 8709)
(1115, 8709)
(4457,)
(1115,)


In [17]:
clf = RandomForestClassifier(n_estimators=100, criterion = 'gini',n_jobs=2,min_samples_leaf=2)

In [18]:
model = clf.fit(X_train,y_train)

In [19]:
y_predict = model.predict(X_test)

In [20]:
print(accuracy_score(y_test,y_predict))

0.9766816143497757


In [21]:
print(confusion_matrix(y_test,y_predict))

[[123  26]
 [  0 966]]


In [22]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       1.00      0.83      0.90       149
           1       0.97      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



### SVM

In [23]:
sv = SVC(C = 1, kernel='sigmoid')

In [24]:
model_svm = sv.fit(X_train,y_train)

In [25]:
y_predict_sv = model.predict(X_test)

In [26]:
accuracy_score(y_test,y_predict)

0.9766816143497757

In [27]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       1.00      0.83      0.90       149
           1       0.97      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
print(confusion_matrix(y_test,y_predict))

[[123  26]
 [  0 966]]


In [56]:
def predict(x):
    x = tfidf.transform([x])
    predicted = model.predict(x)
    if(predicted[0]):
        return "ham"
    else:
        return "Spam"

In [57]:
predict(df["Message"][2])

'Spam'