In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv('SMC-Labeled.csv',encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [5]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

**checking if there is empty string**

In [6]:
blanks=[]
for index,v1,v2 in df.itertuples():
    if type(v2)==str:
        if v2.isspace():
            blanks.append(index)

print(blanks)

[]


**check the dataset, finding it unbalanced.**

In [7]:
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [8]:
X=df['v2']
y=df['v1']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [10]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [12]:
text_clf_nb=Pipeline([('tfidf',TfidfVectorizer()),('clf',MultinomialNB())])
text_clf_svc=Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
text_clf_lr=Pipeline([('tfidf',TfidfVectorizer()),('clf',LogisticRegression(solver='lbfgs'))])

In [13]:
text_clf_nb.fit(X_train,y_train)
text_clf_svc.fit(X_train,y_train)
text_clf_lr.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])

In [14]:
predictions_nb=text_clf_nb.predict(X_test)
predictions_svc=text_clf_svc.predict(X_test)
predictions_lr=text_clf_lr.predict(X_test)

In [15]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

### TF-IDF + NB

In [16]:
print(accuracy_score(y_test,predictions_nb))

0.9599282296650717


In [17]:
print(classification_report(y_test,predictions_nb))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1453
        spam       1.00      0.69      0.82       219

    accuracy                           0.96      1672
   macro avg       0.98      0.85      0.90      1672
weighted avg       0.96      0.96      0.96      1672



In [18]:
print(confusion_matrix(y_test,predictions_nb))

[[1453    0]
 [  67  152]]


### TF-IDF + SVM

In [19]:
print(accuracy_score(y_test,predictions_svc))

0.9832535885167464


In [20]:
print(classification_report(y_test,predictions_svc))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1453
        spam       0.98      0.89      0.93       219

    accuracy                           0.98      1672
   macro avg       0.98      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [21]:
print(confusion_matrix(y_test,predictions_svc))

[[1450    3]
 [  25  194]]


### TF-IDF + LR

In [22]:
print(accuracy_score(y_test,predictions_lr))

0.9659090909090909


In [23]:
print(classification_report(y_test,predictions_lr))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1453
        spam       0.99      0.74      0.85       219

    accuracy                           0.97      1672
   macro avg       0.98      0.87      0.92      1672
weighted avg       0.97      0.97      0.96      1672



In [24]:
print(confusion_matrix(y_test,predictions_lr))

[[1452    1]
 [  56  163]]


# parameter tunning

In [25]:
from sklearn.model_selection import GridSearchCV

# imbalance mitigation

In [34]:
ham_balanced=df[df['v1']=='ham']
ham_balanced=ham_balanced.reset_index(drop=True)[:1000]
spam_balanced=df[df['v1']=='spam']
spam_balanced=spam_balanced.reset_index(drop=True)

In [37]:
balanced_data = pd.concat([ham_balanced, spam_balanced]).sample(frac=1).reset_index(drop=True)

In [38]:
X=balanced_data['v2']
y=balanced_data['v1']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [40]:
text_clf_nb=Pipeline([('tfidf',TfidfVectorizer()),('clf',MultinomialNB())])
text_clf_svc=Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
text_clf_lr=Pipeline([('tfidf',TfidfVectorizer()),('clf',LogisticRegression(solver='lbfgs'))])
text_clf_nb.fit(X_train,y_train)
text_clf_svc.fit(X_train,y_train)
text_clf_lr.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])

In [41]:
predictions_nb=text_clf_nb.predict(X_test)
predictions_svc=text_clf_svc.predict(X_test)
predictions_lr=text_clf_lr.predict(X_test)

In [42]:
print(accuracy_score(y_test,predictions_nb))

0.9580952380952381


In [43]:
print(accuracy_score(y_test,predictions_svc))

0.9676190476190476


In [44]:
print(accuracy_score(y_test,predictions_lr))

0.9561904761904761


In [45]:
print(confusion_matrix(y_test,predictions_nb))

[[293   3]
 [ 19 210]]


In [46]:
print(confusion_matrix(y_test,predictions_svc))

[[290   6]
 [ 11 218]]


In [47]:
print(confusion_matrix(y_test,predictions_lr))

[[291   5]
 [ 18 211]]
