In [26]:
import pandas as pd

In [27]:
# Load the raw dataset from spam.csv, decoding the file as ISO-8859-1.
df = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

In [28]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [29]:
# Keep only the v1 and v2 columns; the "Unnamed" columns are dropped here.
kept_columns = ['v1', 'v2']
df = df[kept_columns]

In [30]:
# Give the two remaining columns descriptive names.
df = df.rename(
    columns={
        'v1': 'target',
        'v2': 'email',
    }
)

In [31]:
# Class balance: how many rows carry each label.
df.loc[:, 'target'].value_counts()

ham     4825
spam     747
Name: target, dtype: int64

In [32]:
import text_hammer as th

In [33]:
def text_cleaning(df, col_name):
    """Clean the text in ``df[col_name]`` with text_hammer and return ``df``.

    Every value of the column is passed through these steps, in order:

    1. ``th.remove_html_tags``
    2. ``th.remove_urls``
    3. ``th.remove_stopwords``
    4. ``th.remove_special_chars``
    5. ``th.remove_accented_chars``
    6. ``th.make_base``

    The column is overwritten on ``df`` itself, so the caller's DataFrame is
    modified even when the return value is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col_name`` replaced by the cleaned text.
    """
    # Markup and URLs are stripped before remove_special_chars: once the
    # punctuation has been removed, a tag or URL is no longer recognisable as
    # one and its remnants would stay in the text as ordinary words.
    steps = (
        th.remove_html_tags,
        th.remove_urls,
        th.remove_stopwords,
        th.remove_special_chars,
        th.remove_accented_chars,
        th.make_base,
    )

    def _clean(value):
        # Run a single value through every step, feeding each result forward.
        for step in steps:
            value = step(value)
        return value

    # One pass over the column instead of one pass per cleaning step.
    df[col_name] = df[col_name].apply(_clean)
    return df
    

In [34]:
# text_cleaning overwrites the column on df and hands back that same frame;
# rebinding it here makes the data flow explicit before displaying the result.
df = text_cleaning(df, 'email')
df

Unnamed: 0,target,email
0,ham,go jurong point crazy Available bugis n great ...
1,ham,ok lar Joking wif u oni
2,spam,free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,u dun early hor U c say
4,ham,nah I do not think go usf life
...,...,...
5567,spam,this 2nd time try 2 contact u U win a750 Pound...
5568,ham,will i _ b go esplanade fr home
5569,ham,pity mood that soany suggestion
5570,ham,the guy bitch I act like i d interested buying...


In [36]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# The identity entries for 0 and 1 make this cell safe to re-run: without them,
# a second execution would look up the already-encoded integers, find no match
# and turn the whole column into NaN.
df['target'] = df['target'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [38]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['email'],
    df['target'],
    test_size=0.2,
    random_state=42,
    stratify=df['target'],
)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# TF-IDF vectorizer created with its default settings; it is used as the
# feature-generation step of the pipeline built below.
vectorizer = TfidfVectorizer()

In [40]:
# Two-step pipeline: the vectorizer turns raw text into features, then a
# multinomial Naive Bayes model is fitted on them.
classifier = Pipeline(
    steps=[
        ('feature_generation', vectorizer),
        ('model', MultinomialNB()),
    ]
)

In [41]:
# Fit the vectorizer and the model together on the training split.
classifier.fit(X=x_train, y=y_train)

Pipeline(steps=[('feature_generation', TfidfVectorizer()),
                ('model', MultinomialNB())])

In [42]:
# Spot-check the fitted pipeline on two hand-written messages.
text = [
    'Hey Abhishek can we get together to watch a football match',
    'Free entry in 2 a weekly competition to win FA Cup final',
]
classifier.predict(text)

array([0, 1], dtype=int64)

In [43]:
from sklearn import metrics

In [47]:
# Predict on the test split once and reuse the result: the report below and
# the confusion matrix computed later both read y_pred, so they are guaranteed
# to describe the same predictions.
y_pred = classifier.predict(x_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       0.99      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [48]:
from sklearn.metrics import confusion_matrix

In [49]:
# Confusion matrix comparing the test-split labels with the stored predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[965,   1],
       [ 41, 108]], dtype=int64)