In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

from wordcloud import WordCloud


In [None]:
# Read the raw message file, decoding it as ISO-8859-1.
data = pd.read_csv(filepath_or_buffer='./spam.csv', encoding='ISO-8859-1')

# Preview the first five rows.
data.head(5)

In [None]:
# Disable column-width truncation so the full message text is shown in frames.
pd.set_option('display.max_colwidth', None)

# Drop the three 'Unnamed' columns and give the remaining two readable names.
# `data` is rebound (not mutated in place) and missing columns are ignored, so
# the cell is idempotent: re-running it after the columns have already been
# dropped/renamed no longer raises KeyError.
data = (
    data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

print('The dataset has {} rows and {} columns.'.format(data.shape[0],data.shape[1]))

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that repeat an earlier row exactly.
data.duplicated(keep='first').sum()

In [None]:
# Peek at a few of the rows flagged as repeats of an earlier row.
data.loc[data.duplicated()].head()

In [None]:
# Remove repeated rows in place, retaining the first occurrence of each.
data.drop_duplicates(keep='first', inplace=True)

# Show the duplicate count again to confirm the removal.
data.duplicated(keep='first').sum()

In [None]:
# Plot how many messages fall into each target class.
sns.histplot(x='label', data=data)

# Axis labels and headline for the figure.
plt.xlabel(xlabel='target classes', fontsize=15)
plt.ylabel(ylabel='count', fontsize=15)
plt.title(label='Distribution of Target Classes', fontsize=25)
plt.tight_layout()

In [None]:
# Concatenate every 'ham' message into one string and build a word cloud from it.
text = ' '.join(data.loc[data['label'] == 'ham', 'text'])
wordcloud = WordCloud(max_words=40, max_font_size=50).generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud)
plt.axis("off")
plt.title("Wordcloud for 'Ham' Messages", fontsize=20)
plt.show()

In [None]:
# Concatenate every 'spam' message into one string and build a word cloud from it.
text = ' '.join(data.loc[data['label'] == 'spam', 'text'])
wordcloud = WordCloud(max_words=40, max_font_size=50).generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud)
plt.axis("off")
plt.title("Wordcloud for 'Spam' Messages", fontsize=20)
plt.show()

In [None]:
# Features are the message text; the target is the label column.
X, y = data['text'], data['label']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_train.head(n=2)

In [None]:
def get_test_scores(model_name:str,preds,y_test_data):
    '''
    Build a one-row summary table of classification scores.

    Precision, recall and F1 are macro-averaged over the classes; accuracy is
    the plain fraction of correct predictions.

    In:
        model_name (string): label placed in the 'model' column of the table
        preds: predicted labels
        y_test_data: true labels to score the predictions against

    Out:
        table: a one-row pandas df with columns
               model, precision, recall, F1, accuracy
    '''
    # Output column name -> scorer that takes an `average` argument.
    macro_scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'F1': f1_score,
    }

    row = {'model': model_name}
    for column, scorer in macro_scorers.items():
        row[column] = scorer(y_test_data, preds, average='macro')
    row['accuracy'] = accuracy_score(y_test_data, preds)

    return pd.DataFrame([row])

In [None]:
# Bag-of-words features: learn the vocabulary on the training text only, then
# reuse that vocabulary to encode the test text.
# NOTE(review): decode_error only applies when the input is bytes; presumably the
# messages are already str here, in which case the option has no effect - confirm.
featurizer = CountVectorizer(decode_error='ignore')

X_train2 = featurizer.fit_transform(raw_documents=X_train)
X_test2 = featurizer.transform(raw_documents=X_test)

In [None]:
# Fit a multinomial Naive Bayes classifier on the count features.
MNB = MultinomialNB().fit(X_train2, y_train)

# Score the fitted model on the data it was trained on ...
MNB_train_preds = MNB.predict(X_train2)
MNB_train_results = get_test_scores(
    model_name='MNB (train)', preds=MNB_train_preds, y_test_data=y_train
)

# ... and on the held-out test split.
MNB_test_preds = MNB.predict(X_test2)
MNB_test_results = get_test_scores(
    model_name='MNB (test)', preds=MNB_test_preds, y_test_data=y_test
)

# Stack the two one-row tables so train and test scores can be compared.
MNB_results = pd.concat([MNB_train_results, MNB_test_results])
MNB_results

In [None]:
# Confusion matrix of the test predictions; rows are true classes and columns
# are predicted classes, both ordered as in MNB.classes_.
cm = confusion_matrix(y_test, MNB_test_preds, labels=MNB.classes_)

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' format
# would render larger counts in scientific notation (e.g. 1.3e+03).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set_title('Confusion Matrix (CountVectorizer + MultinomialNB)', fontsize=16)
# Take the tick labels from the same class order used to build the matrix so
# the axes cannot drift out of sync with the data.
ax.xaxis.set_ticklabels(list(MNB.classes_), fontsize=12)
ax.yaxis.set_ticklabels(list(MNB.classes_), fontsize=12)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Target", fontsize=14)

plt.tight_layout()

In [None]:
# Column 1 of predict_proba holds the probability of the second entry in MNB.classes_.
prob_train = MNB.predict_proba(X_train2)[:, 1]
prob_test = MNB.predict_proba(X_test2)[:, 1]

# Area under the ROC curve for each split.
print("train AUC:", roc_auc_score(y_true=y_train, y_score=prob_train))
print("test AUC:", roc_auc_score(y_true=y_test, y_score=prob_test))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the string labels to integer codes.
encoder = LabelEncoder()

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the test labels. Re-fitting on y_test could assign
# different codes if the test split's label set differed from the training
# split's; transform() raises instead if the test split has an unseen label.
y_train2 = encoder.fit_transform(y_train)
y_test2  = encoder.transform(y_test)

In [None]:
from sklearn.metrics import roc_curve

def plot_roc_curve(true_y,y_prob,text):
    """
    Plot the ROC curve for a set of predicted scores using pyplot.

    In:
        true_y: true binary labels
        y_prob: predicted scores/probabilities passed to roc_curve
        text (string): suffix appended to the plot title
    """
    # roc_curve also returns the thresholds, which the plot does not need.
    false_pos_rate, true_pos_rate, _ = roc_curve(true_y, y_prob)

    plt.plot(false_pos_rate, true_pos_rate)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve {text}')
    plt.tight_layout()

# ROC curve of the test-set probabilities against the integer-encoded test labels.
plot_roc_curve(
    true_y=y_test2,
    y_prob=prob_test,
    text='(CountVectorizer + MultinomialNB)',
)

In [None]:
# TF-IDF features: fit the vocabulary and IDF weights on the training text only,
# then apply them unchanged to the test text.
featurizer = TfidfVectorizer(decode_error='ignore')

X_train3 = featurizer.fit_transform(raw_documents=X_train)
X_test3 = featurizer.transform(raw_documents=X_test)

In [None]:
# Fit a fresh multinomial Naive Bayes classifier, this time on the TF-IDF features.
MNB = MultinomialNB().fit(X_train3, y_train)

# Score the fitted model on the data it was trained on ...
MNB_train_preds2 = MNB.predict(X_train3)
MNB_train_results2 = get_test_scores(
    model_name='MNB (train)', preds=MNB_train_preds2, y_test_data=y_train
)

# ... and on the held-out test split.
MNB_test_preds2 = MNB.predict(X_test3)
MNB_test_results2 = get_test_scores(
    model_name='MNB (test)', preds=MNB_test_preds2, y_test_data=y_test
)

# Stack the two one-row tables so train and test scores can be compared.
MNB_results2 = pd.concat([MNB_train_results2, MNB_test_results2])
MNB_results2

In [None]:
# Confusion matrix of the TF-IDF model's test predictions; rows are true classes
# and columns are predicted classes, both ordered as in MNB.classes_.
cm = confusion_matrix(y_test, MNB_test_preds2, labels=MNB.classes_)

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' format
# would render larger counts in scientific notation (e.g. 1.3e+03).
ax = sns.heatmap(cm, annot=True, fmt='d')
# These predictions come from the TfidfVectorizer features, so the title names
# that featurizer (it previously said CountVectorizer).
ax.set_title('Confusion Matrix (TfidfVectorizer + MultinomialNB)', fontsize=16)
# Take the tick labels from the same class order used to build the matrix so
# the axes cannot drift out of sync with the data.
ax.xaxis.set_ticklabels(list(MNB.classes_), fontsize=12)
ax.yaxis.set_ticklabels(list(MNB.classes_), fontsize=12)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Target", fontsize=14)

plt.tight_layout()