In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load the raw dataset from 'spam.csv' (path is relative to the working directory).
# NOTE(review): encoding='latin-1' is presumably required because the file is not valid UTF-8 — confirm.
df = pd.read_csv('spam.csv', encoding='latin-1')

In [None]:
# Keep only the message text and its label, under descriptive column names.
# The rename happens first and the selection uses the new names, so this cell
# can be re-run safely: DataFrame.rename ignores labels that are no longer
# present, making a second run a no-op instead of a KeyError on 'v2'/'v1'.
# It also avoids an in-place rename on a column subset of the loaded frame.
df = df.rename(columns={'v2': 'messages', 'v1': 'category'})[['messages', 'category']]

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fetch the NLTK stop-word corpus only when it is not already installed, so that
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
# English stop words, held in a set for fast membership tests in cleaning_text.
words = {*stopwords.words('english')}

def cleaning_text(text, stop_words=None):
    """Normalise a raw message for bag-of-words vectorisation.

    The text is lower-cased, every character other than an ASCII letter or
    digit is replaced by a space, and stop words are dropped. The remaining
    tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (for example ``None`` or the NaN
        pandas uses for a missing cell) is treated as an empty message.
    stop_words : collection of str, optional
        Lower-case tokens to remove. Defaults to the module-level ``words`` set.

    Returns
    -------
    str
        The cleaned message; an empty string if no token survives.
    """
    # Missing cells arrive as float NaN / None and have no .lower(); treat them
    # as empty text instead of raising AttributeError inside Series.apply.
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = words
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text.lower())
    return " ".join(token for token in text.split() if token not in stop_words)

In [None]:
# Store the normalised text next to the raw message.
df['clean_messages'] = df['messages'].map(cleaning_text)

# Modelling frame: cleaned text plus its label.
df_clean = df[['clean_messages', 'category']]
df_clean.head()

In [None]:
# Take features and labels from the same frame so they cannot drift out of
# alignment if one of the two frames is later filtered or reordered.
X = df_clean['clean_messages']
y = df_clean['category']

# Stratified 75/25 split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes classifier built with its default settings; get_params() is the
# cell's last expression so those settings are displayed for reference.
multi = MultinomialNB()
multi.get_params()

In [None]:
# Token counts -> TF-IDF weighting -> the classifier defined above.
pipe_model = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', multi),
])

# Fit on the training split; the value returned by fit() is kept as `model`.
model = pipe_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test messages with the fitted pipeline.
prediction = model.predict(X_test)

In [None]:
# accuracy_score's documented signature is (y_true, y_pred), so the ground
# truth goes first. Accuracy is symmetric, so the value is unchanged, but the
# documented order stays correct if this is swapped for an asymmetric metric.
acc = accuracy_score(y_test, prediction)
print("Score of model: {}".format(acc))

In [None]:
from yellowbrick.classifier import ConfusionMatrix

# Confusion-matrix visualiser wrapped around the pipeline: fit on the training
# split, score on the test split, then render the figure.
# NOTE(review): `model` was already fitted in an earlier cell; whether fit()
# here retrains it depends on yellowbrick's fitted-estimator handling — confirm.
cm = ConfusionMatrix(model)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
cm.show()

In [None]:
from yellowbrick.classifier import ClassificationReport

# Classification-report visualiser wrapped around the pipeline: fit on the
# training split, score on the test split, then render the figure.
# NOTE(review): support=True presumably adds per-class sample counts to the
# report — confirm against the yellowbrick documentation.
cr = ClassificationReport(model, support=True)
cr.fit(X_train, y_train)
cr.score(X_test, y_test)
cr.show()

In [None]:
# Ad-hoc messages to classify with the fitted pipeline.
new_dataset = np.array(['make jahdf hjjd', 'jaha got yout u'])

# The pipeline was fitted on text that had been through cleaning_text, so new
# messages get the same preprocessing before prediction; otherwise stop words
# and punctuation would reach the vectoriser only at inference time.
model.predict([cleaning_text(message) for message in new_dataset])