In [1]:
# Importing tools 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Import data set
df = pd.read_csv("data/spam2.csv", sep=",", encoding="ISO-8859-1")

# Drop the last three columns
df = df.drop(list(df.columns[2:]), axis=1)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Rename the columns as 'label' and 'message'
df = df.rename(columns = {"v1": "label", "v2": "message"})

In [4]:
df["label"]

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [5]:
# Turning label data into 0s and 1s
df["label"] = df["label"].map({"ham" : 0, "spam" : 1})

In [6]:
df["label"]

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64

In [7]:
# Instantiate the model
model = MultinomialNB()

In [8]:
# Split the data 
X = df["message"]
y = df["label"]

# Transform the messages into vectors with TfidVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
X

<5572x8672 sparse matrix of type '<class 'numpy.float64'>'
	with 73916 stored elements in Compressed Sparse Row format>

In [9]:
# Split the data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the model
model.fit(X_train, y_train)

In [10]:
# Score the model
model.score(X_test, y_test)

0.9614349775784753

In [11]:
# Make the preds
y_pred = model.predict(X_test)

In [34]:
# Printing classification

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       974
           1       1.00      0.70      0.82       141

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [13]:
# Showing confusion matrix

cm = pd.DataFrame(confusion_matrix(y_test, y_pred))
cm

Unnamed: 0,0,1
0,974,0
1,43,98


In [26]:
def predict(sms, model=model):
    """
    Function that tranforms a sms message and return the prediction for the message.
    """
    transformed_sms = vectorizer.transform([sms])
    
    pred = model.predict(transformed_sms)
    return "Spam" if pred[0] == 1 else "Ham"

In [35]:
# Example of prediction
print(predict("Congratulations! You’ve won a $500 Amazon gift card."))

Spam
