In [1]:
# You may need to install libraries
! pip install pandas
! pip install nltk
! pip install scikit-learn

# Import libraries
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

Collecting pandas
  Downloading pandas-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl (12.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
Installing collected packages: pandas
Successfully installed pandas-1.5.2
Collecting scikit-learn
  Downloading scikit_learn-1.1.3-cp310-cp310-macosx_10_9_x86_64.whl (8.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m788.6 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.1.3 threadpoolctl-3.1.0


In [20]:
# Load the SMS dataset from spam.csv, decoding it as latin-1 and letting
# pandas create a default integer index
read_options = {"encoding": "latin-1", "index_col": None}
messages = pd.read_csv("spam.csv", **read_options)
messages.head()

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loaded once."""
    return frozenset(stopwords.words("english"))


def text_preprocess(message):
    """Normalize a raw message into a list of lowercase word tokens.

    Characters found in ``string.punctuation`` are removed, the remaining
    text is lower-cased and split on whitespace, and every token that is an
    English stopword or is not purely alphabetic is discarded.

    Args:
        message: The raw message text.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Cached set lookup: the original re-read the stopword list and scanned
    # it linearly for every single token.
    stop_set = _english_stopwords()

    # Remove punctuation, then normalize case for the whole message at once
    nopunc = "".join(char for char in message if char not in string.punctuation)
    nopunc = nopunc.lower()

    # Drop stopwords and tokens containing non-alphabetic characters
    return [
        word
        for word in nopunc.split()
        if word not in stop_set and word.isalpha()
    ]

In [21]:
# Select the message text (v2) for each label value in v1
is_spam = messages["v1"] == "spam"
is_ham = messages["v1"] == "ham"
spam_messages = messages.loc[is_spam, "v2"]
ham_messages = messages.loc[is_ham, "v2"]

spam_count = len(spam_messages)
ham_count = len(ham_messages)
print(f"Number of spam messages: {spam_count}")
print(f"Number of ham messages: {ham_count}")

Number of spam messages: 747
Number of ham messages: 4825


In [22]:
# Fetch the NLTK stopword corpus used by text_preprocess
nltk.download('stopwords')

# Flatten the preprocessed tokens of every spam message into one list
spam_words = [
    token
    for each_message in spam_messages
    for token in text_preprocess(each_message)
]

# Ten most frequent tokens across all spam messages
top_spam_words = pd.Series(spam_words).value_counts().head(10)
print(f"Top 10 spam words are:\n {top_spam_words}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aamingem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 10 spam words are:
 call      347
free      216
txt       150
u         147
ur        144
mobile    123
text      120
claim     113
stop      113
reply     101
dtype: int64


In [23]:
# Flatten the preprocessed tokens of every ham message into one list
ham_words = [
    token
    for each_message in ham_messages
    for token in text_preprocess(each_message)
]

# Ten most frequent tokens across all ham messages
top_ham_words = pd.Series(ham_words).value_counts().head(10)
print(f"Top 10 ham words are:\n {top_ham_words}")

Top 10 ham words are:
 u       972
im      449
get     303
ltgt    276
ok      272
dont    257
go      247
ur      240
ill     236
know    232
dtype: int64


In [25]:
# Replace each raw message in v2 with its list of cleaned tokens
# (punctuation and stopwords removed by text_preprocess)
cleaned_tokens = messages["v2"].map(text_preprocess)
messages["v2"] = cleaned_tokens
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"[go, jurong, point, crazy, available, bugis, n...",,,
1,ham,"[ok, lar, joking, wif, u, oni]",,,
2,spam,"[free, entry, wkly, comp, win, fa, cup, final,...",,,
3,ham,"[u, dun, say, early, hor, u, c, already, say]",,,
4,ham,"[nah, dont, think, goes, usf, lives, around, t...",,,


In [28]:
# Convert messages (as lists of string tokens) to space-separated strings.
# Series.map applies the function to each element. Series.agg is a reduction
# API: it only worked here through an elementwise fallback that recent pandas
# versions deprecate, after which the lambda would receive the whole column.
messages["v2"] = messages["v2"].map(lambda tokens: " ".join(map(str, tokens)))
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,go jurong point crazy available bugis n great ...,,,
1,ham,ok lar joking wif u oni,,,
2,spam,free entry wkly comp win fa cup final tkts may...,,,
3,ham,u dun say early hor u c already say,,,
4,ham,nah dont think goes usf lives around though,,,


In [29]:
# Initialize count vectorizer and learn the vocabulary from the cleaned messages
vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(messages["v2"])

# Fetch the vocabulary set.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so convert
# it to a plain list to keep the printed format unchanged.
feature_names = vectorizer.get_feature_names_out().tolist()
print(f"20 BOW Features: {feature_names[20:40]}")
print(f"Total number of vocab words: {len(vectorizer.vocabulary_)}")

20 BOW Features: ['absence', 'absolutely', 'abstract', 'abt', 'abta', 'aburo', 'abuse', 'abusers', 'ac', 'academic', 'acc', 'accent', 'accenture', 'accept', 'access', 'accessible', 'accidant', 'accident', 'accidentally', 'accommodation']
Total number of vocab words: 8084




In [31]:
# Encode every cleaned message as a bag-of-words count vector
messages_bow = bow_transformer.transform(messages["v2"])

# Report the dimensions of the sparse matrix and how many entries are non-zero
bow_shape = messages_bow.shape
bow_nonzero = messages_bow.nnz
print(f"Shape of sparse matrix: {bow_shape}")
print(f"Amount of non-zero occurrences: {bow_nonzero}")

Shape of sparse matrix: (5572, 8084)
Amount of non-zero occurrences: 44211


In [32]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the IDF weights on the full BoW matrix and re-weight it in one step
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)
print(messages_tfidf.shape)

(5572, 8084)


In [34]:
# Encode the string labels in v1 as integer codes; pd.factorize numbers the
# distinct values in order of first appearance
FactorResult = pd.factorize(messages["v1"])
label_codes, _label_values = FactorResult
messages["v1"] = label_codes
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,go jurong point crazy available bugis n great ...,,,
1,0,ok lar joking wif u oni,,,
2,1,free entry wkly comp win fa cup final tkts may...,,,
3,0,u dun say early hor u c already say,,,
4,0,nah dont think goes usf lives around though,,,


In [36]:
# Split the dataset to train and test sets.
# random_state pins the shuffle so a re-run reproduces the same split (and
# therefore the same downstream accuracies); stratify keeps the spam/ham
# ratio the same in both partitions, since spam is the minority class.
# NOTE(review): the vectorizer and TF-IDF transformer were fitted on the full
# dataset before this split, so test-set vocabulary/IDF statistics leak into
# the features; consider fitting them on the training messages only.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages_tfidf,
    messages["v1"],
    test_size=0.2,
    random_state=42,
    stratify=messages["v1"],
)

print(f"train dataset features size: {msg_train.shape}")
print(f"train dataset label size: {label_train.shape}")

print(f"test dataset features size: {msg_test.shape}")
print(f"test dataset label size: {label_test.shape}")

train dataset features size: (4457, 8084)
train dataset label size: (4457,)
test dataset features size: (1115, 8084)
test dataset label size: (1115,)


In [37]:
# Install xgboost library
! pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.1-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m313.1 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.1


In [38]:
# Train an xgboost classifier
from xgboost import XGBClassifier

# Classifier with the library's default hyperparameters
clf = XGBClassifier()

# Learn from the TF-IDF training features and their integer labels
clf.fit(X=msg_train, y=label_train)

In [39]:
# Predict on the same data the model was trained on and score it
predict_train = clf.predict(msg_train)
train_accuracy = metrics.accuracy_score(label_train, predict_train)

print(f"Accuracy of Train dataset: {train_accuracy:0.3f}")

Accuracy of Train dataset: 0.989


In [41]:
# An example prediction: push one stored message through the
# BoW -> TF-IDF -> classifier pipeline and compare with its label
sample_text = messages["v2"][9]
sample_bow = bow_transformer.transform([sample_text])
sample_tfidf = tfidf_transformer.transform(sample_bow)

print("predicted:", clf.predict(sample_tfidf)[0])
print("expected:", messages["v1"][9])

predicted: 1
expected: 1


In [42]:
# Score the classifier on the held-out test split
label_predictions = clf.predict(msg_test)
test_accuracy = metrics.accuracy_score(label_test, label_predictions)
print(f"Accuracy of the model: {test_accuracy:0.3f}")

Accuracy of the model: 0.970


In [43]:
import joblib

# File the trained classifier is persisted to
MODEL_PATH = "spam"

# Save model: the trained classifier is `clf`. The original passed `xgb`,
# which was never defined and raised NameError.
joblib.dump(clf, MODEL_PATH)

# Load saved model from the same path (the original used an undefined `filename`).
# NOTE: joblib.load unpickles data and can execute arbitrary code; only load
# files you created or trust.
xgb = joblib.load(MODEL_PATH)