### Download the nltk data only once

In [2]:
# nltk is imported in this cell too because it runs before the notebook's main
# import cell; without it a fresh kernel executed top to bottom fails here with
# NameError.
import nltk

# Resources used further down: the stop-word list, the Punkt tokenizer data and
# the WordNet data behind the lemmatizer. Already-present packages are reported
# as up-to-date rather than downloaded again.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/aumii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/aumii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aumii/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Fetch the punkt_tab tokenizer resource; presumably this is what
# nltk.word_tokenize loads in the installed NLTK release — TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/aumii/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Import the libraries

In [27]:
from pathlib import Path

import joblib
import nltk
import pandas as pd, numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

### Load dataset

In [2]:
# Read the raw spam/ham messages from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../dataset/dataset.csv')

In [3]:
# Preview the loaded frame; the rendered output is truncated to the first and
# last rows.
data

Unnamed: 0,text_type,text
0,spam,naturally irresistible your corporate identity...
1,spam,the stock trading gunslinger fanny is merrill ...
2,spam,unbelievable new homes made easy im wanting to...
3,spam,4 color printing special request additional in...
4,spam,do not have money get software cds from here s...
...,...,...
20343,ham,/ban
20344,ham,/ban
20345,ham,/ban
20346,ham,Kaisi hii


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(20348, 2)

In [5]:
# Show the first 10 rows.
data.head(10)

Unnamed: 0,text_type,text
0,spam,naturally irresistible your corporate identity...
1,spam,the stock trading gunslinger fanny is merrill ...
2,spam,unbelievable new homes made easy im wanting to...
3,spam,4 color printing special request additional in...
4,spam,do not have money get software cds from here s...
5,spam,great nnews hello welcome to medzonline sh gro...
6,spam,here s a hot play in motion homeland security ...
7,spam,save your money buy getting this thing here yo...
8,spam,undeliverable home based business for grownups...
9,spam,save your money buy getting this thing here yo...


### Text preprocessing

In [6]:
# Lemmatizer instance reused by every call to preprocess_text.
lemmatizer = WordNetLemmatizer()
# English stop words kept in a set so the per-token membership test is cheap.
stop_words = {*stopwords.words('english')}

In [7]:
def preprocess_text(text):
    """Normalize one message into a space-joined string of lemmatized tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in ``stop_words``, are
    dropped; the remaining tokens are lemmatized with ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example ``None`` or the
        float NaN that pandas produces for an empty CSV cell) is treated as an
        empty message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when no token
        survives filtering or when ``text`` is not a string.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); treat them as an
    # empty document instead of raising AttributeError. Returning "" (rather
    # than str(text)) avoids turning NaN into the alphabetic token "nan".
    if not isinstance(text, str):
        return ""
    # tokenize
    tokens = nltk.word_tokenize(text.lower())
    # drop non-alphabetic tokens and stop words, lemmatizing what remains
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

In [11]:
# Clean every message (stop words removed, tokens lemmatized) and store the
# result in a new 'cleaned_text' column next to the raw text.
raw_messages = data['text']
data['cleaned_text'] = raw_messages.apply(preprocess_text)

In [12]:
# Inspect the frame with the new cleaned_text column. Messages with no
# alphabetic, non-stop-word tokens (e.g. '/ban') end up as empty strings.
data

Unnamed: 0,text_type,text,cleaned_text
0,spam,naturally irresistible your corporate identity...,naturally irresistible corporate identity lt r...
1,spam,the stock trading gunslinger fanny is merrill ...,stock trading gunslinger fanny merrill muzo co...
2,spam,unbelievable new homes made easy im wanting to...,unbelievable new home made easy im wanting sho...
3,spam,4 color printing special request additional in...,color printing special request additional info...
4,spam,do not have money get software cds from here s...,money get software cd software compatibility g...
...,...,...,...
20343,ham,/ban,
20344,ham,/ban,
20345,ham,/ban,
20346,ham,Kaisi hii,kaisi hii


### Data split, model training and prediction

In [14]:
# Features are the cleaned messages; labels are the 'spam' / 'ham' classes.
x, y = data['cleaned_text'], data['text_type']

In [16]:
# Feature series (cleaned text) that will be vectorized.
x

0        naturally irresistible corporate identity lt r...
1        stock trading gunslinger fanny merrill muzo co...
2        unbelievable new home made easy im wanting sho...
3        color printing special request additional info...
4        money get software cd software compatibility g...
                               ...                        
20343                                                     
20344                                                     
20345                                                     
20346                                            kaisi hii
20347                                              shock q
Name: cleaned_text, Length: 20348, dtype: object

In [17]:
# Label series holding the 'spam' / 'ham' classes.
y

0        spam
1        spam
2        spam
3        spam
4        spam
         ... 
20343     ham
20344     ham
20345     ham
20346     ham
20347     ham
Name: text_type, Length: 20348, dtype: object

In [18]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [19]:
# Multinomial Naive Bayes classifier, constructed with its default arguments.
model = MultinomialNB()

In [22]:
# TF-IDF features with the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
# The vectorizer is fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics leak into the features.
x_train_text = vectorizer.fit_transform(raw_documents=x_train)
x_test_text = vectorizer.transform(raw_documents=x_test)

In [23]:
# Fit the classifier on the TF-IDF features of the training split.
model.fit(x_train_text, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [24]:
# Predict a label for every message in the held-out test split.
y_pred = model.predict(x_test_text)

In [25]:
# Fraction of held-out messages whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:", test_accuracy)

Accuracy score: 0.9203931203931204


In [26]:
# Print the label on its own line. The report is a multi-line, column-aligned
# table, and prefixing its first row with the label pushes the header out of
# alignment with the columns beneath it.
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:               precision    recall  f1-score   support

         ham       0.93      0.96      0.94      4338
        spam       0.89      0.83      0.86      1767

    accuracy                           0.92      6105
   macro avg       0.91      0.89      0.90      6105
weighted avg       0.92      0.92      0.92      6105



In [30]:
# Save the fitted model and vectorizer for later use. The target directory is
# created first because joblib.dump does not create missing parent directories
# and would otherwise fail on a fresh checkout.
Path('../model').mkdir(parents=True, exist_ok=True)
joblib.dump(model, '../model/model.pkl')
joblib.dump(vectorizer, '../model/vectorizer.pkl')

['../model/vectorizer.pkl']

In [31]:
# To load the saved model and vectorizer later, replace <location> with the .pkl paths used above:
# load_model = joblib.load('<location>')
# load_vectorizer = joblib.load('<location>')

Link to the dataset: https://www.kaggle.com/datasets/mexwell/telegram-spam-or-ham