In [1]:
import pandas as pd

import os
for dirname, _, filenames in os.walk('/spam-detection'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
ytdata = pd.read_csv("Youtube-Spam-Dataset.csv")[['CONTENT', 'CLASS']]
ytdata.head()

Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [3]:
ytdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1956 entries, 0 to 1955
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   CONTENT  1956 non-null   object
 1   CLASS    1956 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 30.7+ KB


In [4]:
ytdata.isnull().sum()

CONTENT    0
CLASS      0
dtype: int64

In [5]:
tgpsdf = pd.read_csv('Transformation-GPS-Cycle-2_26052025 dataset.csv')[['English_Translation', 'Is_Spam']]
tgpsdf.head()

Unnamed: 0,English_Translation,Is_Spam
0,i m considering to responses the current busin...,0
1,but not only. we need to talk in small groups ...,0
2,ON,1
3,"Present a career plan to aim for, and think to...",0
4,Great support,1


In [6]:
tgpsdf.isnull().sum()

English_Translation    37
Is_Spam                 0
dtype: int64

In [7]:
data = pd.concat([ytdata, tgpsdf.rename(columns={'English_Translation':'CONTENT',\
                            'Is_Spam':'CLASS'})], ignore_index=True)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re

In [9]:
def clean_text(text: str) -> str:
    text = str(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = text.lower().strip()
    return text

In [10]:
data.info

<bound method DataFrame.info of                                                 CONTENT  CLASS
0     Huh, anyway check out this you[tube] channel: ...      1
1     Hey guys check out my new channel and our firs...      1
2                just for test I have to say murdev.com      1
3      me shaking my sexy ass on my channel enjoy ^_^ ﻿      1
4               watch?v=vtaRGgvGtWQ   Check this out .﻿      1
...                                                 ...    ...
3948  Faster decision making re new leaders of new u...      0
3949  More change in the function dimension expected...      0
3950                                        Communicate      1
3951  Continue to communicate where we in TSC will l...      0
3952  I believe that in this new stage it is not onl...      0

[3953 rows x 2 columns]>

In [11]:
data['CONTENT'] = data['CONTENT'].apply(clean_text)
X = data['CONTENT']
y = data['CLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [12]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [13]:
model.fit(X_train_vect, y_train)

In [14]:
y_pred = model.predict(X_test_vect)

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92       524
           1       0.95      0.69      0.80       267

    accuracy                           0.88       791
   macro avg       0.90      0.83      0.86       791
weighted avg       0.89      0.88      0.88       791



In [16]:

import joblib

# Save the model to a file
joblib.dump(model, 'naive_bayes_model.pkl')

# Save the vectorizer to a file
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [17]:
# Load the model and vectorizer from the files
model = joblib.load('naive_bayes_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

def classify_comment(comment):
    # Transform the input text using the loaded vectorizer
    comment_vec = vectorizer.transform([comment])
    
    # Predict the class using the loaded model
    prediction = model.predict(comment_vec)
    
    # Return the prediction
    return prediction[0]



In [18]:

# Example usage
comment = "ON"
result = classify_comment(comment)
print(f'The comment is classified as: {result}')

The comment is classified as: 1
