In [3]:
# Part 1: Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
# File names
files = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv"
]

# Read and merge into one DataFrame
dataframes = [pd.read_csv(f) for f in files]
df = pd.concat(dataframes, ignore_index=True)

# Display dataset information
print("Dataset successfully loaded and combined.")
print(f"Total rows and columns: {df.shape}")
print(f"Column names: {df.columns.tolist()}")
print("\nFirst 3 sample rows:")
print(df.head(3))

Dataset successfully loaded and combined.
Total rows and columns: (4725012, 10)
Column names: ['kind', 'commentId', 'channelId', 'videoId', 'authorId', 'textOriginal', 'parentCommentId', 'likeCount', 'publishedAt', 'updatedAt']

First 3 sample rows:
              kind  commentId  channelId  videoId  authorId  \
0  youtube#comment    1781382      14492    74288   2032536   
1  youtube#comment     289571      14727    79618   3043229   
2  youtube#comment     569077       3314    51826    917006   

                                        textOriginal  parentCommentId  \
0  PLEASE LESBIAN FLAG I BEG YOU \n\nYou would ro...              NaN   
1   Apply mashed potato juice and mixed it with curd        3198066.0   
2                         69 missed calls from mars👽              NaN   

   likeCount                publishedAt                  updatedAt  
0          0  2023-08-15 21:48:52+00:00  2023-08-15 21:48:52+00:00  
1          0  2023-10-02 13:08:22+00:00  2023-10-02 13:08:22+00:00

In [6]:
# Part 2: Data Preprocessing
import re
import string

# Step 1: Keep only relevant column for spam detection
comments = df[['textOriginal']].copy()

# Step 2: Drop rows with missing text
comments.dropna(subset=['textOriginal'], inplace=True)
print(f"Remaining rows after dropping missing comments: {len(comments)}")

# Step 3: Define a function to clean text
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove mentions (@username) and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Lowercase
    text = text.lower()
    # Remove extra whitespace
    text = text.strip()
    return text

# Step 4: Apply cleaning
comments['cleaned_text'] = comments['textOriginal'].apply(clean_text)

# Step 5: Preview
print("\nSample cleaned comments:")
print(comments[['textOriginal', 'cleaned_text']].head(5))


Remaining rows after dropping missing comments: 4724755

Sample cleaned comments:
                                        textOriginal  \
0  PLEASE LESBIAN FLAG I BEG YOU \n\nYou would ro...   
1   Apply mashed potato juice and mixed it with curd   
2                         69 missed calls from mars👽   
3                                               Baaa   
4    you look like raven from phenomena raven no cap   

                                        cleaned_text  
0  please lesbian flag i beg you \n\nyou would ro...  
1   apply mashed potato juice and mixed it with curd  
2                            missed calls from mars👽  
3                                               baaa  
4    you look like raven from phenomena raven no cap  


Train Spam Dataset

import pandas as pd

# Load dataset
df = pd.read_csv("Youtube-Spam-Dataset.csv")

# Check first rows
print(df.head())

# Check class distribution (spam vs not spam)
print(df['CLASS'].value_counts())

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)  # remove punctuation/numbers
    words = text.split()
    words = [stemmer.stem(w) for w in words if w not in stop_words]
    return " ".join(words)

df["cleaned"] = df["CONTENT"].apply(clean_text)
print(df[["CONTENT", "cleaned"]].head())

In [None]:
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# -----------------------------
# 1. Load Spam Dataset
# -----------------------------
df = pd.read_csv("Youtube-Spam-Dataset.csv")

# Preprocess text
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)  # remove punctuation/numbers
    words = text.split()
    words = [stemmer.stem(w) for w in words if w not in stop_words]
    return " ".join(words)

df["cleaned"] = df["CONTENT"].apply(clean_text)

# -----------------------------
# 2. Vectorize + Train Random Forest
# -----------------------------
X = df["cleaned"]
y = df["CLASS"]

vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# -----------------------------
# 3. Save Model + Vectorizer
# -----------------------------
pickle.dump(rf, open("rf_model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.8801020408163265
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.81      0.86       176
           1       0.86      0.94      0.90       216

    accuracy                           0.88       392
   macro avg       0.89      0.87      0.88       392
weighted avg       0.88      0.88      0.88       392



In [25]:
import pickle

# Load the trained model and vectorizer
with open("rf_model.pkl", "rb") as f:
    rf_model = pickle.load(f)

with open("vectorizer.pkl", "rb") as f:
    tfidf_vectorizer = pickle.load(f)

# Transform the cleaned comments using the same TF-IDF vectorizer
X_comments = tfidf_vectorizer.transform(comments["cleaned_text"])

# Apply model to predict
comments["prediction"] = rf_model.predict(X_comments)

# If you want probability scores too
comments["probability_spam"] = rf_model.predict_proba(X_comments)[:, 1]

# Preview results
print(comments[["textOriginal", "cleaned_text", "prediction", "probability_spam"]].head(10))

# Save results to CSV
comments.to_csv("comments_with_predictions.csv", index=False)
print("✅ Predictions saved to comments_with_predictions.csv")


                                        textOriginal  \
0  PLEASE LESBIAN FLAG I BEG YOU \n\nYou would ro...   
1   Apply mashed potato juice and mixed it with curd   
2                         69 missed calls from mars👽   
3                                               Baaa   
4    you look like raven from phenomena raven no cap   
5                                           American   
6         Sahi disha me ja ja raha india ka Future..   
7                                         ❤❤❤❤❤❤❤❤❤❤   
8                    Love your videos. Thank you ❤❤❤   
9  India is  the best and  very beautiful 😍😍😍😍😍😍😍...   

                                        cleaned_text  prediction  \
0  please lesbian flag i beg you \n\nyou would ro...           0   
1   apply mashed potato juice and mixed it with curd           1   
2                            missed calls from mars👽           1   
3                                               baaa           1   
4    you look like raven from phenomena rav

In [26]:
print(comments['prediction'].value_counts())
print(comments.groupby('prediction')['probability_spam'].mean())

prediction
1    3158459
0    1566296
Name: count, dtype: int64
prediction
0    0.244680
1    0.755141
Name: probability_spam, dtype: float64


In [27]:
import streamlit as st

user_input = st.text_area("Enter a comment:")
if st.button("Check Spam"):
    X = vectorizer.transform([user_input])
    prob = model.predict_proba(X)[0,1]
    st.write("Spam probability:", round(prob,2))
    st.write("Prediction:", "Spam" if prob > 0.5 else "Not Spam")

2025-09-01 15:53:01.017 
  command:

    streamlit run c:\Users\Acer\anaconda3\new\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
