In [4]:
# 1. Imports and Setup
import pandas as pd
import numpy as np
import re
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# NLTK provides the English stop-word list; download the corpus, then import its reader.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Suppress every warning for the rest of the session.
import warnings
warnings.simplefilter("ignore")

# Load the small English spaCy model with the NER and parser components switched off.
disabled_components = ['ner', 'parser']
nlp = spacy.load('en_core_web_sm', disable=disabled_components)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# 2. Load Data
# Target columns, in the order every later report and score iterates over them.
labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Read the training CSV and discard its 'id' column in a single chained step.
df = pd.read_csv('/content/train.csv').drop(columns=['id'])


In [None]:
# 3. Preprocessing with SpaCy
stop_words = set(stopwords.words('english'))


def _lemmatize_doc(doc):
    """Return the space-joined lemmas of a spaCy doc's alphabetic, non-stop-word tokens."""
    return " ".join(
        token.lemma_
        for token in doc
        if token.is_alpha and token.text not in stop_words
    )


def spacy_preprocess(text):
    """Clean a single comment for the classifier.

    Parameters
    ----------
    text : str
        Raw comment text; it is lower-cased before being handed to ``nlp``.

    Returns
    -------
    str
        Lemmas of the tokens that are purely alphabetic and not in
        ``stop_words``, joined by single spaces (possibly an empty string).
    """
    return _lemmatize_doc(nlp(text.lower()))


# Clean the whole column in batches: nlp.pipe streams the texts through the
# model and yields docs in input order, which avoids the per-row call overhead
# of Series.apply(spacy_preprocess) on a large corpus.
# The column is overwritten in place, exactly as before; reload df before
# re-running this cell so already-cleaned text is not cleaned a second time.
lowered_comments = df['comment_text'].astype(str).str.lower().tolist()
df['comment_text'] = [_lemmatize_doc(doc) for doc in nlp.pipe(lowered_comments)]

In [5]:

# 4. Split Data
# Features are the comment strings; targets are the label columns.
X, y = df['comment_text'], df[labels]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [6]:
# 5. TF-IDF + Logistic Regression Pipeline
# Vectoriser: unigrams and bigrams, English stop words removed, ignoring terms
# seen in fewer than 3 documents or in more than 80% of them.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.8,
    stop_words='english',
)

# Classifier: one class-weight-balanced logistic regression per label.
base_estimator = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    C=5,
)
ovr_step = OneVsRestClassifier(base_estimator)

pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('clf', ovr_step)])

In [7]:
# 6. Train and Evaluate
pipeline.fit(X_train, y_train)

# Per-label probabilities are stored in y_prob; hard predictions feed the report.
y_prob = pipeline.predict_proba(X_test)
y_pred = pipeline.predict(X_test)

report_text = classification_report(y_test, y_pred, target_names=labels)
print("Classification Report:\n")
print(report_text)


Classification Report:

               precision    recall  f1-score   support

        toxic       0.73      0.81      0.77      3056
 severe_toxic       0.33      0.71      0.45       321
      obscene       0.76      0.84      0.80      1715
       threat       0.28      0.55      0.37        74
       insult       0.64      0.80      0.71      1614
identity_hate       0.35      0.60      0.45       294

    micro avg       0.65      0.80      0.72      7074
    macro avg       0.51      0.72      0.59      7074
 weighted avg       0.68      0.80      0.73      7074
  samples avg       0.06      0.07      0.07      7074



In [8]:
# 7. ROC AUC Scores
# Column idx of y_prob is scored against the true values of the idx-th label.
roc_scores = {}
for idx, label in enumerate(labels):
    true_column = y_test[label]
    prob_column = y_prob[:, idx]
    roc = roc_scores[label] = roc_auc_score(true_column, prob_column)
    print(f"ROC AUC for {label}: {roc:.4f}")


ROC AUC for toxic: 0.9699
ROC AUC for severe_toxic: 0.9789
ROC AUC for obscene: 0.9844
ROC AUC for threat: 0.9875
ROC AUC for insult: 0.9752
ROC AUC for identity_hate: 0.9680


In [9]:
# 8. Sample Predictions
def classify_comment(text):
    """Print the pipeline's prediction for every label on one comment.

    The text is cleaned with ``spacy_preprocess`` and passed to
    ``pipeline.predict`` as a one-item batch; each label is then printed
    next to its predicted value. Nothing is returned.
    """
    one_item_batch = [spacy_preprocess(text)]
    pred = pipeline.predict(one_item_batch)[0]
    report_lines = [f"{label:15}: {result}" for label, result in zip(labels, pred)]
    for line in report_lines:
        print(line)



In [11]:
# Demo: classify one sample comment.
# The heading's leading symbol had been mis-encoded; the intended emoji
# (U+1F50D) is written as a named escape so the source stays ASCII-safe.
print("\n\N{LEFT-POINTING MAGNIFYING GLASS} Example Prediction:")
classify_comment("You're a stupid idiot.")



🔍 Example Prediction:
toxic          : 1
severe_toxic   : 0
obscene        : 1
threat         : 0
insult         : 1
identity_hate  : 1


In [13]:
import pickle

# Persist the fitted pipeline (TF-IDF vectoriser + classifier) to disk.
# The spaCy cleaning step is not part of the pipeline object, so whoever loads
# this file must clean incoming text the same way before calling predict.
model_path = "toxic_model_1.pkl"
with open(model_path, "wb") as f:
    pickle.Pickler(f).dump(pipeline)
