# Toxic Comment Classification Challenge

This project implements a multi-label text classification model that detects toxic comments. It combines TF-IDF vectorization with one Logistic Regression classifier per label to score each comment for six types of toxicity: toxic, severe toxic, obscene, threat, insult, and identity hate.

Dataset: https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data

Hugging Face: https://huggingface.co/spaces/alperugurcan/toxic-comment-classifier

In [2]:
# 1. Import essential libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
import re

# 2. Load data
# Both files are zipped CSVs from the Kaggle competition input directory.
train_csv_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
test_csv_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# 3. Basic text preprocessing
# Compiled once at import time; clean_text is applied to every comment.
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a raw comment for TF-IDF vectorization.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw comment. Expected to be a ``str``; a missing value
            (``None``, NaN, ``pd.NA``) is treated as an empty comment and
            any other scalar is converted with ``str()``.

    Returns:
        The cleaned comment as a ``str`` (possibly empty).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN/None; without this guard
        # ``text.lower()`` raises AttributeError.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    # Punctuation is deleted (not replaced by a space), so "don't" -> "dont".
    text = _NON_WORD_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text.strip()

# Inputs are the cleaned comment texts; targets are the six toxicity label columns.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_comments = train['comment_text']

X = raw_comments.apply(clean_text)
y = train[label_columns]

# 4. Improved TF-IDF settings
vectorizer = TfidfVectorizer(
    analyzer='word',          # tokenize into words rather than characters
    strip_accents='unicode',  # fold accented characters during preprocessing
    min_df=3,                 # drop terms that appear in fewer than 3 comments
    max_df=0.9,               # drop terms that appear in more than 90% of comments
    max_features=10000,       # cap the vocabulary size
)

# The vocabulary and IDF weights are learned from the training comments only;
# the test comments go through the same cleaning and the already-fitted vectorizer.
X_tfidf = vectorizer.fit_transform(X)
test_comments_clean = test['comment_text'].apply(clean_text)
X_test_tfidf = vectorizer.transform(test_comments_clean)

# 5. Train model with increased iterations and adjusted solver
base_classifier = LogisticRegression(
    solver='saga',  # Changed solver
    max_iter=300,  # Increased from 100
    random_state=42,
)

# MultiOutputClassifier fits one copy of the base classifier per column of y.
model = MultiOutputClassifier(base_classifier)
model.fit(X_tfidf, y)

# 6. Make predictions
# For a MultiOutputClassifier, predict_proba returns one array per label
# (rows = test comments, columns = classes), ordered like y.columns.
predictions = model.predict_proba(X_test_tfidf)

# 7. Create submission file: the id column followed by one probability column per label
submission = pd.DataFrame({'id': test['id']})
for column, label_probabilities in zip(y.columns, predictions):
    # Column 1 is the probability of the positive class (assuming 0/1 labels);
    # slicing the array avoids a Python-level loop over every test row.
    submission[column] = label_probabilities[:, 1]

# 8. Save submission
submission.to_csv('submission.csv', index=False)

In [3]:
# Save Model and Vectorizer
import pickle

# Persist the fitted classifier and the fitted vectorizer; both are needed to
# score new text. NOTE: only unpickle these files from a trusted source.
for pickle_path, fitted_object in (
    ('toxic_model.pkl', model),
    ('toxic_vectorizer.pkl', vectorizer),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_object, f)