In [3]:
# Importing necessary libraries
import pickle
import re

import nltk
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.corpus import wordnet
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Load the dataset
df = pd.read_csv("datasets.csv")

# Drop unnecessary columns (labels that are not modelled, plus the row id)
df = df.drop(columns=['id', 'threat', 'identity_hate', 'severe_toxic'])

# Filter and balance the dataset
# A row counts as non-toxic only when all three modelled labels equal 0.
all_labels_zero = df[['toxic', 'obscene', 'insult']].eq(0).all(axis=1)

# Cap the non-toxic rows at 20000 (seeded sample) so they do not swamp the
# toxic rows; smaller sets are kept whole and in their original order.
df_non_toxic = df[all_labels_zero]
if len(df_non_toxic) > 20000:
    df_non_toxic = df_non_toxic.sample(n=20000, random_state=42)

# Every remaining row has at least one non-zero label: the exact complement
# of the mask above.
df_toxic = df[~all_labels_zero]
df_combined = pd.concat([df_toxic, df_non_toxic])

# Download NLTK resources
# Fetched one by one, in this order; the tokenizing, tagging, lemmatizing and
# stop-word code further down presumably relies on them being installed.
for nltk_resource in (
    'punkt',
    'omw-1.4',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
):
    nltk.download(nltk_resource)

# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans and lemmatizes raw text.

    Each document goes through these steps:

    1. every character that is not an ASCII letter is replaced by a space;
    2. the text is tokenized and POS-tagged;
    3. each token is lemmatized with the WordNet lemmatizer, using the part
       of speech derived from its tag;
    4. the lemmas are joined back into one space-separated string.

    Letter case is left untouched. The transformer learns nothing from the
    data, so ``fit`` is a no-op.

    The class reads the module-level name ``wordnet``
    (``from nltk.corpus import wordnet``), which must be imported before
    ``transform`` is called.
    """

    def __init__(self):
        self.wordnet_lemmatizer = WordNetLemmatizer()
        # Not consulted by prepare_text; kept so existing instances and
        # pickles keep the same attributes.
        self.stopwords = set(nltk_stopwords.words('english'))

    def get_wordnet_pos(self, treebank_tag):
        """Map a POS tag to the matching WordNet part-of-speech constant.

        Args:
            treebank_tag: tag string as produced by ``pos_tag``.

        Returns:
            ``wordnet.ADJ`` for tags starting with ``J``, ``wordnet.VERB``
            for ``V``, ``wordnet.ADV`` for ``R``, and ``wordnet.NOUN`` for
            everything else (nouns included).
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # Nouns and every unrecognised tag fall back to NOUN.
        return wordnet.NOUN

    def prepare_text(self, text):
        """Return ``text`` as a space-joined string of lemmas.

        Args:
            text: a single document as a string.

        Returns:
            The lemmatized document; an empty string when no tokens remain.
        """
        # Digits, punctuation and any non-ASCII character become separators.
        letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
        tagged_tokens = pos_tag(word_tokenize(letters_only))
        lemmas = [
            self.wordnet_lemmatizer.lemmatize(token, pos=self.get_wordnet_pos(tag))
            for token, tag in tagged_tokens
        ]
        return ' '.join(lemmas)

    def fit(self, X, y=None):
        """No-op; present to satisfy the scikit-learn transformer protocol."""
        return self

    def transform(self, X, y=None):
        """Apply ``prepare_text`` to every document.

        Args:
            X: a pandas Series of strings, or any iterable of strings that
               ``pd.Series`` accepts (list, tuple, array).
            y: ignored.

        Returns:
            A pandas Series of lemmatized strings. When ``X`` is already a
            Series its index is preserved.
        """
        # Plain lists/arrays have no .apply; wrap them so callers need not.
        documents = X if isinstance(X, pd.Series) else pd.Series(X)
        return documents.apply(self.prepare_text)

# Initialize the dictionary that maps each label name to its fitted model
models = {}

# Every column other than the raw text is treated as a binary label. Selecting
# by name instead of by position keeps this correct if the column order changes.
label_columns = [column for column in df_combined.columns if column != 'comment_text']

# The split depends only on the number of rows and the seed, so it is the same
# for every label. Split once, with all label columns carried along.
X = df_combined['comment_text']
Y = df_combined[label_columns]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Define the text pipeline. Preprocessing and TF-IDF never look at the labels,
# so they are fitted on the training text once and reused for every label
# instead of repeating the same tagging and lemmatizing work per label.
text_pipeline = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(stop_words='english'))
])
X_train_transformed = text_pipeline.fit_transform(X_train)
X_test_transformed = text_pipeline.transform(X_test)

# Apply SMOTE, train, and evaluate for each label independently
for toxicity_type in label_columns:
    print(f"Applying SMOTE and training for label: {toxicity_type}")

    y_train = Y_train[toxicity_type]
    y_test = Y_test[toxicity_type]

    # Oversample the minority class on the training features only; the test
    # split is never resampled.
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_transformed, y_train)

    # probability=True fits an internal cross-validated calibration whose data
    # shuffling is random; seeding it makes the saved probabilities reproducible.
    svc_model = SVC(kernel='linear', probability=True, random_state=42)
    svc_model.fit(X_resampled, y_resampled)

    # Bundle the already-fitted steps so the stored model accepts raw text.
    models[toxicity_type] = Pipeline(steps=[
        ('text_pipeline', text_pipeline),
        ('classifier', svc_model)
    ])

    # Evaluate the model on the held-out split. Scoring the cached test features
    # gives the same predictions as models[toxicity_type].predict(X_test)
    # without preprocessing the test text again.
    y_pred = svc_model.predict(X_test_transformed)
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score for {toxicity_type}: {f1}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tsheltrimpemo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tsheltrimpemo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tsheltrimpemo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tsheltrimpemo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tsheltrimpemo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Applying SMOTE and training for label: toxic
F1 Score for toxic: 0.8564616118264187
Applying SMOTE and training for label: obscene
F1 Score for obscene: 0.7845855281397283
Applying SMOTE and training for label: insult
F1 Score for insult: 0.711046511627907


In [4]:
# Save the models using pickle
# The whole label -> pipeline dictionary is written as a single object.
with open('model2.pkl', mode='wb') as model_file:
    pickle.dump(models, model_file)

In [12]:
from joblib import dump

# Save the models using joblib
# Same dictionary as the pickle file, written a second time via joblib.
# The call stays the last expression so the cell still shows the written paths.
dump(value=models, filename='model3.pkt')

['model3.pkt']

In [7]:
import pickle

# Load the trained models from the pickle file
# NOTE: the pickled pipelines contain TextPreprocessor instances, so that class
# must already be defined in this session. pickle.load can execute arbitrary
# code; only open files you created yourself.
with open('model2.pkl', mode='rb') as model_file:
    models = pickle.load(model_file)

# Define a function to predict toxicity probabilities
# NOTE(review): this notebook defines predict_toxicity in several cells; the
# most recently executed definition is the one in effect.
def predict_toxicity(new_comment):
    """Score one comment against every model in the global ``models`` dict.

    Args:
        new_comment: the raw comment text.

    Returns:
        dict mapping each key of ``models`` to the second column of that
        classifier's ``predict_proba`` output for the comment.
    """
    # The transformers expect a pandas Series, so wrap the single comment once.
    comment_series = pd.Series([new_comment])

    def second_class_probability(model):
        # Vectorize with this model's own text pipeline, then read column 1
        # of the single-row probability matrix.
        steps = model.named_steps
        features = steps['text_pipeline'].transform(comment_series)
        return steps['classifier'].predict_proba(features)[0][1]

    return {
        toxicity_type: second_class_probability(model)
        for toxicity_type, model in models.items()
    }

# Example usage
# Score one comment and print the probability reported for each label.
new_comment = "Your video content is very relatable"
toxicity_probs = predict_toxicity(new_comment=new_comment)
print("Toxicity Probabilities:", toxicity_probs)

Toxicity Probabilities: {'toxic': 0.06001845505733713, 'obscene': 0.14729501652439078, 'insult': 0.066346275214812}


In [8]:
import pickle

# Load the trained models from the pickle file
# NOTE: the pickled pipelines contain TextPreprocessor instances, so that class
# must already be defined in this session. pickle.load can execute arbitrary
# code; only open files you created yourself.
with open('model2.pkl', mode='rb') as model_file:
    models = pickle.load(model_file)

# Define a function to predict toxicity probabilities
# NOTE(review): this notebook defines predict_toxicity in several cells; the
# most recently executed definition is the one in effect.
def predict_toxicity(new_comment):
    """Score one comment against every model in the global ``models`` dict.

    Args:
        new_comment: the raw comment text.

    Returns:
        dict mapping each key of ``models`` to the second column of that
        classifier's ``predict_proba`` output for the comment.
    """
    # The transformers expect a pandas Series, so wrap the single comment once.
    comment_series = pd.Series([new_comment])

    def second_class_probability(model):
        # Vectorize with this model's own text pipeline, then read column 1
        # of the single-row probability matrix.
        steps = model.named_steps
        features = steps['text_pipeline'].transform(comment_series)
        return steps['classifier'].predict_proba(features)[0][1]

    return {
        toxicity_type: second_class_probability(model)
        for toxicity_type, model in models.items()
    }

# Example usage
# Score one comment and print the probability reported for each label.
new_comment = "Shut up idiot"
toxicity_probs = predict_toxicity(new_comment=new_comment)
print("Toxicity Probabilities:", toxicity_probs)

Toxicity Probabilities: {'toxic': 0.9999999999999699, 'obscene': 0.9625367654895076, 'insult': 0.9999999999188693}


In [9]:
import pickle

# Load the trained models from the pickle file
# NOTE: the pickled pipelines contain TextPreprocessor instances, so that class
# must already be defined in this session. pickle.load can execute arbitrary
# code; only open files you created yourself.
with open('model2.pkl', mode='rb') as model_file:
    models = pickle.load(model_file)

# Define a function to predict toxicity probabilities
# NOTE(review): this notebook defines predict_toxicity in several cells; the
# most recently executed definition is the one in effect.
def predict_toxicity(new_comment):
    """Score one comment against every model in the global ``models`` dict.

    Args:
        new_comment: the raw comment text.

    Returns:
        dict mapping each key of ``models`` to the second column of that
        classifier's ``predict_proba`` output for the comment.
    """
    # The transformers expect a pandas Series, so wrap the single comment once.
    comment_series = pd.Series([new_comment])

    def second_class_probability(model):
        # Vectorize with this model's own text pipeline, then read column 1
        # of the single-row probability matrix.
        steps = model.named_steps
        features = steps['text_pipeline'].transform(comment_series)
        return steps['classifier'].predict_proba(features)[0][1]

    return {
        toxicity_type: second_class_probability(model)
        for toxicity_type, model in models.items()
    }

# Example usage
# Score one comment and print the probability reported for each label.
new_comment = "You are loser"
toxicity_probs = predict_toxicity(new_comment=new_comment)
print("Toxicity Probabilities:", toxicity_probs)

Toxicity Probabilities: {'toxic': 0.9999873724647066, 'obscene': 0.13460504920713032, 'insult': 0.999986419917415}
