In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from nltk.corpus import stopwords
import string
import nltk

In [11]:
# Download stopwords
nltk.download('stopwords')

# Load dataset and take only 2 lakh entries
df = pd.read_csv('HateSpeechDatasetBalanced.csv').sample(n=100000, random_state=42)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df.Label == 1]
df_minority = df[df.Label == 0]

# Check the number of samples in the minority class
n_minority_samples = len(df_minority)

# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=min(len(df_majority), n_minority_samples),     # to match minority class
                                   random_state=42) # reproducible results

# Combine minority class with downsampled majority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Update the dataframe
df = df_balanced

In [13]:
import re


# Preprocess text data
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\s+', ' ', text)  # Remove double spaces
    text = ''.join([char for char in text if char not in string.punctuation])
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['Content'] = df['Content'].apply(preprocess_text)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer


# Remove special characters and punctuation
df['Content'] = df['Content'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Convert all words to lowercase
df['Content'] = df['Content'].apply(lambda x: x.lower())

# Counter Vectorizer
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(df['Content'])

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['Content'])

In [15]:
# Define features and target variable
X = X_tfidf
y = df['Label']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'AdaBoost': AdaBoostClassifier(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier()
}

In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import sparse

# Assuming X_train and X_test are sparse matrices
scaler = StandardScaler(with_mean=False)  # with_mean=False is necessary for sparse matrices

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate models
accuracy_scores = {}
for model_name, model in models.items():
    if model_name == 'Naive Bayes':
        nb_scaler = MinMaxScaler()
        X_train_nb_scaled = nb_scaler.fit_transform(X_train.toarray())
        X_test_nb_scaled = nb_scaler.transform(X_test.toarray())
        model.fit(X_train_nb_scaled, y_train)
        y_pred = model.predict(X_test_nb_scaled)
    else:
        if model_name == 'Logistic Regression':
            model.set_params(max_iter=1000)  # Increase max_iter for Logistic Regression
        if model_name == 'AdaBoost':
            model.set_params(algorithm='SAMME')  # Use SAMME algorithm for AdaBoost
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    # Collect accuracy scores
    accuracy_scores[model_name] = accuracy_score(y_test, y_pred)

# Create a DataFrame to display the results
accuracy_df = pd.DataFrame(list(accuracy_scores.items()), columns=['Model', 'Accuracy'])
print(accuracy_df)



                 Model  Accuracy
0  Logistic Regression   0.67975
1        Random Forest   0.75525
2                  SVM   0.71700
3             AdaBoost   0.67175
4                  KNN   0.50225
5          Naive Bayes   0.75725
6        Decision Tree   0.71750


In [18]:
# Multi-Modal Ensemble (Hard Voting)
ensemble_model = VotingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svm', SVC()),
    ('ada', AdaBoostClassifier()),
    ('knn', KNeighborsClassifier()),
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier())
], voting='hard')


In [20]:
ensemble_model.fit(X_train_scaled, y_train)
y_pred_ensemble = ensemble_model.predict(X_test_scaled)
print(f'Ensemble Model Accuracy: {accuracy_score(y_test, y_pred_ensemble)}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Ensemble Model Accuracy: 0.7535
