In [112]:
import nltk
import pandas as pd
import numpy as np
import re
import string
import math

from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import chi2

from sklearn.metrics import accuracy_score

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [113]:
file_path = "toxicData/train.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [114]:
# Transform the dataframe
df["isToxic"] = df["toxic"]+df["severe_toxic"]
df["isToxic"] = df["isToxic"].apply(lambda x: 1 if x>=1 else -1)
df.drop(columns = ["id","toxic","severe_toxic","obscene","threat","insult","identity_hate"],inplace=True)
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,comment_text,isToxic
0,Explanation\nWhy the edits made under my usern...,-1
1,D'aww! He matches this background colour I'm s...,-1
2,"Hey man, I'm really not trying to edit war. It...",-1
3,"""\nMore\nI can't make any real suggestions on ...",-1
4,"You, sir, are my hero. Any chance you remember...",-1


In [115]:
X_train, X_test, y_train, y_test = train_test_split(df["comment_text"],df["isToxic"],test_size=0.25,shuffle=True)

In [116]:
# Remove Excess White Spaces
X_train = X_train.str.replace("\n", " ")
# Remove Punctuations
X_train = X_train.str.translate(str.maketrans('', '',string.punctuation))
# Keep English Letters
X_train = X_train.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
# Remove UTC
X_train = X_train.str.replace("UTC","")
# Remove Month
pattern = r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\b"
X_train = X_train.str.replace(pattern, "", regex=True, case=False)
# Lowercase all Letters
X_train = X_train.str.lower()
# Remove Links
pattern = r"\bhttp\S*\b"
X_train = X_train.str.replace(pattern, "", regex=True, case=False)
# Remove Stop Words
stop_words = set(stopwords.words('english'))
X_train = X_train.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [117]:
X_train.head()

78125                     tide laundry detergent delete jan
61116                                         speak america
79933     peace walk course satish kumar companion used ...
116124    change change newer stop support syrian govern...
75406     partially idea squashed deformed wikipedia log...
Name: comment_text, dtype: object

In [118]:
vectorizer = CountVectorizer(binary=True)
X_train_counts = vectorizer.fit_transform(X_train)

chi2_scores, p_values = chi2(X_train_counts, y_train)
feature_scores = pd.DataFrame({
    'Feature': vectorizer.get_feature_names_out(),
    'Chi2 Score': chi2_scores,
    'P-Value': p_values
})
feature_scores = feature_scores.sort_values(by='Chi2 Score', ascending=False)

In [119]:
print("Top Features by Chi2 Score:")
feature_scores.head()

Top Features by Chi2 Score:


Unnamed: 0,Feature,Chi2 Score,P-Value
58179,fuck,16153.814986,0.0
58214,fucking,10657.988328,0.0
138309,shit,7346.764213,0.0
10447,ass,4850.583048,0.0
17206,bitch,4661.37909,0.0


In [120]:
significantFeatures = feature_scores[feature_scores["P-Value"] <= 0.1]
significantFeatures.head()

Unnamed: 0,Feature,Chi2 Score,P-Value
58179,fuck,16153.814986,0.0
58214,fucking,10657.988328,0.0
138309,shit,7346.764213,0.0
10447,ass,4850.583048,0.0
17206,bitch,4661.37909,0.0


In [121]:
filteredFeatures = feature_scores[
    feature_scores['P-Value'] <= 0.1
].nlargest(1000, 'Chi2 Score')['Feature'].tolist()

filteredVectorizer = CountVectorizer(vocabulary=filteredFeatures,binary=True)
X_train_filtered = filteredVectorizer.transform(X_train)

In [122]:
nTrain = X_train_filtered.shape[0]
X_test = filteredVectorizer.transform(X_test)

In [123]:
weights = np.ones(nTrain) / nTrain

In [124]:
weights_list = weights.tolist()

def calculateError(weak_classifier, X, y):
    predictions = weak_classifier.predict(X)
    misclassified_indices = np.where(y != predictions)[0]
    error_t = sum(weights_list[i] for i in misclassified_indices) / sum(weights_list)

    accuracy = accuracy_score(y, predictions)
    print(f'Accuracy: {accuracy}')
    
    return error_t, predictions

def gym(alpha_t, y_true, y_pred):
    global weights_list
    y_true = np.array(y_true)
    y_pred = np.array(y_pred) 

    for i in range(len(weights_list)):
        weights_list[i] *= math.exp(-alpha_t * y_true[i] * y_pred[i])

    weight_sum = sum(weights_list)
    weights_list = [weight / weight_sum for weight in weights_list]

def deathlyBoosted(X, y, weak_classifier):
    weak_classifier.fit(X, y, sample_weight=weights_list)
    error_t, predictions = calculateError(weak_classifier, X, y)
    alpha_t = 0.5 * math.log((1 - error_t) / error_t)

    gym(alpha_t, y, predictions)

### SVM, Decision Trees, Logistic Regression

In [79]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights)

In [80]:
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)

Accuracy: 0.9044937248282893


In [81]:
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

In [82]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.98872808703354


In [83]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.09550627517171076


In [84]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.6415882618359264


In [85]:
test_predictions = logisModel.predict(X_test)

In [86]:
accuracy = accuracy_score(y_test, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6421928659163262


### SVM, Logistic Regression, Decision Tees

In [16]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights_list)
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

Accuracy: 0.903557880312171


In [17]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.6200304149467738


In [18]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.9921205234044687


In [19]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.923996056083825


In [20]:
test_predictions = decisionTree.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8398215225728825


### Logistic Regression, SVM, Decision Tees

In [38]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.9043516769999499


In [40]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights_list)
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

Accuracy: 0.9043516769999499


In [41]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.9883604338307792


In [42]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.9253831113487859


In [43]:
test_predictions = decisionTree.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8464642919810493


### Logistic Regression, Decision Tees, SVM

In [58]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.9044770133190728


In [59]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.9869984458296429


In [61]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights_list)
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

Accuracy: 0.4066912882902455


In [62]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights_list)
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

Accuracy: 0.9044770133190728


In [63]:
test_predictions = svmModel.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9031910360213571


### Decision Tees, Logistic Regression, SVM

In [125]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.993440732632564


In [126]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.8127141162118351


In [127]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights_list)
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

Accuracy: 0.903808552950417


In [128]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights_list)
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

Accuracy: 0.09619144704958305


In [129]:
test_predictions = svmModel.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.0948035996290076


### Decision Tees, SVM, Logistic Regression

In [107]:
decisionTree =  DecisionTreeClassifier(random_state=42)
deathlyBoosted(X_train_filtered, y_train, decisionTree)

Accuracy: 0.99369140527081


In [108]:
svmModel = SVC(kernel='linear')
svmModel.fit(X_train_filtered, y_train, sample_weight=weights_list)
error_t,pred= calculateError(svmModel, X_train_filtered, y_train)
alpha_t = 0.5 * math.log((1 - error_t) / error_t)
gym(alpha_t, y_train, pred)

Accuracy: 0.9041511388893531


In [109]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.6064690252176674


In [110]:
logisModel = LogisticRegression()
deathlyBoosted(X_train_filtered, y_train, logisModel)

Accuracy: 0.9041511388893531


In [111]:
test_predictions = logisModel.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9041686511418043
