In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Rewrite the Google Drive "view" share link as a direct-download link
# that pandas can read as a CSV.
url = 'https://drive.google.com/file/d/1F5KnTS79MODxHypKVMB6hz1ZqUZyn0Cy/view?usp=sharing'
# The file id is the second-to-last path segment of the share link.
file_id = url.rsplit('/', 2)[-2]
dwn_url = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(dwn_url)
print(df.head())

   CommentId  ...                                 translated_value
0     411618  ...                                     steevejoseph
1     411619  ...                                       Aghnashini
2     411620  ...  ayyo pavam kuchu enthelum patelum pattiyo entho
3     411621  ...                                             Brag
4     411622  ...                                         Reponguz

[5 rows x 12 columns]


In [None]:
# Remove every column that is not used further down in the notebook.
df = df.drop(
    [
        'CommentId',
        'commentText',
        'language',
        'user_index',
        'post_index',
        'report_count_comment',
        'report_count_post',
        'like_count_comment',
        'like_count_post',
        'commentCleaned',
    ],
    axis='columns',
)

In [None]:
# Preview the first rows of the reduced frame.
df.head(5)

Unnamed: 0,label,translated_value
0,0.0,steevejoseph
1,0.0,Aghnashini
2,0.0,ayyo pavam kuchu enthelum patelum pattiyo entho
3,0.0,Brag
4,0.0,Reponguz


Spelling correction (not applied yet: the cells below only strip and lower-case the text)

In [None]:
import textblob

In [None]:
def preprocess_data(df):
    """Return a copy of ``df`` with a normalised ``lower`` text column.

    The ``translated_value`` column is stripped of leading/trailing
    whitespace and lower-cased; the result is stored in a new column
    named ``lower``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column ``translated_value``.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus ``lower``. The input
        frame is left untouched so re-running the calling cell is safe.
    """
    # Work on a copy: adding the column to the caller's frame would silently
    # change a DataFrame that earlier cells have already displayed.
    result = df.copy()

    # Convert text to lowercase
    result['lower'] = result['translated_value'].str.strip().str.lower()
    return result

In [None]:
# Run the text normalisation step on the loaded frame.
dff = df.pipe(preprocess_data)

In [None]:
# Replace the original text with its stripped, lower-cased version.
# This assigns into dff in place.
dff['translated_value']=dff['lower']

In [None]:
# Check that translated_value now matches the lower-cased helper column.
dff.head(5)

Unnamed: 0,label,translated_value,lower
0,0.0,steevejoseph,steevejoseph
1,0.0,aghnashini,aghnashini
2,0.0,ayyo pavam kuchu enthelum patelum pattiyo entho,ayyo pavam kuchu enthelum patelum pattiyo entho
3,0.0,brag,brag
4,0.0,reponguz,reponguz


In [None]:
# The helper column has been copied into translated_value, so drop it.
dff = dff.drop('lower', axis=1)

In [None]:
# Preview the cleaned frame (label + lower-cased text).
dff.head(5)

Unnamed: 0,label,translated_value
0,0.0,steevejoseph
1,0.0,aghnashini
2,0.0,ayyo pavam kuchu enthelum patelum pattiyo entho
3,0.0,brag
4,0.0,reponguz


In [None]:
# 80/20 split with a fixed seed. The whole frame (label column included)
# is passed as the "features", and the labels are passed again as a list.
X_temp, X_test, y_temp, y_test = train_test_split(
    dff,
    dff['label'].tolist(),
    test_size=0.2,
    random_state=0,
)

In [None]:
# Sanity check: frame shapes and label-list lengths of both partitions.
(
    X_test.shape,
    X_temp.shape,
    len(y_test),
    len(y_temp),
)

((26153, 2), (104612, 2), 26153, 104612)

In [None]:
# Write the split's label list into the 'label' column of the larger partition.
X_temp = X_temp.assign(label=y_temp)

In [None]:
# Preview the shuffled 80% partition.
X_temp.head(5)

Unnamed: 0,label,translated_value
29677,0.0,sup you
29254,0.0,kou gharar from
27234,1.0,baihu temple is eating
5709,0.0,nice
51176,0.0,want beautiful tanutu hi


In [None]:
# Rows of the 80% partition whose label is 0, and how many there are.
zero = X_temp.loc[X_temp['label'].eq(0)]
len(zero)

72900

In [None]:
# Rows of the 80% partition whose label is 1, and how many there are.
one = X_temp.loc[X_temp['label'].eq(1)]
len(one)

31712

In [None]:
# Stack the two class subsets row-wise: all label-0 rows first, then label-1 rows.
ds = pd.concat((zero, one))
ds

Unnamed: 0,label,translated_value
29677,0.0,sup you
29254,0.0,kou gharar from
5709,0.0,nice
51176,0.0,want beautiful tanutu hi
9961,0.0,nicesuper
...,...,...
21243,1.0,thu muduka thu
45891,1.0,gand digi rindi
42613,1.0,nhi sewa to puri karo galiya bhi khao bahu hu ...
43567,1.0,hey matherchod randi


In [None]:
# testdf is a second name for X_test (no copy is made), so the assignment
# below also changes X_test.
testdf = X_test
# X_test was split from a frame that already had a 'label' column; this
# overwrites it with the label list returned by the same split.
# NOTE(review): presumably redundant - confirm before removing.
testdf['label'] = y_test
testdf.shape

(26153, 2)

In [None]:
# Number of label-0 and label-1 rows in the 20% partition.
tuple(len(testdf[testdf['label'] == cls]) for cls in (0, 1))

(18212, 7941)

In [None]:
# Rebuild the full data set from its three parts (label-0 rows, label-1 rows,
# then the 20% partition). Building it from the parts rather than from `ds`
# itself keeps the cell idempotent: re-running it no longer appends the
# test rows a second time.
# NOTE(review): this recombines both partitions of the earlier split, so
# that split only changes the row order - confirm this is intended.
ds = pd.concat([zero, one, testdf], axis=0)

In [None]:
# Shape of the label-0 and label-1 subsets of the combined frame.
tuple(ds[ds['label'] == cls].shape for cls in (0, 1))

((91112, 2), (39653, 2))

In [None]:
# Collect the comment texts as a plain list of strings.
# The previous row-by-row loop read positional column 0 of each row, which
# is the 'label' column, so the "corpus" was filled with labels instead of
# text. Reading the text column directly fixes that and avoids the slow
# per-row .iloc access.
corpus = ds['translated_value'].tolist()

# Creating TF-IDF

In [None]:
# TF-IDF representation of every comment, limited to 1000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on `ds`, which contains the rows
# that are later held out for testing, so vocabulary/IDF statistics leak
# from the test rows - consider fitting on the training rows only.
vectorizer1 = TfidfVectorizer(max_features=1000)
X1 = vectorizer1.fit_transform(ds['translated_value'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use the replacement when available and fall back on older versions.
if hasattr(vectorizer1, 'get_feature_names_out'):
    feature_names1 = list(vectorizer1.get_feature_names_out())
else:
    feature_names1 = vectorizer1.get_feature_names()

# Dense 2-D array instead of a nested Python list: the resulting DataFrame
# is the same, but this avoids creating one Python float object per cell.
denselist1 = X1.toarray()
df2_train = pd.DataFrame(denselist1, columns=feature_names1)



In [None]:
# tdf refers to the same object as df2_train, so the new column is added to both.
tdf = df2_train
# Labels are attached by position (as a plain list), under the name 'labelxyz'.
tdf['labelxyz'] = ds['label'].tolist()

In [None]:
# Separate the feature matrix by class.
tdf_zero = tdf.loc[tdf['labelxyz'] == 0]
tdf_zero.shape  # not rendered: only the last expression of a cell is displayed

tdf_one = tdf.loc[tdf['labelxyz'] == 1]
tdf_one.shape

(39653, 1001)

In [None]:
# Seeded 90% sample of the label-0 rows for training; the rows that were
# not sampled become the label-0 test rows.
X_train_zero = tdf_zero.sample(random_state=0, frac=0.9)
X_test_zero = tdf_zero.drop(index=X_train_zero.index)

In [None]:
# Seeded 90% sample of the label-1 rows for training; the rows that were
# not sampled become the label-1 test rows.
X_train_one = tdf_one.sample(random_state=0, frac=0.9)
X_test_one = tdf_one.drop(index=X_train_one.index)

In [None]:
# Training set: both class samples stacked, then split into features / labels.
X_train_df = pd.concat((X_train_zero, X_train_one))
y_train = X_train_df['labelxyz'].tolist()
tfdf_train = X_train_df.drop(columns='labelxyz')

# Test set: the held-out rows of both classes, split the same way.
# Note that y_test is rebound here, replacing the list from the earlier split.
X_test_df = pd.concat((X_test_zero, X_test_one))
y_test = X_test_df['labelxyz'].tolist()
tfdf_test = X_test_df.drop(columns='labelxyz')

# Building ML models

In [None]:
# Short names of the classifiers evaluated below.
model = [
    'LR',
    'DT',
    'GB',
    'RF',
    'KN',
]
# One list of accuracies per feature representation; each model cell appends to it.
accuracy = {representation: [] for representation in ('TF-IDF', 'BOW')}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import seaborn as sns

## Logistic Regression

In [None]:
# TF-IDF features: L2-regularised logistic regression (liblinear solver).
regressor_LR_tf = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
regressor_LR_tf.fit(tfdf_train, y_train)
y_predict_LR_tf = regressor_LR_tf.predict(tfdf_test)

# Record the test accuracy for this model.
a = regressor_LR_tf.score(tfdf_test, y_test)
accuracy['TF-IDF'].append(a)

# Confusion matrix, per-class report and overall accuracy, in that order.
for report in (metrics.confusion_matrix,
               metrics.classification_report,
               metrics.accuracy_score):
    print(report(y_test, y_predict_LR_tf))

[[8536  575]
 [1647 2318]]
              precision    recall  f1-score   support

         0.0       0.84      0.94      0.88      9111
         1.0       0.80      0.58      0.68      3965

    accuracy                           0.83     13076
   macro avg       0.82      0.76      0.78     13076
weighted avg       0.83      0.83      0.82     13076

0.830070357907617


## Decision Tree

In [None]:
# TF-IDF features: decision tree.
# random_state is fixed (same seed as the data split) so that the scores
# are reproducible across Restart & Run All.
model_DT_tf = DecisionTreeClassifier(random_state=0)
model_DT_tf.fit(tfdf_train, y_train)
y_predict_DT_tf = model_DT_tf.predict(tfdf_test)

# Accuracy from the predictions already computed; this is the value
# .score() returns, without a second prediction pass over the test set.
a = metrics.accuracy_score(y_test, y_predict_DT_tf)
accuracy['TF-IDF'].append(a)

print(metrics.confusion_matrix(y_test, y_predict_DT_tf))
print(metrics.classification_report(y_test, y_predict_DT_tf))
print(a)

[[8196  915]
 [1756 2209]]
              precision    recall  f1-score   support

         0.0       0.82      0.90      0.86      9111
         1.0       0.71      0.56      0.62      3965

    accuracy                           0.80     13076
   macro avg       0.77      0.73      0.74     13076
weighted avg       0.79      0.80      0.79     13076

0.7957326399510554


## Gradient Boosting

In [None]:
# TF-IDF features: gradient boosting with default hyper-parameters.
# random_state is fixed (same seed as the data split) so that the scores
# are reproducible across Restart & Run All.
model_GB_tf = GradientBoostingClassifier(random_state=0)
model_GB_tf.fit(tfdf_train, y_train)
y_predict_GB_tf = model_GB_tf.predict(tfdf_test)

# Accuracy from the predictions already computed; this is the value
# .score() returns, without a second prediction pass over the test set.
a = metrics.accuracy_score(y_test, y_predict_GB_tf)
accuracy['TF-IDF'].append(a)
print(metrics.confusion_matrix(y_test, y_predict_GB_tf))
print(metrics.classification_report(y_test, y_predict_GB_tf))
print(a)

[[8948  163]
 [2873 1092]]
              precision    recall  f1-score   support

         0.0       0.76      0.98      0.85      9111
         1.0       0.87      0.28      0.42      3965

    accuracy                           0.77     13076
   macro avg       0.81      0.63      0.64     13076
weighted avg       0.79      0.77      0.72     13076

0.7678189048638727


## Random Forest

In [None]:
# TF-IDF features: random forest with 10 trees.
# random_state is fixed (same seed as the data split) so that the bootstrap
# samples and feature draws - and therefore the scores - are reproducible.
model_RF_tf = RandomForestClassifier(n_estimators=10, random_state=0)
model_RF_tf.fit(tfdf_train, y_train)
y_predict_RF_tf = model_RF_tf.predict(tfdf_test)

# Accuracy from the predictions already computed; this is the value
# .score() returns, without a second prediction pass over the test set.
a = metrics.accuracy_score(y_test, y_predict_RF_tf)
accuracy['TF-IDF'].append(a)

print(metrics.confusion_matrix(y_test, y_predict_RF_tf))
print(metrics.classification_report(y_test, y_predict_RF_tf))
print(a)

[[8402  709]
 [1761 2204]]
              precision    recall  f1-score   support

         0.0       0.83      0.92      0.87      9111
         1.0       0.76      0.56      0.64      3965

    accuracy                           0.81     13076
   macro avg       0.79      0.74      0.76     13076
weighted avg       0.81      0.81      0.80     13076

0.8111043132456409


In [None]:
# TF-IDF features: random forest with 50 trees.
# This rebinds model_RF_tf / y_predict_RF_tf from the 10-tree run.
# random_state is fixed (same seed as the data split) so that the bootstrap
# samples and feature draws - and therefore the scores - are reproducible.
model_RF_tf = RandomForestClassifier(n_estimators=50, random_state=0)
model_RF_tf.fit(tfdf_train, y_train)
y_predict_RF_tf = model_RF_tf.predict(tfdf_test)

# Accuracy from the predictions already computed; this is the value
# .score() returns, without a second prediction pass over the test set.
a = metrics.accuracy_score(y_test, y_predict_RF_tf)
# NOTE(review): this is a second Random Forest entry, so accuracy['TF-IDF']
# ends up with one more value than `model` has names - confirm how the
# cells that consume `accuracy` are meant to line the two up.
accuracy['TF-IDF'].append(a)

print(metrics.confusion_matrix(y_test, y_predict_RF_tf))
print(metrics.classification_report(y_test, y_predict_RF_tf))
print(a)

[[8349  762]
 [1632 2333]]
              precision    recall  f1-score   support

         0.0       0.84      0.92      0.87      9111
         1.0       0.75      0.59      0.66      3965

    accuracy                           0.82     13076
   macro avg       0.80      0.75      0.77     13076
weighted avg       0.81      0.82      0.81     13076

0.816916488222698


## K-Nearest Neighbors

In [None]:
# TF-IDF features: 3-nearest-neighbours, Euclidean metric, distance-weighted votes.
model_KN_tf = KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance')
model_KN_tf.fit(tfdf_train, y_train)
y_predict_KN_tf = model_KN_tf.predict(tfdf_test)

# Accuracy from the predictions already computed. Calling .score() would
# repeat the whole neighbour search over the test set, which is the
# expensive part of k-NN, to arrive at the same number.
a = metrics.accuracy_score(y_test, y_predict_KN_tf)
accuracy['TF-IDF'].append(a)
print(metrics.confusion_matrix(y_test, y_predict_KN_tf))
print(metrics.classification_report(y_test, y_predict_KN_tf))
print(a)