# CYBERBULLYING DETECTION IN TWEETS

### Import Numpy and PANDAS 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#import os
#print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

### Read the dataset into a PANDAS dataframe

In [2]:
df=pd.read_csv(r'../input/new_cleaned_data.csv', index_col=0)

### Display 1st five records of the dataframe

In [3]:
df.head()

Unnamed: 0,is_offensive,new_text
0,0,go village pump suggest change language rfc set
1,1,anti greek nationalis wikipedia hi alexikoua y...
2,1,dis hoe wasnt dis violent lottery ticket
3,0,better atabay helping banned vandals pushing pov
4,0,camelcase sicko camelcase camelcase rule r bal...


### Import statements

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Convert 'new_text' column to string and store its values in 'texts' variable, store 'is_offensive' column values in 'y' variable

In [5]:
texts = df['new_text'].astype(str)
y = df['is_offensive']

### Remove stopwords from tweets, tokenize the texts to give a sparsed matrix representation

In [6]:
vectorizer = CountVectorizer(stop_words='english', min_df=0.0001)
X = vectorizer.fit_transform(texts)

### Split the dataset into 70% training data and 30% testing data

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
texts_train, texts_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=1)

### SVM Model training

In [8]:
model = LinearSVC(class_weight="balanced", dual=False, tol=1e-2, max_iter=1e5)
model.fit(texts_train, y_train) 

LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=100000.0,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
          verbose=0)

### Predictions of SVM 

In [9]:
pred=model.predict(texts_test) 

### Confusion Matrix for SVM

In [13]:
print(confusion_matrix(y_test, pred))

[[42731  1561]
 [ 1175  9840]]


### Accuracy of SVM Model

In [14]:
accuracy_score(y_test, pred)

0.9505306742365343

### Precision, Recall and F1-Score for SVM

In [15]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred, target_names=targets))


              precision    recall  f1-score   support

     class 0       0.97      0.96      0.97     44292
     class 1       0.86      0.89      0.88     11015

    accuracy                           0.95     55307
   macro avg       0.92      0.93      0.92     55307
weighted avg       0.95      0.95      0.95     55307



### Training of Multinomial Naive Bayes Model

In [16]:
from sklearn.naive_bayes import MultinomialNB
model1=MultinomialNB()
model1.fit(texts_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Predictions for Naive Bayes

In [17]:
pred1=model1.predict(texts_test)

### Confusion Matrix for Naive Bayes

In [18]:
print(confusion_matrix(y_test, pred1))

[[41700  2592]
 [ 1254  9761]]


### Accuracy for Naive Bayes

In [19]:
accuracy_score(y_test, pred1)

0.9304608819860054

### Precision, recall and F1-Score for Naive Bayes

In [20]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred1, target_names=targets))


              precision    recall  f1-score   support

     class 0       0.97      0.94      0.96     44292
     class 1       0.79      0.89      0.84     11015

    accuracy                           0.93     55307
   macro avg       0.88      0.91      0.90     55307
weighted avg       0.93      0.93      0.93     55307



### Training of K Nearest Neighbours Model

In [21]:
from sklearn.neighbors import KNeighborsClassifier
model2=KNeighborsClassifier(n_neighbors=9, algorithm='auto')
model2.fit(texts_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

### Predictions for KNN

In [22]:
pred2=model2.predict(texts_test)

### Confusion Matrix for KNN

In [23]:
print(confusion_matrix(y_test, pred2))

[[43774   518]
 [ 3898  7117]]


### Accuracy for KNN

In [24]:
accuracy_score(y_test, pred2)

0.92015477245195

### Precision, Recall and F1-Score for KNN

In [25]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred2, target_names=targets))

              precision    recall  f1-score   support

     class 0       0.92      0.99      0.95     44292
     class 1       0.93      0.65      0.76     11015

    accuracy                           0.92     55307
   macro avg       0.93      0.82      0.86     55307
weighted avg       0.92      0.92      0.91     55307



### Training of Adaboost Classifier

In [26]:
from sklearn.ensemble import AdaBoostClassifier 
model3=AdaBoostClassifier(n_estimators=200)
model3.fit(texts_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=200, random_state=None)

### Predictions for Adaboost

In [27]:
pred3=model3.predict(texts_test)

### Confusion Matrix of Adaboost

In [28]:
print(confusion_matrix(y_test, pred3))

[[43894   398]
 [ 2539  8476]]


### Accuracy for Adaboost

In [29]:
accuracy_score(y_test, pred3)

0.9468964145587357

### Precision, Recall and F1-Score for Adaboost

In [30]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred3, target_names=targets))

              precision    recall  f1-score   support

     class 0       0.95      0.99      0.97     44292
     class 1       0.96      0.77      0.85     11015

    accuracy                           0.95     55307
   macro avg       0.95      0.88      0.91     55307
weighted avg       0.95      0.95      0.94     55307



### Training of Random Forest Classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier
model4=RandomForestClassifier(n_estimators=200)
model4.fit(texts_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Predictions for Random Forest

In [32]:
pred4=model4.predict(texts_test)

### Confusion Matrix of Random Forest

In [33]:
print(confusion_matrix(y_test, pred4))

[[43590   702]
 [ 1933  9082]]


### Accuracy of Random Forest

In [34]:
accuracy_score(y_test, pred4)

0.9523568445223932

### Precision, Recall and F1-Score of Random Forest

In [35]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred4, target_names=targets))

              precision    recall  f1-score   support

     class 0       0.96      0.98      0.97     44292
     class 1       0.93      0.82      0.87     11015

    accuracy                           0.95     55307
   macro avg       0.94      0.90      0.92     55307
weighted avg       0.95      0.95      0.95     55307

