## Imports 

In [10]:
from hashlib import sha1
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sklearn 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)

from stop_words import get_stop_words 
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline                

## Load Cyberbullying Tweets csv file

In [2]:
tweets_df = pd.read_csv("data/cyberbullying_tweets.csv")
tweets_df.head()
stop_words_set = set(get_stop_words('en'))



In [3]:
tweets_df["cyberbullying_type"].value_counts()
 

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

## EDA

### Step 1: Modify Label if the value is 'not_cyberbullying' then 0, otherwise 1

In [4]:
tweets_df["cyberbullying_type"] = tweets_df["cyberbullying_type"].apply(lambda x: 0 if x == "not_cyberbullying" else 1)
tweets_df["cyberbullying_type"].value_counts()

cyberbullying_type
1    39747
0     7945
Name: count, dtype: int64

### Step 2: Each `tweet_text` Filtering (Also, Read **Further Notice**)
<hr>

`tweet_text` contains emoji or any other languages. To make model easily, we are going to stick to have only English.

[**Further Notice**] As we know, if we remove the any emoji or any other languages, we might lose useful information. Then, we might need to change this process in the future.

We also want to make all the text lowercase, remove stop words, and remove all links!

In [5]:
tweets_df["tweet_text"] = tweets_df["tweet_text"].apply(lambda x: re.sub(r'http\S+', '', x))
tweets_df["tweet_text"] = tweets_df["tweet_text"].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
tweets_df["tweet_text"] = tweets_df["tweet_text"].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]+', '', x))
tweets_df["tweet_text"] = tweets_df["tweet_text"].apply(lambda x: x.lower())
tweets_df["tweet_text"] = tweets_df["tweet_text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words_set]))

#Check this below
tweets_df["tweet_text"][21]


'kids love mohamad bin zayed city'

In [6]:
X = tweets_df["tweet_text"]
y = tweets_df["cyberbullying_type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=123)


## Preprocessing

### Logistic Regression

In [7]:
# 2. Create a vectorizer object with stop_words = "english"
countvec = CountVectorizer(stop_words="english")

# 3. Create a logisticRegression object
lr = LogisticRegression(max_iter=1000, random_state=123)

# 4. Make a pipeline object
pipe = make_pipeline(countvec, lr)

# 5. Store the mean values of cross-validation scores
cv_score = cross_val_score(pipe, X_train, y_train).mean()
cv_score

0.8585647712993836

### SVM RBF

In [8]:
# 2. Create a vectorizer object with stop_words = "english"
countvec = CountVectorizer(stop_words="english")

# 3. Create a SVM RBF object
svm = SVC(kernel="rbf", C=10, gamma=0.1)

# 4. Make a pipeline object
pipe = make_pipeline(countvec, svm)

# 5. Store the mean values of cross-validation scores
cv_score = cross_val_score(pipe, X_train, y_train).mean()
cv_score

0.860557100124335

### RandomForest

In [9]:
# 2. Create a vectorizer object with stop_words = "english"
countvec = CountVectorizer(stop_words="english")

# 3. Create a RandomForest object
rf = RandomForestClassifier(n_jobs=-1, random_state=123)

# 4. Make a pipeline object
pipe = make_pipeline(countvec, rf)

# 5. Store the mean values of cross-validation scores
cv_score = cross_val_score(pipe, X_train, y_train).mean()
cv_score

0.8587740301372303