## Imports 

In [13]:
from hashlib import sha1
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline

## Load Cyberbullying Tweets csv file

In [14]:
# Read the raw tweets into a DataFrame and preview the first few rows.
csv_path = "data/cyberbullying_tweets.csv"
tweets_df = pd.read_csv(csv_path)
tweets_df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [15]:
# Count how many tweets carry each original label value.
label_counts = tweets_df["cyberbullying_type"].value_counts()
label_counts
 

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

## EDA

### Step 1: Binarize the label: 0 if the value is 'not_cyberbullying', otherwise 1

In [16]:
# Collapse the labels into a binary target: 0 for "not_cyberbullying",
# 1 for every other value.
#
# The column is overwritten in place, so the conversion only runs while the
# column still holds the raw labels. Without this guard, re-running the cell
# would compare the already-converted 0/1 integers against the string and
# relabel every row as 1.
labels = tweets_df["cyberbullying_type"]
if not pd.api.types.is_integer_dtype(labels):
    tweets_df["cyberbullying_type"] = (labels != "not_cyberbullying").astype("int64")
tweets_df["cyberbullying_type"].value_counts()

cyberbullying_type
1    39747
0     7945
Name: count, dtype: int64

### Step 2: Filter each `tweet_text` (see also the **Further Notice** below)
<hr>

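`tweet_text` contains emoji and text in other languages. To keep the model simple, we keep only English text: non-ASCII characters are removed first, and then everything except ASCII letters, digits, and whitespace is removed.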

[**Further Notice**] Removing emoji and text in other languages may discard useful information, so we might need to change this process in the future.

In [17]:
# Patterns for the two clean-up passes, compiled once and reused for every tweet.
non_ascii_run = re.compile(r'[^\x00-\x7F]+')
non_alnum_space_run = re.compile(r'[^a-zA-Z0-9\s]+')


def strip_tweet(text):
    """Delete non-ASCII runs, then runs of anything but ASCII letters, digits, or whitespace."""
    ascii_only = non_ascii_run.sub('', text)
    return non_alnum_space_run.sub('', ascii_only)


tweets_df["tweet_text"] = tweets_df["tweet_text"].map(strip_tweet)
tweets_df["tweet_text"]

# Spot-check a single cleaned tweet, e.g.:
# tweets_df["tweet_text"][21]

0        In other words katandandre your food was crapi...
1        Why is aussietv so white MKR theblock ImACeleb...
2        XochitlSuckkks a classy whore Or more red velv...
3        JasonGio meh P  thanks for the heads up but no...
4        RudhoeEnglish This is an ISIS account pretendi...
                               ...                        
47687    Black ppl arent expected to do anything depend...
47688    Turner did not withhold his disappointment Tur...
47689    I swear to God This dumb nigger bitch I have g...
47690    Yea fuck you RT therealexel IF YOURE A NIGGER ...
47691    Bro U gotta chill RT CHILLShrammy Dog FUCK KP ...
Name: tweet_text, Length: 47692, dtype: object

In [18]:
# Features are the tweet text column; the target is the label column.
X, y = tweets_df["tweet_text"], tweets_df["cyberbullying_type"]

# NOTE(review): test_size=0.8 puts 80% of the rows in the test split and leaves
# only 20% for training. Confirm this is intended and not meant to be 0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=123,
)

## Preprocessing