# Cyber Bullying

In [1]:
import os
import nltk
nltk.download('stopwords')
import pandas as pd
import numpy as np
import tensorflow.compat.v1 as tf
import re
tf.disable_v2_behavior()
from autocorrect import spell
import string 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yosef\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
from typing import List

import nltk
import os
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from enchant.checker import SpellChecker
# from nltk.stem import WordNetLemmatizer
# import re
import swifter


def merge_datasets(data_path='../Data/Source1') -> pd.DataFrame:
    """Read every file in ``data_path`` as CSV and stack the ``Text``/``oh_label`` columns.

    :param data_path: directory whose entries are all passed to ``pd.read_csv``.
    :return: a frame with the columns ``Text`` (cast to ``str``) and ``oh_label``
        and a fresh ``0..n-1`` index. When the directory holds no files the
        frame is empty but still has both columns.
    :raises KeyError: if one of the files lacks ``Text`` or ``oh_label``.
    """
    columns = ['Text', 'oh_label']
    # Sorted so that the row order does not depend on the OS directory listing order.
    frames = [pd.read_csv(os.path.join(data_path, filename))[columns]
              for filename in sorted(os.listdir(data_path))]
    if frames:
        # Concatenate once instead of re-copying the accumulated frame per file.
        df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)
    df["Text"] = df["Text"].astype(str)
    print("database merged successfully!")
    return df


def add_punctuation_stopwords_curse_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add one relative-frequency column per punctuation mark, stop word and curse.

    Each feature is ``text.count(token) / len(text)``: a plain substring count
    divided by the character length of the text. The columns are written into
    ``df`` itself (named after the token) and ``df`` is returned.

    :param df: frame with a ``Text`` column.
    :return: the same frame, with the feature columns added.
    """
    def get_curses():
        # One curse per line of "english_curse.csv", resolved against the working directory.
        with open("english_curse.csv") as curses_file:
            stripped = (line.replace("\n", "") for line in curses_file)
            # Blank lines are skipped: "" would otherwise become a feature, and
            # ``s.count("")`` is ``len(s) + 1`` for every text.
            return [curse for curse in stripped if curse]

    def relative_count(text, token):
        # Empty text has no occurrences; this also avoids a ZeroDivisionError.
        return text.count(token) / len(text) if text else 0.0

    features = list(string.punctuation) + list(stopwords.words('english')) + get_curses()
    # Cast once rather than once per feature column.
    texts = df['Text'].astype(str)
    for ch in features:
        df[ch] = texts.apply(relative_count, args=(ch,))
    print("add_punctuation_and_stopwords_features successfully!")
    return df


def add_count_misspell_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Append ``misspell_count``: items flagged by the enchant checker per character of text.

    :param df: frame with a ``Text`` column.
    :return: a new frame equal to ``df`` plus the ``misspell_count`` column.
    """
    # Imported locally under an alias: a later cell of this notebook rebinds the
    # global name ``SpellChecker`` to a different library's class.
    from enchant.checker import SpellChecker as EnchantSpellChecker

    def helper(data: str) -> float:
        if not data:
            # Nothing to check, and len(data) == 0 would raise ZeroDivisionError.
            return 0.0
        checker = EnchantSpellChecker("en_US", data)
        # Iterating the checker yields one item per flagged word; only the count is needed.
        flagged = sum(1 for _ in checker)
        return flagged / len(data)

    # Cast to str like the other text features, so non-string cells do not break len().
    misspell_count = df["Text"].astype(str).swifter.apply(helper).rename("misspell_count")
    df = pd.concat([df, misspell_count], axis=1)
    print("add_count_misspell_feature successfully!")
    return df


def add_avg_word_len_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Append ``avg_word_len``: the mean character length of the NLTK word tokens of each text.

    :param df: frame with a ``Text`` column (cast to ``str`` before tokenising).
    :return: a new frame equal to ``df`` plus the ``avg_word_len`` column.
    """
    def mean_token_length(text):
        token_lengths = pd.Series(nltk.word_tokenize(text)).map(len)
        return token_lengths.mean()

    texts = df["Text"].astype(str)
    avg_word_len = texts.swifter.apply(mean_token_length).rename("avg_word_len")
    df = pd.concat([df, avg_word_len], axis=1)
    print("add_avg_word_len_feature successfully!")
    return df


def add_avg_sentence_len_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Append the mean number of word tokens per sentence of each text.

    The new column is named ``sentence_count`` even though it holds an average
    sentence length.

    :param df: frame with a ``Text`` column (cast to ``str`` before tokenising).
    :return: a new frame equal to ``df`` plus the ``sentence_count`` column.
    """
    def mean_words_per_sentence(text):
        sentences = pd.Series(nltk.sent_tokenize(text))
        words_per_sentence = sentences.map(lambda sent: len(nltk.word_tokenize(sent)))
        return words_per_sentence.mean()

    texts = df["Text"].astype(str)
    sentence_count = texts.swifter.apply(mean_words_per_sentence).rename("sentence_count")
    df = pd.concat([df, sentence_count], axis=1)
    print("add_avg_sentence_len_feature successfully!")
    return df


def add_uppercase_count_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Append ``uppercase_count``: the share of characters of each text that are in ``A-Z``.

    :param df: frame with a string ``Text`` column.
    :return: a new frame equal to ``df`` plus the ``uppercase_count`` column
        (NaN for an empty text, where the ratio is 0/0).
    """
    text = df["Text"]
    ratio = text.str.count(r'[A-Z]') / text.str.len()
    # Name the *result* of the division: dividing two differently named Series
    # yields an unnamed Series, which concat would turn into a column called 0.
    uppercase_count = ratio.rename("uppercase_count")
    df = pd.concat([df, uppercase_count], axis=1)
    print("add_uppercase_count_feature successfully!")
    return df


def add_pos_features(df: pd.DataFrame) -> pd.DataFrame:
    """Append the share of noun / verb / adverb / adjective tags among the grouped tags of each text.

    Tags outside the four groups map to ``None`` and are dropped by
    ``value_counts``, so the shares are normalised over grouped tags only.
    Groups that never occur in a text are filled with 0.

    :param df: frame with a ``Text`` column.
    :return: a new frame equal to ``df`` plus one column per group that occurs.
    """
    tag_groups = {"noun": ['NN', 'NNS', 'NNP', 'NNPS'], "verb": ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
                  "adverb": ['RB', 'RBR', 'RBS'], "adjective": ['JJ', 'JJR', 'JJS']}

    def group_pos(tag):
        # First group that lists the tag, or None when no group does.
        return next((name for name, members in tag_groups.items() if tag in members), None)

    def group_shares(text):
        tags = pd.Series([tag for _, tag in nltk.pos_tag(nltk.word_tokenize(text))])
        return tags.apply(group_pos).value_counts(normalize=True).copy()

    features = df["Text"].swifter.apply(group_shares)
    print("add_pos_features successfully!")
    features = features.fillna(0)
    return pd.concat([df, features], axis=1)


def to_one_hot_rep(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Build a bag-of-words count table for ``df[col_name]`` and attach the label.

    Despite the name, the cells hold token counts from ``CountVectorizer``,
    not 0/1 indicators.

    :param df: frame holding the text column and an ``oh_label`` column.
    :param col_name: name of the text column to vectorise.
    :return: one column per vocabulary token plus ``oh_label``, on a fresh ``0..n-1`` index.
    """
    cv = CountVectorizer()
    counts = cv.fit_transform(df[col_name])
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # fall back to it only when the replacement is missing.
    if hasattr(cv, "get_feature_names_out"):
        vocabulary = cv.get_feature_names_out()
    else:
        vocabulary = cv.get_feature_names()
    data_cv = pd.DataFrame(counts.toarray(), columns=vocabulary)
    # Assign by position: index-aligned assignment would give NaN labels when
    # df does not carry a default 0..n-1 index.
    data_cv["oh_label"] = df["oh_label"].to_numpy()
    return data_cv


# def filter_noise(df: pd.DataFrame) -> pd.DataFrame:
#     res = df.sum(axis=0)
#     res = res[res > res.median()]
#     ls = res.index.to_list()
#     del ls[0]
#     return df[df.columns.intersection(ls)]

def bug_fix(df:pd.DataFrame,ignored_columns:List[str])->pd.DataFrame:
    """Repair ``uppercase_count`` and z-score every feature column.

    ``uppercase_count`` is divided by the text length (written back into ``df``),
    then every column outside ``ignored_columns`` is standardised as
    ``(x - mean) / std``.

    :param df: frame with ``Text``, ``uppercase_count`` and numeric feature columns.
    :param ignored_columns: columns passed through unchanged (placed first in the result).
    :return: a new frame with the ignored columns followed by the standardised features.
    """
    print("loaded!")
    text_lengths = df["Text"].str.len()
    df["uppercase_count"] = df["uppercase_count"] / text_lengths
    print("fix broken col")
    feature_cols = df.drop(columns=ignored_columns)
    centered = feature_cols - feature_cols.mean()
    standardised = centered / feature_cols.std()
    result = pd.concat([df[ignored_columns], standardised], axis=1)
    print("normalized")
    return result


def preprocess(train_part=0.7, use_cache=True, cleaned_output_path="../Data/cleaned.csv", seed=None,
               save_cache=False) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, pd.DataFrame):
    """Build (or load) the feature table and split it randomly into train and test parts.

    :param train_part: fraction of rows placed in the training split.
    :param use_cache: when True and ``cleaned_output_path`` exists, load it
        instead of recomputing the features.
    :param cleaned_output_path: CSV file used as the feature cache.
    :param seed: seed for the split; ``None`` keeps using NumPy's global random state.
    :param save_cache: when True, write freshly computed features to
        ``cleaned_output_path``. Off by default, so nothing is written to disk.
    :return: ``(x_train, y_train, x_test, y_test, df)`` where ``df`` is the full
        feature frame with NaNs replaced by 0.
    """
    print("preprocess...")
    label_name = "oh_label"
    ignored_columns = ["Text", label_name]
    if use_cache and os.path.isfile(cleaned_output_path):
        df = pd.read_csv(cleaned_output_path, index_col=0)
    else:
        df = merge_datasets()
        df = add_pos_features(df)
        df = add_punctuation_stopwords_curse_features(df)
        df = add_uppercase_count_feature(df)
        df = add_avg_word_len_feature(df)
        df = add_count_misspell_feature(df)
        df = add_avg_sentence_len_feature(df)
        if save_cache:
            df.to_csv(cleaned_output_path)
            # Only reported when the file was actually written.
            print("Saved")
    df = df.fillna(0)
    x = df.drop(ignored_columns, axis=1).values
    y = df[label_name].values
    num_rows = x.shape[0]
    # The np.random module and a RandomState instance both expose choice().
    rng = np.random if seed is None else np.random.RandomState(seed)
    mask_train = np.zeros(num_rows, dtype=bool)
    mask_train[rng.choice(num_rows, int(num_rows * train_part), replace=False)] = True
    print(mask_train.shape, x.shape, y.shape)
    return x[mask_train, :], y[mask_train], x[~mask_train, :], y[~mask_train], df


In [3]:
# Run the feature pipeline (use_cache=True loads the cached CSV when it exists)
# and keep both the random train/test split and the full feature frame.
x_train, y_train, x_test, y_test,df = preprocess(use_cache=True)
df

preprocess...
(448880,) (448880, 595) (448880,)


Unnamed: 0,Text,oh_label,adjective,adverb,noun,verb,!,"""",#,$,...,xx,xxx,yaoi,yellow showers,yiffy,zoophilia,ðŸ–•,uppercase_count,avg_word_len,misspell_count
0,`- This is not ``creative``. Those are the di...,0.0,0.704151,0.130975,-0.511120,0.113224,-0.141856,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,-0.097900,0.000670,-0.534211
1,` :: the term ``standard model`` is itself le...,0.0,0.611175,0.069942,-0.154997,-0.234684,-0.141856,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,-0.113050,-0.041753,-0.579315
2,"True or false, the situation as of March 200...",0.0,0.431937,-1.116819,0.881958,-0.694260,-0.141856,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,-0.080581,0.001535,-0.471277
3,"Next, maybe you could work on being less cond...",0.0,-0.516089,1.165731,-1.184439,1.098284,-0.141856,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,-0.125048,0.040398,-0.364195
4,This page will need disambiguation.,0.0,-1.334839,-1.116819,1.081539,0.176231,-0.141856,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,-0.436211,0.252417,-0.671879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448875,She pretty I love this song I miss the old kel...,1.0,-0.091552,-0.466926,0.291530,-0.020666,-0.141856,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,-0.083129,-0.073559,1.369299
448876,Status-Online Im ZxkillergirlzX! I'm Zxkillerg...,0.0,0.903077,-1.116819,0.702335,-0.768874,2.907767,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,0.564378,0.009215,1.866509
448877,JR so cute EXO M Better I agree like yeah yeah...,0.0,-0.317604,-0.407845,-0.125021,0.605824,0.144184,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,0.073139,-0.144876,0.328092
448878,! !,0.0,-1.334839,-1.116819,-2.710506,-2.186531,37.615373,-0.122218,-0.175769,-0.031636,...,-0.025241,-0.013459,-0.004278,0.0,0.0,-0.003293,0.0,-8.686155,-0.643590,-0.671879


In [4]:
# Number of columns that still contain at least one NaN.
len(df.columns[df.isna().any()].tolist())

0

In [13]:
import tensorflow.compat.v1 as tf
import numpy as np
from typing import List

tf.disable_v2_behavior()


class LogisticRegression:
    """Binary logistic regression trained with mini-batch gradient descent (TensorFlow 1 graph mode).

    The model is built and trained inside ``__init__``; call :meth:`predict` afterwards.
    """

    def __init__(self, X_train: np.ndarray, y_train: np.ndarray, num_iter=5000, learning_rate=0.001, batch_size=100,
                 print_step=1000):
        """
        :param X_train: 2-D feature array, one row per sample (not modified).
        :param y_train: labels, one per row of ``X_train`` (not modified).
        :param num_iter: number of gradient steps, one mini-batch each.
        :param learning_rate: step size of the gradient-descent optimizer.
        :param batch_size: rows per step; -1 (or anything above the row count) means all rows.
        :param print_step: currently unused; kept so existing callers keep working.
        :raises ValueError: if the row counts differ, there are no rows, or
            ``batch_size`` is neither positive nor -1.
        """
        self.sess = tf.Session()
        features = X_train.shape[1]
        eps = 1e-12  # keeps log() finite when the sigmoid saturates at 0 or 1
        self.x = tf.placeholder(tf.float32, [None, features])
        y_train_variable = tf.placeholder(tf.float32, [None, 1])
        W = tf.Variable(tf.zeros([features, 1]))
        b = tf.Variable(tf.zeros([1]))
        self.y = 1 / (1.0 + tf.exp(-(tf.matmul(self.x, W) + b)))
        # Weighted cross entropy: the positive term is weighted 0.2 and the negative term 0.8.
        # NOTE(review): these weights down-weight the positive class — confirm this is intended.
        loss = tf.reduce_mean(-(0.2 * y_train_variable * tf.log(self.y + eps) + 0.8 * (1 - y_train_variable) * tf.log(
            1 - self.y + eps)))
        update = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)  # TODO: check other optimizers
        self.sess.run(tf.global_variables_initializer())

        # Shuffle features and labels with the SAME permutation. Shuffling only
        # X_train in place would detach every row from its label and scramble
        # the caller's array.
        X_shuffled, y_shuffled = self._shuffle_together(X_train, y_train)
        rows_num = X_shuffled.shape[0]
        batch_size = self._effective_batch_size(batch_size, rows_num)
        # A trailing partial batch is never visited, as before.
        batches_per_epoch = rows_num // batch_size
        for i in range(num_iter):
            start = (i % batches_per_epoch) * batch_size
            X_batch = X_shuffled[start:start + batch_size]
            Y_batch = y_shuffled[start:start + batch_size].reshape((-1, 1))
            self.sess.run(update, feed_dict={self.x: X_batch, y_train_variable: Y_batch})

    @staticmethod
    def _shuffle_together(X, y):
        """Return copies of ``X`` and ``y`` reordered by one shared random permutation."""
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X_train and y_train must have the same number of rows")
        order = np.random.permutation(X.shape[0])
        return X[order], y[order]

    @staticmethod
    def _effective_batch_size(batch_size, rows_num):
        """Resolve the requested batch size to a value in ``1..rows_num``."""
        if rows_num <= 0:
            raise ValueError("X_train must contain at least one row")
        if batch_size == -1 or batch_size > rows_num:
            return rows_num
        if batch_size <= 0:
            raise ValueError("batch_size must be positive or -1")
        return batch_size

    def predict(self, X_test, thr=0.5):
        """Return a column of 0/1 predictions: 1 where the predicted probability is >= ``thr``."""
        probabilities = self.sess.run(self.y, feed_dict={self.x: X_test})
        return (probabilities >= thr).astype(probabilities.dtype)


class MLP:
    """
    Multi-layer perceptron implementation using TensorFlow version 1.

    The first layer uses ReLU and every later layer (including the output) a
    sigmoid. The network is built and trained inside ``__init__``; call
    :meth:`predict` afterwards.
    """

    def __init__(self, X_train: np.ndarray, y_train, layers_sizes: List[int], learning_rate=0.001, num_iter=5000,
                 batch_size=100,
                 print_step=100):
        """
        :param X_train: 2-D feature array, one row per sample (not modified).
        :param y_train: labels, one per row of ``X_train`` (not modified).
        :param layers_sizes: hidden layer widths; must contain at least one entry.
        :param learning_rate: step size of the gradient-descent optimizer.
        :param num_iter: number of gradient steps, one mini-batch each.
        :param batch_size: rows per step; -1 (or anything above the row count) means all rows.
        :param print_step: currently unused; kept so existing callers keep working.
        :raises ValueError: if ``layers_sizes`` is empty, the row counts differ,
            there are no rows, or ``batch_size`` is neither positive nor -1.
        """
        if not layers_sizes:
            # Without a hidden layer the output would be the ReLU layer itself,
            # which is not a probability.
            raise ValueError("layers_sizes must contain at least one hidden layer size")
        self.sess = tf.Session()
        features = X_train.shape[1]
        eps = 1e-12  # keeps log() finite when the sigmoid saturates at 0 or 1
        self.x = tf.placeholder(tf.float32, [None, features])
        y_train_variable = tf.placeholder(tf.float32, [None, 1])
        layers_sizes = [features] + list(layers_sizes) + [1]
        W = []
        b = []
        for fan_in, fan_out in zip(layers_sizes[:-1], layers_sizes[1:]):
            # Random initialisation breaks symmetry. With all-zero weights the
            # ReLU layer outputs 0 and passes back no gradient, so the hidden
            # weights never move.
            W.append(tf.Variable(tf.random.truncated_normal([fan_in, fan_out], stddev=float(np.sqrt(2.0 / fan_in)))))
            b.append(tf.Variable(tf.zeros([fan_out])))
        # Feed-forward pass.
        prev_output = tf.nn.relu(tf.matmul(self.x, W[0]) + b[0])
        for layer_w, layer_b in zip(W[1:], b[1:]):
            prev_output = 1 / (1.0 + tf.exp(-(tf.add(tf.matmul(prev_output, layer_w), layer_b))))
        self.y = prev_output
        loss = tf.reduce_mean(-(y_train_variable * tf.log(self.y + eps) + (1 - y_train_variable) * tf.log(
            1 - self.y + eps)))  # cross entropy
        update = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)  # TODO: check other optimizers
        self.sess.run(tf.global_variables_initializer())

        # Shuffle features and labels with the SAME permutation. Shuffling only
        # X_train in place would detach every row from its label and scramble
        # the caller's array.
        X_shuffled, y_shuffled = self._shuffle_together(X_train, y_train)
        rows_num = X_shuffled.shape[0]
        batch_size = self._effective_batch_size(batch_size, rows_num)
        # A trailing partial batch is never visited, as before.
        batches_per_epoch = rows_num // batch_size
        for i in range(num_iter):
            start = (i % batches_per_epoch) * batch_size
            X_batch = X_shuffled[start:start + batch_size]
            Y_batch = y_shuffled[start:start + batch_size].reshape((-1, 1))
            self.sess.run(update, feed_dict={self.x: X_batch, y_train_variable: Y_batch})

    @staticmethod
    def _shuffle_together(X, y):
        """Return copies of ``X`` and ``y`` reordered by one shared random permutation."""
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X_train and y_train must have the same number of rows")
        order = np.random.permutation(X.shape[0])
        return X[order], y[order]

    @staticmethod
    def _effective_batch_size(batch_size, rows_num):
        """Resolve the requested batch size to a value in ``1..rows_num``."""
        if rows_num <= 0:
            raise ValueError("X_train must contain at least one row")
        if batch_size == -1 or batch_size > rows_num:
            return rows_num
        if batch_size <= 0:
            raise ValueError("batch_size must be positive or -1")
        return batch_size

    def predict(self, X_test, thr=0.5):
        """Return a column of 0/1 predictions: 1 where the network output is >= ``thr``."""
        probabilities = self.sess.run(self.y, feed_dict={self.x: X_test})
        return (probabilities >= thr).astype(probabilities.dtype)


In [20]:
# from sklearn.metrics import classification_report
# logistic = LogisticRegression(x_train,y_train,num_iter=500000)
# predictions = logistic.predict(x_test)
# print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.87      1.00      0.93    117492
         1.0       0.17      0.00      0.00     17172

    accuracy                           0.87    134664
   macro avg       0.52      0.50      0.47    134664
weighted avg       0.78      0.87      0.81    134664



In [None]:
# Train the MLP with three hidden layers of 5 units each (all other
# hyper-parameters at their defaults) and report per-class metrics on the test split.
from sklearn.metrics import classification_report
mlp = MLP(x_train,y_train,[5,5,5])
mlp_predictions = mlp.predict(x_test)
print(classification_report(y_test, mlp_predictions))

## Data cleaning

In [10]:
# Load the raw YouTube dataset; this rebinds `df`, which held the feature frame above.
# NOTE(review): this path starts at "./Data" while merge_datasets/preprocess use
# "../Data" — confirm which working directory this cell expects.
df = pd.read_csv("./Data/Source1/youtube_parsed_dataset.csv")
df

Unnamed: 0,index,UserIndex,Text,Number of Comments,Number of Subscribers,Membership Duration,Number of Uploads,Profanity in UserID,Age,oh_label
0,0,X1,Does N.e.bodyelse Hear her Crazy ass Screamin ...,10,1,3,3,0,15,0
1,1,X2,There are so many things that are incorrect wi...,3,0,6,5,0,31,0
2,2,X3,3:26 hahah my boyfriend showed this song to me...,7,0,3,5,0,43,1
3,3,X2218,dick beyonce fuck y a ass hole you are truely ...,34,0,3,5,0,44,1
4,4,X5,DongHaeTaemin and Kai ;A; luhansehun and bacon...,11,173,5,5,0,21,0
...,...,...,...,...,...,...,...,...,...,...
3459,3464,X3465,She pretty I love this song I miss the old kel...,15,2,4,7,0,23,1
3460,3465,X3466,Status-Online Im ZxkillergirlzX! I'm Zxkillerg...,4,28,4,23,1,15,0
3461,3466,X3467,JR so cute EXO M Better I agree like yeah yeah...,23,0,5,3,0,33,0
3462,3467,X3468,! !,5,0,6,5,0,38,0


In [11]:
# Keep only the comment text and its label.
# Reassigning instead of using inplace=True leaves the frame object loaded by
# the previous cell untouched.
METADATA_COLUMNS = [
    'index',
    "UserIndex",
    "Number of Comments",
    "Number of Subscribers",
    "Membership Duration",
    "Number of Uploads",
    "Profanity in UserID",
    "Age",
]
df = df.drop(columns=METADATA_COLUMNS)
df

Unnamed: 0,Text,oh_label
0,Does N.e.bodyelse Hear her Crazy ass Screamin ...,0
1,There are so many things that are incorrect wi...,0
2,3:26 hahah my boyfriend showed this song to me...,1
3,dick beyonce fuck y a ass hole you are truely ...,1
4,DongHaeTaemin and Kai ;A; luhansehun and bacon...,0
...,...,...
3459,She pretty I love this song I miss the old kel...,1
3460,Status-Online Im ZxkillergirlzX! I'm Zxkillerg...,0
3461,JR so cute EXO M Better I agree like yeah yeah...,0
3462,! !,0


In [12]:
# Peek at one sample: the text and the label of the row labelled 1.
(df.loc[1, "Text"], df.loc[1, "oh_label"])

("There are so many things that are incorrect with your comment it's unbelievable. Guns don't kill people. A gun doesn't get up off a table and then shoot someone. He's not the reason soldiers are at war he's merely showing us weapons and explaining how they work and whether he likes them or not. You blame him for violent video games...Why would this guy mess around with video games when he has real weapons he can use and practice with. I can't say much more coz I don't have more space. Retard. This is all we need an Australian version of 1 Direction.... my dick was bleeding from how hard I was masturbating to this",
 0)

In [13]:
# Aliased on import: the name `SpellChecker` is already bound to
# enchant.checker.SpellChecker, which add_count_misspell_feature looks up at call time.
from spellchecker import SpellChecker as PySpellChecker
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('words')
nltk.download('wordnet')
# Module-level resources used by process_row.
words = set(nltk.corpus.words.words())
# Note: this rebinds `spell`, which the first cell imported from autocorrect.
spell = PySpellChecker()
lemmatizer = WordNetLemmatizer()

def process_row(row):
    """Normalise one comment into a space-separated string of known, non-stop-word tokens.

    Steps: strip ASCII punctuation, drop tokens containing digits, replace
    typographic quotes and ellipses by spaces, lower-case, lemmatise,
    spell-correct, then keep only tokens that are in the module-level ``words``
    set and are not English stop words.

    :param row: raw comment text.
    :return: the cleaned text (possibly an empty string).
    """
    row = row.translate(str.maketrans('', '', string.punctuation))
    # Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
    row = re.sub(r'\w*\d\w*', ' ', row)  # remove words with numbers
    row = re.sub('[‘’“”…]', ' ', row)
    row = row.lower()
    # A set gives constant-time membership tests in the final filter.
    stop_words = set(stopwords.words('english'))
    row = " ".join(lemmatizer.lemmatize(w) for w in nltk.wordpunct_tokenize(row))  # lemmatize
    # correction() can return None when it has no candidate; keep the original
    # token then, so that join() never receives None.
    row = " ".join(spell.correction(w) or w for w in nltk.wordpunct_tokenize(row))  # spell checker
    # Keep words that exist in the corpus and are not stop words.
    return " ".join(w for w in nltk.wordpunct_tokenize(row) if w in words and w not in stop_words)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\eviat\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eviat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Clean every comment with process_row.
# Note: this overwrites the raw "Text" column, so re-running the cell feeds
# already-cleaned text back through process_row.
df["Text"] = df["Text"].apply(process_row)
df

Unnamed: 0,Text,oh_label
0,doe hear crazy hoe say stupid hoe nobody see b...,0
1,many thing incorrect comment unbelievable gun ...,0
2,song love seizure u corgi training want rape,1
3,dick hole dog bitch look like dick god make si...,1
4,kai bacon ad kai henry dabba love rapping yeah...,0
...,...,...
3459,pretty love song miss old kelly yes love song ...,1
3460,band band practice,0
3461,cute better agree like yeah yeah yeah wow wow ...,0
3462,,0


In [15]:
# Bag-of-words table: one column per vocabulary token, plus the label.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
word_counts = cv.fit_transform(df.Text)
# get_feature_names() no longer exists in recent scikit-learn releases;
# fall back to it only when the replacement is missing.
feature_names = cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out") else cv.get_feature_names()
data_cv = pd.DataFrame(word_counts.toarray(), columns=feature_names)
data_cv["oh_label"] = df["oh_label"]
data_cv


Unnamed: 0,abandon,abandoned,abase,abate,abb,abbas,abbey,abbreviation,abdicate,abdomen,...,zipper,zo,zombie,zone,zoned,zoning,zoo,zoom,zucchini,oh_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Persist the bag-of-words table (the row index is written as the first CSV column).
data_cv.to_csv("./Data/Source1/Youtube_Clean.csv")

In [19]:
# Reload the saved table. No index_col is given, so the saved index comes back
# as an ordinary first column (presumably named "Unnamed: 0" — TODO confirm).
cleaner = pd.read_csv("./Data/Source1/Youtube_Clean.csv")

In [49]:
# Keep only the columns whose total is above the median column total.
res = cleaner.sum(axis=0)
res = res[res > res.median()]
ls = res.index.to_list()
# Drops the first surviving column name — presumably the re-read index column; verify.
del ls[0]
# NOTE(review): this bare `ls` is not the cell's last expression, so it displays nothing.
ls
pandas_df = cleaner[cleaner.columns.intersection(ls)]
pandas_df

Unnamed: 0,abandoned,ability,able,abomination,abortion,absolute,absolutely,absorbed,abuse,abusive,...,zeppelin,zero,zest,zip,zo,zombie,zone,zoo,zoom,oh_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Hold out 33% of the rows for testing; random_state fixes the split.
# Note: this rebinds y_train / y_test from the earlier preprocess() split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(pandas_df.drop('oh_label', axis=1), pandas_df["oh_label"], test_size=0.33, random_state=42)

AttributeError: 'NoneType' object has no attribute 'drop'

In [None]:
# scikit-learn baseline on the bag-of-words features.
# Aliased on import so that it does not replace the TensorFlow-based
# LogisticRegression class defined earlier in this notebook.
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
logmodel = SklearnLogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

In [10]:
# Per-class precision / recall / F1 for `predictions` against the held-out labels.
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91       998
           1       0.25      0.11      0.15       146

    accuracy                           0.84      1144
   macro avg       0.56      0.53      0.53      1144
weighted avg       0.80      0.84      0.82      1144



## Basic FF NN Model

## RNN Models