In [1]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset files read in later cells
# are addressed by paths under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## About the Dataset
This is a YouTube comments dataset scraped using the [YouTube Data API](https://developers.google.com/youtube/v3/docs/comments). Currently, the dataset consists of roughly **863K** comments. The dataset can be accessed [here](https://drive.google.com/drive/folders/1-9OnYbFuiSA0M7skOYche5erzZ4Ba5Nz?usp=sharing).
<br><br>
The dataset currently has text in _English, Hindi and Hinglish_.
Please refer to the following information for an overview of the columns and the corresponding data stored within them. [Link](https://github.com/aatmanvaidya/Sentiment-Analysis-of-Online-Harassment-Towards-Women-Wrestlers/blob/scraper/attributes.txt).

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from collections import Counter

In [4]:
%%time
# Load the cleaned comments table from the Feather file on the mounted Drive.
df = pd.read_feather(r'/content/gdrive/My Drive/youtube_dataset/comments_cleaned_feather.feather')

In [5]:
# Preview the first rows to check the available columns and the kind of text they hold.
df.head()

Unnamed: 0,videoId,textDisplay,textOriginal,authorDisplayName,authorProfileImageUrl,authorChannelUrl,authorChannelId,canRate,viewerRating,likeCount,publishedAt,updatedAt,parentId,commentId
0,EBOKYsWUhvI,Dub maro jaato 😢😢.<br>Tumse tumare ladkiya nhi...,Dub maro jaato 😢😢.\nTumse tumare ladkiya nhi b...,HECTOR OF TROY,https://yt3.ggpht.com/ytc/AOPolaQNP5bd7gNvbAas...,http://www.youtube.com/channel/UC5G8fjqoiFIqHp...,{'value': 'UC5G8fjqoiFIqHpKyVeOTsFg'},True,none,0,2023-07-06T07:04:02Z,2023-07-06T07:04:02Z,,UgwbyvIkkAhUdaCFpcp4AaABAg
1,EBOKYsWUhvI,Haar gye bechare,Haar gye bechare,Geeta Saini,https://yt3.ggpht.com/8XmOPNrT3Vy3wr0fItlWbaMk...,http://www.youtube.com/channel/UCsMGRdH3YHrbs2...,{'value': 'UCsMGRdH3YHrbs21NOraRuyQ'},True,none,0,2023-07-03T22:21:37Z,2023-07-03T22:21:37Z,,Ugyz3OwSXamho91-8I94AaABAg
2,EBOKYsWUhvI,Dhamki mili pahalwano ko aur sab manage kr liy...,Dhamki mili pahalwano ko aur sab manage kr liy...,Ayaan Chouhan,https://yt3.ggpht.com/ytc/AOPolaREH2WnrnbD53OI...,http://www.youtube.com/channel/UC_dWuNh6zydTHI...,{'value': 'UC_dWuNh6zydTHIRr6hi3Omg'},True,none,0,2023-07-03T05:06:33Z,2023-07-03T05:06:33Z,,Ugyphs1TT1Yoj7MZBVJ4AaABAg
3,EBOKYsWUhvI,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Kamal,https://yt3.ggpht.com/ytc/AOPolaTRSm_dEOKj9H82...,http://www.youtube.com/channel/UCVdGObpHM-IMHB...,{'value': 'UCVdGObpHM-IMHB_b7_K-7rA'},True,none,1,2023-07-02T03:39:50Z,2023-07-02T03:39:50Z,,UgwnIxpuFAcKsEzffBp4AaABAg
4,EBOKYsWUhvI,Pahlwan jante h kuch nhi kr payenge uska,Pahlwan jante h kuch nhi kr payenge uska,jagriti tiwari upp,https://yt3.ggpht.com/ytc/AOPolaRrFnzD2i3N_3rk...,http://www.youtube.com/channel/UCRkcewHFhxE5Kf...,{'value': 'UCRkcewHFhxE5KfLHrLx0wpA'},True,none,0,2023-06-28T15:10:36Z,2023-06-28T15:10:36Z,,UgxjuOlDigsmuDu53J54AaABAg


In [6]:
# (number of rows, number of columns) of the loaded table.
df.shape

(863146, 14)

## Simple Slur List Frequency Count
This is a simple frequency count of how many words from the slur list are found in the data.
I have stored the results in a dictionary called _'slur_counts'_.

In [24]:
# Read the slur list, one entry per line, into `slur_words`.
# The encoding is pinned to UTF-8 so non-ASCII entries decode the same way on
# every platform instead of depending on the locale default.
with open('/content/gdrive/My Drive/youtube_dataset/slur_list.txt', 'r', encoding='utf-8') as file:
    # Strip surrounding whitespace/newlines and skip blank lines so that the
    # empty string never ends up in the list as a bogus "slur".
    slur_words = [word.strip() for word in file if word.strip()]

In [25]:
# slur_words

In [26]:
# Set version of the slur list: removes duplicate entries and makes the
# per-word membership checks in the counting step fast.
slur_words_set = set(slur_words)

In [27]:
# Maps each slur word found in the comments to its tally; populated by the
# counting loop in the following cell.
slur_counts = {}

In [28]:
# Count every occurrence of each slur-list word across all comments.
#
# Each whitespace-separated token is tallied individually, so a word that is
# repeated inside one comment contributes once per repetition (intersecting
# set(words) with the slur set would count it only once per comment).
#
# The counter is created here rather than reused from an earlier cell so that
# re-running this cell recomputes the totals instead of adding to them.
#
# Limitation: matching is exact and case-sensitive on whitespace-split tokens,
# so "word," or "Word" will not match a list entry "word".
slur_counts = Counter()
for text in df['textOriginal']:
    # Missing comments can show up as None or as another non-string
    # placeholder (e.g. NaN); only real strings can be tokenised.
    if not isinstance(text, str):
        continue
    slur_counts.update(word for word in text.split() if word in slur_words_set)

In [29]:
# Rank the tallies from most to least frequent. Dicts preserve insertion
# order, so the resulting mapping can be read top-down as a leaderboard.
sorted_slur_counts = {
    word: count
    for word, count in sorted(
        slur_counts.items(), key=lambda pair: pair[1], reverse=True
    )
}

In [33]:
# sorted_slur_counts

In [30]:
# One thing is clear from this, all the explicit words