In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## About the Dataset
This a YouTube comments dataset scrapped using the [YouTube Data API](https://developers.google.com/youtube/v3/docs/comments). Currently the dataset consists of roughly **1.12M** comments. The dataset can be accessed [here](https://drive.google.com/drive/folders/1-9OnYbFuiSA0M7skOYche5erzZ4Ba5Nz?usp=sharing).
<br><br>
The dataset currently has text in _English, Hindi and Hinglish_.
Please refer to the following information for an overview of the columns and the corresponding data stored within them. [Link](https://github.com/aatmanvaidya/Sentiment-Analysis-of-Online-Harassment-Towards-Women-Wrestlers/blob/scraper/attributes.txt).

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from collections import Counter
import re

In [3]:
%%time
df = pd.read_feather(r'/content/gdrive/My Drive/youtube_dataset/comments_cleaned_feather.feather')

CPU times: user 1.47 s, sys: 1.06 s, total: 2.53 s
Wall time: 8.92 s


In [4]:
df.head()

Unnamed: 0,videoId,textDisplay,textOriginal,authorDisplayName,authorProfileImageUrl,authorChannelUrl,authorChannelId,canRate,viewerRating,likeCount,publishedAt,updatedAt,parentId,commentId
0,EBOKYsWUhvI,Dub maro jaato 😢😢.<br>Tumse tumare ladkiya nhi...,Dub maro jaato 😢😢.\nTumse tumare ladkiya nhi b...,HECTOR OF TROY,https://yt3.ggpht.com/ytc/AOPolaQNP5bd7gNvbAas...,http://www.youtube.com/channel/UC5G8fjqoiFIqHp...,{'value': 'UC5G8fjqoiFIqHpKyVeOTsFg'},True,none,0,2023-07-06T07:04:02Z,2023-07-06T07:04:02Z,,UgwbyvIkkAhUdaCFpcp4AaABAg
1,EBOKYsWUhvI,Haar gye bechare,Haar gye bechare,Geeta Saini,https://yt3.ggpht.com/8XmOPNrT3Vy3wr0fItlWbaMk...,http://www.youtube.com/channel/UCsMGRdH3YHrbs2...,{'value': 'UCsMGRdH3YHrbs21NOraRuyQ'},True,none,0,2023-07-03T22:21:37Z,2023-07-03T22:21:37Z,,Ugyz3OwSXamho91-8I94AaABAg
2,EBOKYsWUhvI,Dhamki mili pahalwano ko aur sab manage kr liy...,Dhamki mili pahalwano ko aur sab manage kr liy...,Ayaan Chouhan,https://yt3.ggpht.com/ytc/AOPolaREH2WnrnbD53OI...,http://www.youtube.com/channel/UC_dWuNh6zydTHI...,{'value': 'UC_dWuNh6zydTHIRr6hi3Omg'},True,none,0,2023-07-03T05:06:33Z,2023-07-03T05:06:33Z,,Ugyphs1TT1Yoj7MZBVJ4AaABAg
3,EBOKYsWUhvI,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Kamal,https://yt3.ggpht.com/ytc/AOPolaTRSm_dEOKj9H82...,http://www.youtube.com/channel/UCVdGObpHM-IMHB...,{'value': 'UCVdGObpHM-IMHB_b7_K-7rA'},True,none,1,2023-07-02T03:39:50Z,2023-07-02T03:39:50Z,,UgwnIxpuFAcKsEzffBp4AaABAg
4,EBOKYsWUhvI,Pahlwan jante h kuch nhi kr payenge uska,Pahlwan jante h kuch nhi kr payenge uska,jagriti tiwari upp,https://yt3.ggpht.com/ytc/AOPolaRrFnzD2i3N_3rk...,http://www.youtube.com/channel/UCRkcewHFhxE5Kf...,{'value': 'UCRkcewHFhxE5KfLHrLx0wpA'},True,none,0,2023-06-28T15:10:36Z,2023-06-28T15:10:36Z,,UgxjuOlDigsmuDu53J54AaABAg


In [5]:
df.shape

(1119948, 14)

In [6]:
duplicate_rows = df[df.duplicated(subset='textOriginal', keep=False)]

In [7]:
duplicate_rows

Unnamed: 0,videoId,textDisplay,textOriginal,authorDisplayName,authorProfileImageUrl,authorChannelUrl,authorChannelId,canRate,viewerRating,likeCount,publishedAt,updatedAt,parentId,commentId
0,EBOKYsWUhvI,Dub maro jaato 😢😢.<br>Tumse tumare ladkiya nhi...,Dub maro jaato 😢😢.\nTumse tumare ladkiya nhi b...,HECTOR OF TROY,https://yt3.ggpht.com/ytc/AOPolaQNP5bd7gNvbAas...,http://www.youtube.com/channel/UC5G8fjqoiFIqHp...,{'value': 'UC5G8fjqoiFIqHpKyVeOTsFg'},True,none,0,2023-07-06T07:04:02Z,2023-07-06T07:04:02Z,,UgwbyvIkkAhUdaCFpcp4AaABAg
1,EBOKYsWUhvI,Haar gye bechare,Haar gye bechare,Geeta Saini,https://yt3.ggpht.com/8XmOPNrT3Vy3wr0fItlWbaMk...,http://www.youtube.com/channel/UCsMGRdH3YHrbs2...,{'value': 'UCsMGRdH3YHrbs21NOraRuyQ'},True,none,0,2023-07-03T22:21:37Z,2023-07-03T22:21:37Z,,Ugyz3OwSXamho91-8I94AaABAg
2,EBOKYsWUhvI,Dhamki mili pahalwano ko aur sab manage kr liy...,Dhamki mili pahalwano ko aur sab manage kr liy...,Ayaan Chouhan,https://yt3.ggpht.com/ytc/AOPolaREH2WnrnbD53OI...,http://www.youtube.com/channel/UC_dWuNh6zydTHI...,{'value': 'UC_dWuNh6zydTHIRr6hi3Omg'},True,none,0,2023-07-03T05:06:33Z,2023-07-03T05:06:33Z,,Ugyphs1TT1Yoj7MZBVJ4AaABAg
3,EBOKYsWUhvI,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Kamal,https://yt3.ggpht.com/ytc/AOPolaTRSm_dEOKj9H82...,http://www.youtube.com/channel/UCVdGObpHM-IMHB...,{'value': 'UCVdGObpHM-IMHB_b7_K-7rA'},True,none,1,2023-07-02T03:39:50Z,2023-07-02T03:39:50Z,,UgwnIxpuFAcKsEzffBp4AaABAg
4,EBOKYsWUhvI,Pahlwan jante h kuch nhi kr payenge uska,Pahlwan jante h kuch nhi kr payenge uska,jagriti tiwari upp,https://yt3.ggpht.com/ytc/AOPolaRrFnzD2i3N_3rk...,http://www.youtube.com/channel/UCRkcewHFhxE5Kf...,{'value': 'UCRkcewHFhxE5KfLHrLx0wpA'},True,none,0,2023-06-28T15:10:36Z,2023-06-28T15:10:36Z,,UgxjuOlDigsmuDu53J54AaABAg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1119906,deNOEjZtSfI,Why only haryana wrestlers have problem,Why only haryana wrestlers have problem,speaking puppet,https://yt3.ggpht.com/Y38SKI3vIUZ_HYqSHxBR1WTs...,http://www.youtube.com/channel/UCWqcerY29ChwFP...,{'value': 'UCWqcerY29ChwFPst8Ay1cOQ'},True,none,1,2023-05-09T06:08:43Z,2023-05-09T06:08:43Z,UgytwVIYA9vbe8SvlUB4AaABAg,UgytwVIYA9vbe8SvlUB4AaABAg.9pSgAgHsDMY9pUG96HLnEY
1119930,deNOEjZtSfI,😂😂😂😂,😂😂😂😂,Sunil kumar Yadav,https://yt3.ggpht.com/ytc/AOPolaQS1u5_6pkOKTIw...,http://www.youtube.com/channel/UCnwJ0GXunxSZ7f...,{'value': 'UCnwJ0GXunxSZ7fPsnU7lC5Q'},True,none,1,2023-05-08T16:48:37Z,2023-05-08T16:48:37Z,UgxcUJoWvkjnswpMLI54AaABAg,UgxcUJoWvkjnswpMLI54AaABAg.9pSeZ1ozTQS9pSp_y5ddrW
1119934,deNOEjZtSfI,We support you ❤,We support you ❤,SAURABH KUMAR,https://yt3.ggpht.com/ytc/AOPolaTN4bNGrprHx4sF...,http://www.youtube.com/channel/UCVe1ze2yPRRM5c...,{'value': 'UCVe1ze2yPRRM5c0UZcocdfQ'},True,none,3,2023-05-08T15:10:29Z,2023-05-08T15:10:29Z,,UgzelVWHs-KV1VKHoZ14AaABAg
1119939,deNOEjZtSfI,We support you ❤,We support you ❤,Rafat Ali Official,https://yt3.ggpht.com/PuN3oDQ4v7-DE0k1n39dGWjz...,http://www.youtube.com/channel/UC34yoxvYlw8hJN...,{'value': 'UC34yoxvYlw8hJN0nUfazdKQ'},True,none,13,2023-05-08T15:01:52Z,2023-05-08T15:01:52Z,,UgyH7fp9vLth3ETWU254AaABAg


In [8]:
len(df['textDisplay'].unique())

93475

## Simple Slur List Frequency Count
This is a simple frequency count of how many words from the slur list are found in the data.
I have stored the results in a dictonary called _'slur_counts'_

In [9]:
with open('/content/gdrive/My Drive/youtube_dataset/slur_list.txt', 'r') as file:
    slur_words = [word.strip() for word in file.readlines()]

In [10]:
# slur_words

In [11]:
slur_words_set = set(slur_words)

In [12]:
# A dictonary to count the frequency of each slur word identified
slur_counts = {}

In [13]:
'''
for text in df['textOriginal']:
    words = text.split()
    for word in words:
        if word in slur_words:
            if word in slur_counts:
                slur_counts[word] += 1
            else:
                slur_counts[word] = 1
'''
for text in df['textOriginal']:
    if text is not None:
        words = text.split()
        slur_words_in_text = set(words) & slur_words_set
        for word in slur_words_in_text:
            if word in slur_counts:
                slur_counts[word] += 1
            else:
                slur_counts[word] = 1
# Problem with code, in set the if a user has used the same word multiple times, set will dismiss that.

In [14]:
sorted_slur_counts = dict(sorted(slur_counts.items(), key=lambda item: item[1], reverse=True))

In [40]:
# sorted_slur_counts
list(sorted_slur_counts.items())[:8]

[('chutiya', 2647),
 ('chutiye', 1839),
 ('chod', 1788),
 ('mc', 1155),
 ('bsdk', 1147),
 ('stupid', 994),
 ('sali', 992),
 ('gaddar', 981)]

In [16]:
sum(sorted_slur_counts.values())

25393

In [17]:
'''
One thing is clear from this, all the explicit words that are in english are not there in the list,
this means youtube either deletes them or hides them.
'''

'\nOne thing is clear from this, all the explicit words that are in english are not there in the list,\nthis means youtube either deletes them or hides them.\n'

## Regex
What is a Regex? -> a sequence of characters that specifies a match pattern in text.

**Step 1** - Clean and Preprocess the Data
1.   Remove User Names - `r'@\w+\b'`
2.   Remove URL's and unecessary puntuation marks - `[^\w\s]|http\S+|www\S+|https\S+`
3.  Remove Double spaces - `r'\s+', ' '`
4.  Remove any leading or trailing spaces - `.strip()`



In [36]:
text = "@user, temp, temp, ;;;;;;;; I hate the offensive and racist comments. They disgust me. Visit https://example.com for more information."

In [38]:
cleaned_text = re.sub(r'@\w+\b|[^\w\s]|http\S+|www\S+|https\S+', '', text)
cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
cleaned_text = cleaned_text.strip()
print(cleaned_text)

temp temp I hate the offensive and racist comments They disgust me Visit for more information


## Identify Text Language

In [18]:
#supressing output
# !pip install inltk==0.9 > /dev/null 2>&1
# !pip install --upgrade fastai > /dev/null 2>&1

In [19]:
!pip install polyglot==16.7.4 > /dev/null 2>&1
!pip install PyICU > /dev/null 2>&1
!pip install pycld2 > /dev/null 2>&1

In [20]:
# Use iNLTK, Indic NLP Library or polyglot
# from inltk.inltk import identify_language, reset_language_identifying_models
from polyglot.detect import Detector
# languages supported by polyglot - https://polyglot.readthedocs.io/en/latest/Detection.html#supported-languages

In [21]:
# text = 'आप कैसे हैं?' text = 'તમે કેમ છો' text = 'எப்படி இருக்கிறீர்கள்' text = 'സുഖമാണോ' text = 'तू कसा आहेस'
text = 'आप कैसे हैं?'

In [22]:
detector = Detector(text)
print(detector.language)

name: Hindi       code: hi       confidence:  96.0 read bytes:  1638
