In [1]:
import gdown

In [2]:
!gdown --id 1KmoZYmPzxrrtR4a3mSfPIumbDwf_Mrpw > /dev/null 2>&1
!gdown --id 19vK08esg9hQtD8JB-9jm7Qkck7nZ-ZqR > /dev/null 2>&1

## About the Dataset
This a YouTube comments dataset scrapped using the [YouTube Data API](https://developers.google.com/youtube/v3/docs/comments). Currently the dataset consists of roughly **1.12M** comments. The dataset can be accessed [here](https://drive.google.com/drive/folders/1-9OnYbFuiSA0M7skOYche5erzZ4Ba5Nz?usp=sharing).
<br><br>
The dataset currently has text in _English, Hindi and Hinglish_.
Please refer to the following information for an overview of the columns and the corresponding data stored within them. [Link](https://github.com/aatmanvaidya/Sentiment-Analysis-of-Online-Harassment-Towards-Women-Wrestlers/blob/scraper/attributes.txt).

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from collections import Counter
import re

In [4]:
%%time
df = pd.read_feather(r'comments_cleaned_feather.feather')

CPU times: user 1.67 s, sys: 1.28 s, total: 2.94 s
Wall time: 2.47 s


In [5]:
df.head()

Unnamed: 0,videoId,textDisplay,textOriginal,authorDisplayName,authorProfileImageUrl,authorChannelUrl,authorChannelId,canRate,viewerRating,likeCount,publishedAt,updatedAt,parentId,commentId
0,EBOKYsWUhvI,Dub maro jaato 😢😢.<br>Tumse tumare ladkiya nhi...,Dub maro jaato 😢😢.\nTumse tumare ladkiya nhi b...,HECTOR OF TROY,https://yt3.ggpht.com/ytc/AOPolaQNP5bd7gNvbAas...,http://www.youtube.com/channel/UC5G8fjqoiFIqHp...,{'value': 'UC5G8fjqoiFIqHpKyVeOTsFg'},True,none,0,2023-07-06T07:04:02Z,2023-07-06T07:04:02Z,,UgwbyvIkkAhUdaCFpcp4AaABAg
1,EBOKYsWUhvI,Haar gye bechare,Haar gye bechare,Geeta Saini,https://yt3.ggpht.com/8XmOPNrT3Vy3wr0fItlWbaMk...,http://www.youtube.com/channel/UCsMGRdH3YHrbs2...,{'value': 'UCsMGRdH3YHrbs21NOraRuyQ'},True,none,0,2023-07-03T22:21:37Z,2023-07-03T22:21:37Z,,Ugyz3OwSXamho91-8I94AaABAg
2,EBOKYsWUhvI,Dhamki mili pahalwano ko aur sab manage kr liy...,Dhamki mili pahalwano ko aur sab manage kr liy...,Ayaan Chouhan,https://yt3.ggpht.com/ytc/AOPolaREH2WnrnbD53OI...,http://www.youtube.com/channel/UC_dWuNh6zydTHI...,{'value': 'UC_dWuNh6zydTHIRr6hi3Omg'},True,none,0,2023-07-03T05:06:33Z,2023-07-03T05:06:33Z,,Ugyphs1TT1Yoj7MZBVJ4AaABAg
3,EBOKYsWUhvI,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Jaato pr ye boj rhega ki vo apni hi vyavstha s...,Kamal,https://yt3.ggpht.com/ytc/AOPolaTRSm_dEOKj9H82...,http://www.youtube.com/channel/UCVdGObpHM-IMHB...,{'value': 'UCVdGObpHM-IMHB_b7_K-7rA'},True,none,1,2023-07-02T03:39:50Z,2023-07-02T03:39:50Z,,UgwnIxpuFAcKsEzffBp4AaABAg
4,EBOKYsWUhvI,Pahlwan jante h kuch nhi kr payenge uska,Pahlwan jante h kuch nhi kr payenge uska,jagriti tiwari upp,https://yt3.ggpht.com/ytc/AOPolaRrFnzD2i3N_3rk...,http://www.youtube.com/channel/UCRkcewHFhxE5Kf...,{'value': 'UCRkcewHFhxE5KfLHrLx0wpA'},True,none,0,2023-06-28T15:10:36Z,2023-06-28T15:10:36Z,,UgxjuOlDigsmuDu53J54AaABAg


In [6]:
df.shape

(1119948, 14)

In [7]:
df['textOriginal'] = df['textOriginal'].astype(str)

In [8]:
duplicate_rows = df[df.duplicated(subset='textOriginal', keep=False)]

In [9]:
len(df['textDisplay'].unique())

93475

In [10]:
# Now this is strange, the dataset doesnt have duplicate or empty rows, why is this happening? is it some datatype problem?
# Lets try to do a simple check for it again

In [11]:
count = 0
value_list = []
for index, row in df.iterrows():
    value_list.append(row['textOriginal'])
    count+=1
print(count)
print(len(value_list))

1119948
1119948


In [12]:
# This means we are penetrating through the entire dataset. I dont know why pandas is doing that ://
# There is one small problem, there is a \u200b character that is present, we might have to remove that.

## Simple Slur List Frequency Count
This is a simple frequency count of how many words from the slur list are found in the data.
I have stored the results in a dictonary called _'slur_counts'_

In [13]:
with open('slur_list.txt', 'r') as file:
    slur_words = [word.strip() for word in file.readlines()]

In [14]:
# slur_words

In [15]:
slur_words_set = set(slur_words)

In [16]:
# A dictonary to count the frequency of each slur word identified
slur_counts = {}

In [17]:
'''
for text in df['textOriginal']:
    words = text.split()
    for word in words:
        if word in slur_words:
            if word in slur_counts:
                slur_counts[word] += 1
            else:
                slur_counts[word] = 1
'''
for text in df['textOriginal']:
    if text is not None:
        words = text.split()
        slur_words_in_text = set(words) & slur_words_set
        for word in slur_words_in_text:
            if word in slur_counts:
                slur_counts[word] += 1
            else:
                slur_counts[word] = 1
# Problem with code, in set if a user has used the same word multiple times, set will dismiss that.

In [18]:
sorted_slur_counts = dict(sorted(slur_counts.items(), key=lambda item: item[1], reverse=True))

In [19]:
# sorted_slur_counts
list(sorted_slur_counts.items())[:8]

[('chutiya', 2647),
 ('chutiye', 1839),
 ('chod', 1788),
 ('mc', 1155),
 ('bsdk', 1147),
 ('stupid', 994),
 ('sali', 992),
 ('gaddar', 981)]

In [20]:
sum(sorted_slur_counts.values())

25393

In [21]:
'''
One thing is clear from this, all the explicit words that are in english are not there in the list,
this means youtube either deletes them or hides them. And the moderation for local vernacular language is weak.
'''

'\nOne thing is clear from this, all the explicit words that are in english are not there in the list,\nthis means youtube either deletes them or hides them. And the moderation for local vernacular language is weak.\n'

# Regex
What is a Regex? -> a sequence of characters that specifies a match pattern in text.

**Step 1** - Clean and Preprocess the Data
1.   Remove User Names - `r'@\w+\b'`
2.   Remove URL's and unecessary puntuation marks - `[^\w\s.]|http\S+|www\S+|https\S+`
3.  Remove Double spaces - `r'\s+', ' '`
4.  Remove any leading or trailing spaces - `.strip()`



In [22]:
text = "@user, temp, temp, ;;;;;;;; I hate the offensive and racist comments\n\n\n. They disgust me. Visit https://example.com for more information."

In [23]:
def clean_text(text):
    cleaned_text = re.sub(r'@\w+\b|[^\w\s.]|(?:https?|ftp)://\S+|www.\S+', '', text).replace('\n', '').strip()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text

In [24]:
print(clean_text(text))

temp temp I hate the offensive and racist comments. They disgust me. Visit for more information.


In [25]:
regexDf = df[['textOriginal']]

In [26]:
# clean text
regexDf['textOriginal'] = regexDf['textOriginal'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regexDf['textOriginal'] = regexDf['textOriginal'].apply(lambda x: clean_text(x))


### Just check the count of words from the slur list ignoring case (upper or lower) of the word. We also look if any word from the slur list is present as a sub-string of any other word in the dataset.

1.   `re.findall()` - searches for all non-overlapping occurrences of a pattern in a given text.
2.   `(?i)` - make matching case-insenstive
3.  `'(' + '|'.join(slur_words) + r')'` - find a pattern that matches any of the slur words

In [38]:
skip_regex = False

In [27]:
%%time
slur_word_counts = {}
for index, row in regexDf.iterrows():
    text = row['textOriginal']
    # matches = re.findall(r"(?i)\b(" + '|'.join(slur_words) + r")\b", text)
    matches = re.findall(r'(?i)(' + '|'.join(slur_words) + r')', text)
    for match in matches:
        slur_word_counts[match] = slur_word_counts.get(match, 0) + 1

CPU times: user 11min 54s, sys: 1.56 s, total: 11min 56s
Wall time: 12min 3s


In [28]:
# slur_word_counts

In [29]:
sum(slur_word_counts.values())

101382

In [30]:
sorted_slur_word_counts = dict(sorted(slur_word_counts.items(), key=lambda item: item[1], reverse=True))
print(list(sorted_slur_word_counts.items())[:8])

[('ms', 13407), ('bai', 9483), ('halwa', 8551), ('chod', 7929), ('chutiya', 3554), ('fat', 3144), ('mc', 3103), ('balatkar', 2631)]


In [31]:
# We clearly see that the amount of slur words drastically increase. Now the step ahead would be too create seperate regex for the most used slur words.
# Note - the substrings could also be a part of different words and not necessarily hateful. But given the nature of the words, I feel that chances are low.

## Create Regex for the most used Slur Words

- Lets start by looking at the word `"chutiya"`

In [118]:
text = "they called me a chhhutiyyaa and I ignored them."
# text = "abcchhutiyeexvy"

In [119]:
pattern = r'ch+h?[uo]t[iy]a?'

In [120]:
matches = re.findall(pattern, text, re.IGNORECASE)

In [121]:
matches

['chhhuti']

In [122]:
%%time
slur_word_count_2 = {}
for index, row in regexDf.iterrows():
    text = row['textOriginal']
    matches = re.findall(pattern, text, re.IGNORECASE)
    for match in matches:
        slur_word_count_2[match] = slur_word_count_2.get(match, 0) + 1

CPU times: user 57.4 s, sys: 147 ms, total: 57.6 s
Wall time: 58.9 s


In [123]:
sum(slur_word_count_2.values())

9289

In [124]:
sorted_slur_word_count_2 = dict(sorted(slur_word_count_2.items(), key=lambda item: item[1], reverse=True))
print(list(sorted_slur_word_count_2.items())[:8])

[('chuti', 6383), ('Chuti', 1168), ('choti', 463), ('chutia', 344), ('chuty', 204), ('chutya', 201), ('chhoti', 167), ('Chutya', 90)]


## Identify Text Language

In [32]:
#supressing output
# !pip install inltk==0.9 > /dev/null 2>&1
# !pip install --upgrade fastai > /dev/null 2>&1

In [33]:
!pip install polyglot==16.7.4 > /dev/null 2>&1
!pip install PyICU > /dev/null 2>&1
!pip install pycld2 > /dev/null 2>&1

In [34]:
# Use iNLTK, Indic NLP Library or polyglot
# from inltk.inltk import identify_language, reset_language_identifying_models
from polyglot.detect import Detector
# languages supported by polyglot - https://polyglot.readthedocs.io/en/latest/Detection.html#supported-languages

In [35]:
# text = 'आप कैसे हैं?' text = 'તમે કેમ છો' text = 'எப்படி இருக்கிறீர்கள்' text = 'സുഖമാണോ' text = 'तू कसा आहेस'
text = 'आप कैसे हैं?'

In [36]:
detector = Detector(text)
print(detector.language)

name: Hindi       code: hi       confidence:  96.0 read bytes:  1638
