In [1]:
import pandas as pd
import numpy as np
import re
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
import enchant
from enchant.checker import SpellChecker
import enchant

In [2]:
import sys
print("Python version")
print (sys.version)

Python version
3.7.8 (tags/v3.7.8:4b47a5b6ba, Jun 28 2020, 08:53:46) [MSC v.1916 64 bit (AMD64)]


In [3]:
df = pd.read_csv('../datasets/labelled/unpreprocessed.csv')

Number of labelled comments

In [4]:
df.shape[0]

20102

# Making the text more uniform

Remove urls

In [5]:
df['preproc'] = df.body.replace(r'http\S+', '', regex=True)

Change acronyms to words for uniformity. 
* https://www.netlingo.com/acronyms.php
* https://blog.adioma.com/internet-acronyms-intro-list-infographic/

In [6]:
import os, sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from util.acronyms_smileys import acronyms

In [7]:
df.preproc = df.preproc.str.lower()
# the ’ was causing issues, took a while to notice
df.preproc = df.preproc.str.replace('’', '\'')

In [8]:
df.preproc = df.preproc.apply(lambda x: ' '.join(acronyms.get(word, word) for word in x.split()))

List of emojis defined from https://en.wikipedia.org/wiki/List_of_emoticons , https://emojipedia.org/people/

Other references:
* https://www.urbandictionary.com/define.php?term=%F0%9F%92%80
* https://www.urbandictionary.com/define.php?term=%F0%9F%94%A5

In [9]:
import emoji
# list of tagged emoticons from above links
from util.acronyms_smileys import smileys
from util.acronyms_smileys import sent_acronyms
# for removing untagged emoji
import demoji
#demoji.download_codes()

In [10]:
def replace_repeating_emoji(text):
    uniques = set()
    final_string = list()
    text_arr = [item for item in emoji.get_emoji_regexp().split(text) if not item == '']
    for e in text_arr:
        # for some reason even though it is defined as '❤', when its 
        # imported, it gets loaded as '❤❤'
        if e == '❤':
            e = '❤❤'
        if not bool(emoji.get_emoji_regexp().search(e)):
            final_string.append(smileys.get(e, e))
        else:
            if e not in uniques:
                uniques.add(e)
                final_string.append(smileys.get(e, e))
    return ' '.join(final_string)

In [11]:
# find repeating emoticons and remove repetitions, and tag emoticons
df.preproc = df.preproc.apply(lambda x: replace_repeating_emoji(x) if (bool(emoji.get_emoji_regexp().search(x)) and bool(re.search(r'(.)\1', x))) else x)

In [12]:
# remove untagged emoticons
df.preproc = df.preproc.apply(lambda x : demoji.replace(x, ''))

Set sentiment on acronyms (such as 'lol')

In [13]:
df.preproc = df.preproc.apply(lambda x: ' '.join(sent_acronyms.get(word, word) for word in x.split()))

Remove hashtags

In [14]:
df.preproc = df.preproc.apply(lambda x: ' '.join([word for word in x.split() if '#' not in word]))

In [15]:
df.loc[df.preproc.str.contains('#')]

Unnamed: 0,body,positive,negative,neutral,rated,comment_id,video_id,date,preproc


Replace negations with "not"

In [16]:
negations = ['don\'t', 'aint' 'aren\'t', 'couldn\'t','didn\'t', 
             'doesn\'t', 'hadn\'t', 'hasn\'t', 'haven\'t', 'isn\'t', 
             'mightn\'t', 'mustn\'t', 'needn\'t', 'shouldn\'t', 'wasn\'t', 
             'weren\'t', 'won\'t', 'wouldn\'t', 'nor', 'not', 'cant', 'dont',
            'arent', 'couldnt', 'didnt', 'doesnt', 'hadnt', 'hasnt', 'havent',
            'isnt', 'mightnt', 'mustnt', 'neednt', 'shouldnt', 'wasnt',
            'werent', 'wont', 'wouldnt']
regx = r'\b(?:{})\b'.format('|'.join(negations))
df.preproc = df.preproc.str.replace(regx, 'not')

In [17]:
# Remove negations from stop list, add two missing contractions
stopwords_list = stopwords.words('english')
stopwords_list = [el for el in stopwords_list if el not in negations]
missing_words = ['i\'m', 'i\'d']
stopwords_list.extend(missing_words)

In [18]:
singular_pronouns = ['me', 'you', 'he', 'she', 'they', 'his', 'her', 'him']
stopwords_list_complete = [x for x in stopwords_list if x not in singular_pronouns]

20 most commonly used words

In [19]:
pd.Series(' '.join(df.preproc).lower().split()).value_counts()[:20]

the      6492
i        5519
this     4766
is       3940
you      3794
a        3650
to       3399
and      3072
not      2546
of       2270
it       1971
like     1941
in       1772
that     1646
so       1624
was      1459
are      1367
laugh    1352
for      1252
just     1234
dtype: int64

In [20]:
def get_common_stopwords(stop_words, n=5):
    most_freq_words = pd.Series(' '.join(df.preproc).lower().split()).value_counts()[:int(n*2)].keys().to_numpy()
    common_stopwords = [i for i in most_freq_words if i in stopwords_list_complete]
    return common_stopwords[0:n]

Remove most common stop words

In [21]:
common_stopwords = get_common_stopwords(stopwords_list, n=20)
df.preproc = df.preproc.apply(lambda x: ' '.join([word for word in x.split() if word not in (common_stopwords)]))

In [22]:
df.head(5)

Unnamed: 0,body,positive,negative,neutral,rated,comment_id,video_id,date,preproc
0,*stretched past the 10 minute mark for ad reve...,0,1,0,1,UgztP4lVR-Epv5HlSXN4AaABAg,ItYOdWRo0JY,2020-01-10T20:24:33Z,"*stretched past 10 minute mark ad revenue, see!*"
1,That makes no sense you sold a new phone for n...,0,1,0,1,Ugw3sxLwikkGsUza7hh4AaABAg,ItYOdWRo0JY,2020-03-22T19:13:19Z,makes no sense you sold new phone nothing ?
2,WHY IS HE SO HAPPY HE JUST LOST $900 DOLLARS,0,0,1,1,Ugxe5-sz9eHyFCvZX4l4AaABAg,ItYOdWRo0JY,2020-10-30T12:27:57Z,why he happy he lost $900 dollars
3,Does it work on iPads,0,0,1,1,UgxInsxWP8cnbFEh1Cp4AaABAg,ItYOdWRo0JY,2020-07-21T16:38:40Z,does work ipads
4,Yo ass better not have donated.. that cord mig...,0,0,1,1,Ugwn6_K1_792ARqfY4h4AaABAg,ItYOdWRo0JY,2019-12-23T13:35:04Z,yo ass better not have donated.. cord might ha...


Remove punctuation

In [23]:
import string
df.preproc = df.preproc.str.replace('[{}]'.format(string.punctuation), '')

In [24]:
df.head(5)

Unnamed: 0,body,positive,negative,neutral,rated,comment_id,video_id,date,preproc
0,*stretched past the 10 minute mark for ad reve...,0,1,0,1,UgztP4lVR-Epv5HlSXN4AaABAg,ItYOdWRo0JY,2020-01-10T20:24:33Z,stretched past 10 minute mark ad revenue see
1,That makes no sense you sold a new phone for n...,0,1,0,1,Ugw3sxLwikkGsUza7hh4AaABAg,ItYOdWRo0JY,2020-03-22T19:13:19Z,makes no sense you sold new phone nothing
2,WHY IS HE SO HAPPY HE JUST LOST $900 DOLLARS,0,0,1,1,Ugxe5-sz9eHyFCvZX4l4AaABAg,ItYOdWRo0JY,2020-10-30T12:27:57Z,why he happy he lost 900 dollars
3,Does it work on iPads,0,0,1,1,UgxInsxWP8cnbFEh1Cp4AaABAg,ItYOdWRo0JY,2020-07-21T16:38:40Z,does work ipads
4,Yo ass better not have donated.. that cord mig...,0,0,1,1,Ugwn6_K1_792ARqfY4h4AaABAg,ItYOdWRo0JY,2019-12-23T13:35:04Z,yo ass better not have donated cord might have...


Remove repeating vowels and consonants

In [25]:
df.loc[df.positive == 1].head(10)

Unnamed: 0,body,positive,negative,neutral,rated,comment_id,video_id,date,preproc
117,Good,1,0,0,1,UgzHduB8hJNtSSVQJ-t4AaABAg,ItYOdWRo0JY,2019-03-01T09:31:58Z,good
144,Nice!,1,0,0,1,UgxB4Ya7EeXjnL2gXrF4AaABAg,ItYOdWRo0JY,2019-02-11T13:52:23Z,nice
229,U are a kind person😁👏,1,0,0,1,UgwTHUgTPBxRu8bz6-l4AaABAg,ItYOdWRo0JY,2019-01-30T12:56:09Z,you kind person
273,this looks good,1,0,0,1,UgjVVUkF3LLWNXgCoAEC,PpcNQNJmU9Y,2016-07-14T00:24:28Z,looks good
277,Cool,1,0,0,1,UgzhYX7nMHIoIuivJKB4AaABAg,ItYOdWRo0JY,2019-01-26T09:37:00Z,cool
284,Another talent Matthew possess .,1,0,0,1,UgwsvtG8cZmcHp_jynx4AaABAg,0tO_l_Ed5Rs,2019-07-29T22:00:27Z,another talent matthew possess
285,I kinda love this.. its not mumble rap b.s plu...,1,0,0,1,Ugxn2Ag15UkrVAtPDh54AaABAg,0tO_l_Ed5Rs,2020-03-11T22:15:20Z,kinda love this its not mumble rap bs plus mat...
287,"love this , mathew you killed it",1,0,0,1,Ugy1Wp2qqlNY2212gCt4AaABAg,0tO_l_Ed5Rs,2020-07-31T15:15:35Z,love mathew you killed
290,Dang matt you killin it.,1,0,0,1,Ugxv28EU_oNlNd7b13h4AaABAg,0tO_l_Ed5Rs,2020-03-24T22:22:26Z,dang matt you killin it
291,Bruh the good ol days,1,0,0,1,Ugzo7Rf5bNJFl6e5mYR4AaABAg,0tO_l_Ed5Rs,2019-12-28T22:40:42Z,bruh good ol days


In [26]:
# https://stackoverflow.com/questions/46701245/how-to-replace-multiple-consecutive-repeating-characters-into-1-character-in-pyt
df.preproc = df.preproc.apply(lambda x: ' '.join([re.sub(r'(.)\1{2,}', r'\1\1', word) for word in x.split()]))

Tag any sequence of "ha" or "ah" (for example, "ahaha" or "haha") as a "laugh"

In [27]:
df.preproc = df.preproc.apply(lambda x: ' '.join([re.sub(r'([ha]+[ah]+).*\1', r'laugh', word) for word in x.split()]))

Remove numbers

In [28]:
df.preproc = df.preproc.str.replace('\d+', '')

In [29]:
def assign_rating(row):
    if row['positive'] == 1:
        return 1
    elif row['negative'] == 1:
        return 0
    else:
        return -1

In [30]:
# 0 - negative, 1 - positive, -1 neutral
df['rating'] = df.apply(lambda row: assign_rating(row), axis=1)

Write preprocessed column and the rating to a file

In [31]:
# copy relevant columns for later to file write
preprocessed = df.filter(['comment_id', 'video_id', 'date', 'preproc', 'rating'], axis=1)
preprocessed.columns = ['comment_id', 'video_id', 'date', 'body', 'rating']

In [32]:
preprocessed = preprocessed.loc[preprocessed.body != '']

In [33]:
preprocessed.to_csv('../datasets/preprocessed/preprocessed_full.csv', index=False)

In [34]:
preprocessed.loc[preprocessed.rating == 1].to_csv('../datasets/preprocessed/preprocessed_pos.csv', index=False)
preprocessed.loc[preprocessed.rating == 0].to_csv('../datasets/preprocessed/preprocessed_neg.csv', index=False)