In [1]:
from Twitter import TwitterAccess
import pandas as pd
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from collections import Counter
pd.options.display.max_colwidth = None
import nltk
import string
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import random
import numpy as np
from multiprocessing import  Pool

[nltk_data] Downloading package stopwords to /u/arsaikia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
### Emojis
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# def remove_emoji(text):
#     emoji_pattern = re.compile("["
#                            u"\U0001F600-\U0001F64F"  # emoticons
#                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
#                            "]+", flags=re.UNICODE)
#     return emoji_pattern.sub(r'', text)

# Compiled once at definition time and reused by every remove_emoji() call.
# Previously the class contained the range U+24C2..U+1F251, which spans the
# whole upper Basic Multilingual Plane and therefore also deleted CJK, Hangul,
# kana, fullwidth forms, etc. Only the emoji code points from that span are
# listed here.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary planes (emoticons, pictographs, transport, flags, ...)
    "\u2500-\u2BEF"          # box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\u24C2"                 # circled latin capital letter M
    "\u200d"                 # zero width joiner (glues emoji sequences together)
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph "congratulation"
    "\u3299"                 # circled ideograph "secret"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Characters matched by ``_EMOJI_PATTERN`` are deleted; everything else,
    including non-Latin scripts in the Basic Multilingual Plane, is kept.

    Note: the parameter name shadows the stdlib ``string`` module inside this
    function only; it is kept so existing callers are unaffected.
    """
    return _EMOJI_PATTERN.sub('', string)

### Hashtags
def remove_hashtag(text):
    """Return ``text`` with every ``#`` plus the word characters after it deleted."""
    return re.sub(r'#\w+', '', text)

### Punctuations
def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

### Mentions
def remove_mentions(text):
    """Return ``text`` with every ``@`` plus the word characters after it deleted."""
    return re.sub(r'@\w+', '', text)

### URL
def remove_URL(text):
    """Return ``text`` without the ``URL`` placeholder token and ``&amp;`` entities.

    The placeholder is matched as a whole word and case-insensitively, so
    words that merely contain "url" (e.g. "curly") are left intact and the
    function works on both raw and lower-cased tweets. The ``;`` terminating
    ``&amp;`` is removed together with the entity when present.
    """
    return re.sub(r'\burl\b|&amp;?', '', text, flags=re.IGNORECASE)

### Spell Checker

spell = SpellChecker()
def correct_spellings(text):
    """Return ``text`` with words flagged by the spell checker replaced.

    ``text`` is split on whitespace; each word reported by ``spell.unknown``
    is replaced with ``spell.correction(word)`` and the words are re-joined
    with single spaces.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    # correction() can return None when the checker has no suggestion (newer
    # pyspellchecker releases); keep the original word in that case so that
    # join() never receives a non-string.
    return " ".join(
        (spell.correction(word) or word) if word in misspelled_words else word
        for word in words
    )

def clean_tweets(df):
    """Add a ``tweet_cleaned`` column to ``df`` and return ``df``.

    The ``tweet`` column is lower-cased, then passed through the removal
    helpers in the order emoji -> hashtags -> mentions -> URL placeholder ->
    punctuation, and finally stripped of surrounding whitespace. Spell
    correction (``correct_spellings``) is not part of the chain.
    """
    cleaned = df['tweet'].str.lower()
    # Order matters: hashtags and mentions must go before punctuation removal,
    # otherwise their '#' / '@' markers would already be gone.
    for step in (remove_emoji, remove_hashtag, remove_mentions, remove_URL, remove_punct):
        cleaned = cleaned.apply(step)
    df['tweet_cleaned'] = cleaned.str.strip()
    return df

def parallelize_cleaning(df, func, n_cores=10):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Picklable function taking a DataFrame chunk and returning a DataFrame.
    n_cores : int, default 10
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in the original row order.
    """
    # Split row *positions* rather than the frame itself: np.array_split on a
    # DataFrame relies on DataFrame.swapaxes, which pandas has deprecated.
    # The chunk boundaries are the same as np.array_split(df, n_cores).
    chunks = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), n_cores)]
    pool = Pool(n_cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

# Manual check on a single tweet: remove_URL(remove_punct(remove_mentions(remove_emoji(remove_hashtag(olid['tweet'][2]))))).strip()

In [15]:
!pwd

/home/arsaikia/hate_detection


In [3]:
# Input and output directories (relative to the notebook's working directory).
DATA_PATH = '../data/OLID/'
PREPROCESSED = '../preprocessed/'

# Tab-separated training file inside DATA_PATH.
training_data = 'olid-training-v1.0.tsv'

olid = pd.read_csv(f'{DATA_PATH}{training_data}', sep='\t')
olid.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans what their take on this is.,OFF,UNT,
1,90194,@USER @USER Go home youâ€™re drunk!!! @USER #MAGA #Trump2020 ðŸ‘ŠðŸ‡ºðŸ‡¸ðŸ‘Š URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of shit to a volcano. ðŸ˜‚""",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illegals to move into red states,NOT,,


In [4]:
def subtask_cii(col):
    """Map the labels ``'GRP'`` and ``'OTH'`` to ``'OTH'``; return any other value unchanged."""
    return 'OTH' if col in ('GRP', 'OTH') else col
    
def subtask_ciii(col):
    """Keep the labels ``'GRP'`` and ``'OTH'``; map every other value to NaN."""
    if col in ('GRP', 'OTH'):
        return col
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan
    
# Derive the two auxiliary label columns from the original subtask_c labels.
olid['subtask_c_ii'] = olid['subtask_c'].apply(subtask_cii)
olid['subtask_c_iii'] = olid['subtask_c'].apply(subtask_ciii)
# olid.head(20)

In [5]:

# Clean the training tweets in parallel, then persist the result as CSV.
olid_clean = parallelize_cleaning(olid, clean_tweets)
olid_clean.to_csv(f'{PREPROCESSED}olid.csv', index=False)

In [7]:

def create_test_dataset(text, labels, out, y_label):
    """Build a cleaned, labelled test-set CSV.

    Parameters
    ----------
    text : str
        Path of the tab-separated test-set file.
    labels : str
        Path of the header-less CSV with the gold labels; its two columns are
        named ``'id'`` and ``y_label``.
    out : str
        Path the resulting CSV is written to (without the index).
    y_label : str
        Column name given to the label column.

    Returns
    -------
    None
    """
    olid_test = pd.read_csv(text, sep='\t')
    olid_test_labels = pd.read_csv(labels, header=None)
    olid_test_labels.columns = ['id', y_label]

    # No explicit key: pandas joins on the columns both frames share.
    olid_test = pd.merge(olid_test, olid_test_labels)
    olid_test_clean = parallelize_cleaning(olid_test, clean_tweets)

    # Only frames carrying subtask_c labels get the derived columns. An
    # explicit check replaces the former bare ``except: pass`` so that real
    # errors are no longer silently swallowed.
    if 'subtask_c' in olid_test_clean.columns:
        olid_test_clean['subtask_c_ii'] = olid_test_clean['subtask_c'].apply(subtask_cii)
        olid_test_clean['subtask_c_iii'] = olid_test_clean['subtask_c'].apply(subtask_ciii)

    olid_test_clean.to_csv(out, index=False)
# olid_testa_labels

# Build one cleaned test file per OLID sub-task level (a, b, c).
for level in ('a', 'b', 'c'):
    create_test_dataset(
        DATA_PATH + f'testset-level{level}.tsv',
        DATA_PATH + f'labels-level{level}.csv',
        PREPROCESSED + f'olid-level{level}.csv',
        f'subtask_{level}',
    )