## This section is for cleaning the tweets using the clean_tweet function.
This section reads the tweet data from `G_DATA`, cleans it, and stores the result in my local `./data` directory under `./politics_cleaned`.

In [3]:
import re
import allTokens
from allTokens import abbr_dict, emoji_pattern

import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language

import pandas as pd
from threading import Thread

In [2]:
# Local directory (relative path) used for output.
L_DATA = "../data"
# Directory holding the raw input data. The value already ends with "/",
# so callers that format paths as f"{G_DATA}/name" produce a double slash.
G_DATA = "/DATA/BLF/politics/"
# G_DATA_STORE = "/DATA/BLF/politics/CLEANED"

In [3]:
# code from src/preprocessing.py
# Module-level slot for the shared spaCy pipeline; None until it is loaded.
my_nlp = None

# HTML character entities mapped to the text that replaces them.
# Every key is a complete entity including its trailing ';' so that a
# substitution never leaves a stray semicolon behind.
extraChar = {
    '&quot;': '"',
    # Deliberately spelled out as a word rather than the '&' character.
    '&amp;': 'and',
    '&lt;': '<',
    '&gt;': '>',
    # A non-breaking space becomes an ordinary space so that word splitting
    # on " " keeps working on the substituted text.
    '&nbsp;': ' ',
    '&iexcl;': '¡',
    '&cent;': '¢',
    '&pound;': '£',
    '&curren;': '¤',
    '&yen;': '¥',
    '&brvbar;': '¦',
    '&sect;': '§',
    '&uml;': '¨',
    '&copy;': '©',
    '&ordf;': 'ª',
    '&laquo;': '«',
    '&not;': '¬',
    '&shy;': '\xad',
    '&reg;': '®',
    '&macr;': '¯',
    '&deg;': '°',
    '&plusmn;': '±',
    '&sup2;': '²',
    '&sup3;': '³',
    '&acute;': '´',
    '&micro;': 'µ',
    '&para;': '¶',
    '&middot;': '·',
    '&cedil;': '¸',
    '&sup1;': '¹',
    '&ordm;': 'º',
    '&raquo;': '»',
    '&frac14;': '¼',
    '&frac12;': '½',
    '&frac34;': '¾',
    '&iquest;': '¿',
    '&times;': '×',
    '&divide;': '÷',
    '&ETH;': 'Ð',
    '&eth;': 'ð',
    '&THORN;': 'Þ',
    '&thorn;': 'þ',
    '&AElig;': 'Æ',
    '&aelig;': 'æ',
    '&OElig;': 'Œ',
    '&oelig;': 'œ',
    '&Aring;': 'Å',
    '&Oslash;': 'Ø',
    '&Ccedil;': 'Ç',
    '&ccedil;': 'ç',
    '&szlig;': 'ß',
    '&Ntilde;': 'Ñ',
    '&ntilde;': 'ñ',
}

# Typographic punctuation mapped to plain ASCII, grouped by replacement:
# curly single quotes and the backtick -> "'", curly double quotes -> '"',
# and the ellipsis character -> ".".
special = {
    **dict.fromkeys(("’", "‘", "`"), "'"),
    **dict.fromkeys(('“', '”'), '"'),
    '…': ".",
}

# Matches one code point from any of three Unicode ranges:
# U+1F600-U+1F64F, U+1F300-U+1F5FF and U+1F680-U+1F6FF.
# NOTE: this rebinds the `emoji_pattern` name imported from allTokens.
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F]"
    "|[\U0001F300-\U0001F5FF]"
    "|[\U0001F680-\U0001F6FF]"
)

In [4]:
@Language.factory('language_detector')
def language_detector(nlp, name):
    """Factory registered with spaCy under the name 'language_detector'.

    Both arguments are accepted but unused; every call returns a new
    ``LanguageDetector`` instance.
    """
    return LanguageDetector()

def get_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    The pipeline is 'en_core_web_lg' with the 'language_detector'
    component added, and is cached in the module-level ``my_nlp``.

    Returns:
        The cached pipeline object.
    """
    global my_nlp
    if my_nlp is None:
        # Finish building the pipeline before publishing it in the global:
        # this file calls get_nlp() from several threads, and assigning
        # first would let another thread pick up a pipeline that does not
        # have the language detector yet.
        nlp = spacy.load('en_core_web_lg')
        nlp.add_pipe('language_detector')
        my_nlp = nlp
    return my_nlp

def is_english(text):
    """Return True when ``text`` is detected as English with score > 0.95.

    Args:
        text: The string to run through the pipeline from ``get_nlp()``.

    Returns:
        bool: True only if the detected language code is 'en' and the
        detector's score exceeds 0.95.
    """
    detected = get_nlp()(text)._.language
    # A high score alone only means the detector is confident about *some*
    # language, so the language code has to be checked as well.
    return detected['language'] == 'en' and detected['score'] > 0.95
        
def removeTags(text,splitter):
    """Strip trailing one-word tags introduced by ``splitter`` from ``text``.

    The text is split on ``splitter`` (e.g. "#" or "@"). Working backwards
    from the end, every segment that contains no space after stripping is
    treated as a tag and dropped; the scan stops at the first segment that
    has more than one word.

    Args:
        text: The text to process.
        splitter: The tag marker to split on.

    Returns:
        The remaining segments re-joined with ``splitter`` and stripped,
        prefixed with a single space.
    """
    parts = text.split(splitter)
    # parts[0] is the text before the first splitter, so it can never be a
    # tag; keep it even when it is a single word.
    while len(parts) > 1 and len(parts[-1].strip().split(" ")) == 1:
        parts.pop()
    return " " + splitter.join(parts).strip()

def removeTagsFromStart(text,splitter):
    """Drop leading one-word segments of ``text`` split on ``splitter``.

    Steps, in order:
      1. Split on ``splitter`` and discard leading segments that contain no
         space after stripping. If nothing is left, return ''.
      2. Prefix the first surviving segment with ``splitter`` and split it
         on single spaces.
      3. If the second word contains "you" (case-insensitive) and the first
         word contains "@", remove the second word.
      4. If "@" occurs in the first two characters of the first word,
         remove the first word.
      5. Join all segments with a single space and strip the result.

    NOTE(review): step 2 adds the prefix even when the segment was not
    preceded by ``splitter`` in the input, so with "@" as splitter
    "hello world" comes back as "world". Confirm this is intended.
    """
    segments = text.split(splitter)

    # Count how many leading segments are a single token (or empty).
    skip = 0
    while skip < len(segments) and len(segments[skip].strip().split(" ")) == 1:
        skip += 1
    segments = segments[skip:]
    if not segments:
        return ''

    words = (splitter + segments[0]).split(" ")
    if "@" in words[0] and "you" in words[1].lower():
        words = [words[0]] + words[2:]
    if "@" in words[0].strip()[0:2]:
        words = words[1:]
    segments[0] = " ".join(words)

    return " ".join(segments).strip()


def removeRT(text):
    """Remove a leading retweet header of the form ``RT ...:`` from ``text``.

    Args:
        text: The text to process.

    Returns:
        Everything after the first ":" (stripped) when ``text`` starts with
        "RT" and contains a colon; otherwise ``text`` unchanged.
    """
    if not text.startswith('RT'):
        return text
    _, colon, remainder = text.partition(":")
    if not colon:
        # No colon means there is no header to cut off; returning the tail
        # here would discard the whole text.
        return text
    return remainder.strip()

def clean_tweet(text,removeFromMiddle):
    """Normalise a raw tweet; return '' when the result is not English.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example a
            missing value) yields ''.
        removeFromMiddle: When true, every @mention and #hashtag is deleted
            wherever it occurs. When false, only the tag-stripping helpers
            run, after which the remaining "@" and "#" characters are
            removed but the tag words themselves stay.

    Returns:
        The cleaned, stripped text, or '' if it is empty or ``is_english``
        rejects it.
    """
    if not isinstance(text, str):
        return ''
    text = text.strip()

    # Dictionary keys are applied as regular-expression patterns.
    for key, value in special.items():
        text = re.sub(key, value, text)
    for key, value in abbr_dict.items():
        text = re.sub(key, value, text, flags=re.I)
    for key, value in extraChar.items():
        text = re.sub(key, value, text)

    if removeFromMiddle:
        text = re.sub(r"@[A-Za-z0-9_]+", "", text)
        text = re.sub(r"#[A-Za-z0-9_]+", "", text)
    text = re.sub(r"http\S+", "", text)
    text = emoji_pattern.sub(r' ', text)

    # "#" is stripped both before and after "@" so that a trailing hashtag
    # uncovered by removing a trailing mention is dropped as well.
    text = removeTags(text, "#")
    text = removeTags(text, "@")
    text = removeTags(text, "#")
    text = removeTagsFromStart(text, "@")
    text = removeRT(text)

    text = re.sub(' +', ' ', text)
    text = re.sub("@", '', text)
    text = re.sub("#", '', text)

    # Collapse runs of line breaks, then turn each remaining break into a
    # sentence boundary: ". " unless the line already ended in . ? or !
    text = re.sub(r'[\n\r]+', r'\n', text)
    text = re.sub('(?<![.?!])\n', ". ", text)
    text = re.sub('\n', " ", text)
    text = re.sub(r"\s+", " ", text)

    # Skip the language model entirely when nothing is left to classify.
    if not text.strip() or not is_english(text):
        return ''
    return text.strip()

In [5]:
# Load one DataFrame per month (February through November) from the
# feather files under G_DATA. The file-name abbreviations are not uniform
# ("april" and "sept" versus three-letter forms for the other months).
data_february = pd.read_feather(f"{G_DATA}/output_feb.feather")
data_march = pd.read_feather(f"{G_DATA}/output_mar.feather")
data_april = pd.read_feather(f"{G_DATA}/output_april.feather")
data_may = pd.read_feather(f"{G_DATA}/output_may.feather")
data_june = pd.read_feather(f"{G_DATA}/output_jun.feather")
data_july = pd.read_feather(f"{G_DATA}/output_jul.feather")
data_august = pd.read_feather(f"{G_DATA}/output_aug.feather")
data_september = pd.read_feather(f"{G_DATA}/output_sept.feather")
data_october = pd.read_feather(f"{G_DATA}/output_oct.feather")
data_november = pd.read_feather(f"{G_DATA}/output_nov.feather")

4351948


In [7]:
def tweetCleaner(df):
    """Add a 'cleanedText' column holding the cleaned form of df['text'].

    Each value of the 'text' column is passed through
    ``clean_tweet(..., True)``. The frame is modified in place and also
    returned so that calls can be chained.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        The same ``df`` object, now carrying the 'cleanedText' column.
    """
    df['cleanedText'] = [clean_tweet(tweet, True) for tweet in df['text'].tolist()]
    return df

In [None]:
def _clean_and_save(df, month):
    """Clean one month's tweets and write them to a feather file in L_DATA."""
    # tweetCleaner adds the 'cleanedText' column to df in place, so the
    # frame itself is written out rather than the call's return value.
    tweetCleaner(df)
    df.reset_index(drop=True).to_feather(f"{L_DATA}/output_cleaned_{month}_rohan.feather")


# Month name used in the output file name -> DataFrame to clean.
monthly_frames = {
    "february": data_february,
    "march": data_march,
    "april": data_april,
    "may": data_may,
    "june": data_june,
    "july": data_july,
    "august": data_august,
    "september": data_september,
    "october": data_october,
    "november": data_november,
}

# One worker thread per month. Arguments are passed via `args` so each
# thread is bound to its own frame and month name.
# NOTE(review): the cleaning looks CPU-bound, so threads may not run in
# parallel under the GIL; a process pool may be worth evaluating.
threads = [
    Thread(target=_clean_and_save, args=(frame, month))
    for month, frame in monthly_frames.items()
]

for worker in threads:
    worker.start()

for worker in threads:
    worker.join()