In [9]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

In [96]:
# Read the tweet CSV and cast the `text` column to unicode strings in one
# chained expression; the cleaning cells below call string/regex functions
# on every value of that column.
data = pd.read_csv('cuomo_tweets.csv').assign(
    text=lambda frame: frame['text'].astype('U')
)

### Preprocessing Steps
1. Standardize apostrophes / quotes (curly ’ and ‘ become a straight ')
2. Remove @mentions and web addresses
3. Standardize common abbreviations
    - u.s. --> usa
4. Lowercase
5. Expand contractions
6. Remove possessives
7. Remove special characters and numbers
8. Remove stopwords
9. Remove extra spacing / whitespace created by text cleaning

In [97]:
# Create the working `clean` column from the raw text, converting both curly
# single quotes (’ and ‘) to the plain ASCII apostrophe in a single pass.
data['clean'] = data['text'].apply(
    lambda tweet: re.sub('‘', '\'', re.sub('’', '\'', tweet))
)

In [98]:
# Get rid of @s and any links
def remove_ats(s):
    """Delete every @mention (an ``@`` followed by word characters) from ``s``.

    Returns the remaining text with leading and trailing whitespace stripped.
    Whitespace that surrounded a removed mention inside the text is left as is.
    """
    mention_pattern = re.compile('@\\w+')
    without_mentions = mention_pattern.sub('', s)
    return without_mentions.strip()

def remove_links(s):
    """Delete web addresses from ``s``.

    Any run of non-whitespace characters that starts with ``http`` or
    ``https`` is removed, so both ``http://...`` and ``https://...`` links
    are dropped.

    Returns the remaining text with leading and trailing whitespace stripped.
    """
    # `s?` makes the trailing "s" optional so plain-http links are caught too.
    return re.sub(r'https?\S*', '', s).strip()

# Strip @mentions first, then links, from every tweet in the working column.
data['clean'] = data['clean'].apply(remove_ats).apply(remove_links)

In [99]:
# Helper function
def replace_words(text, replace_dict):
    """Replace whitespace-separated tokens of ``text`` using ``replace_dict``.

    Parameters
    ----------
    text : str
        Input string; it is split on any whitespace.
    replace_dict : dict
        Maps a token (case-sensitive) to its replacement string.

    Returns
    -------
    str
        The tokens joined by single spaces, with every matching token replaced.

    A token that equals a key is replaced directly. Otherwise punctuation
    attached to the edges of the token is peeled off and the lookup retried,
    then the punctuation is put back around the replacement. This lets
    ``"can't,"`` or ``"(U.S.)"`` match the keys ``"can't"`` and ``"U.S."``.
    Tokens that match no key are returned unchanged.
    """
    # Apostrophes are deliberately absent: keys such as "'cause" start with one.
    edge_punct = '.,!?;:"()[]{}'

    tokens = []
    for w in text.split():
        if w in replace_dict:
            tokens.append(replace_dict[w])
            continue

        core = w.lstrip(edge_punct)
        lead = w[:len(w) - len(core)]
        trail = ''
        word = w
        # Drop one trailing punctuation character at a time so a key that
        # itself ends in punctuation (e.g. "U.S.") is tried before "U.S".
        while core:
            if core in replace_dict:
                word = lead + replace_dict[core] + trail
                break
            if core[-1] not in edge_punct:
                break
            trail = core[-1] + trail
            core = core[:-1]
        tokens.append(word)

    return " ".join(tokens)

In [100]:
## Standardize Common Abbreviations
# Every listed spelling of the abbreviation maps to the same canonical token.
translate_dict = dict.fromkeys(["US", "U.S.", "u.s.", "u.s"], "USA")

data['clean'] = data['clean'].apply(replace_words, args=(translate_dict,))

In [101]:
# Fold every tweet in the working column to lower case.
data['clean'] = data['clean'].map(str.lower)

In [102]:
# Expand Contractions
# Lower-case contraction -> expanded form, one entry per line.
contractions_dict = {
    "ain't": "are not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "that'd": "that would",
    "that'd've": "that would have",
    "there'd": "there would",
    "there'd've": "there would have",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what've": "what have",
    "when've": "when have",
    "where'd": "where did",
    "where've": "where have",
    "who's": "who is",
    "who'll": "who will",
    "who'll've": "who will have",
    "who've": "who have",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

data['clean'] = data['clean'].apply(replace_words, args=(contractions_dict,))

In [103]:
# Possessives: drop 's wherever it ends a word, i.e. before a space,
# punctuation, a newline or the end of the tweet. Anchoring on a word boundary
# (\b) rather than on a literal space or end-of-string means "cuomo's," is
# reduced to "cuomo," instead of later collapsing to "cuomos".
data['clean'] = data['clean'].apply(lambda x: re.sub(r"'s\b", '', x))

In [104]:
# Numbers and Punctuation
# Turn every run of whitespace (newlines, tabs, ...) into one plain space
# first. The character class below keeps only letters and the space character,
# so a newline between two words would otherwise be deleted and glue the
# words together.
data['clean'] = data['clean'].apply(lambda x: re.sub(r'\s+', ' ', x))
# Remove everything that is not an ASCII letter or a space; this covers both
# special characters and digits in a single pass.
data['clean'] = data['clean'].apply(lambda x: re.sub('[^A-Za-z ]+', '', x))

In [105]:
# stopwords
# `stop` stays a list, exactly as returned by nltk, for any later cell that
# uses it. Membership is tested against a set built from it, so each token
# costs a hash lookup instead of a scan of the whole list.
stop = stopwords.words('english')
stop_lookup = set(stop)
data['clean'] = data['clean'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

In [106]:
# extra spacing
# Collapse runs of two or more whitespace characters to a single space, then
# trim both ends. The pattern is a raw string because "\s" inside a normal
# string literal is an invalid escape sequence and triggers a warning.
data['clean'] = data['clean'].apply(lambda x: re.sub(r'\s{2,}', ' ', x).strip())

In [107]:
# Display the frame so the raw `text` column can be compared with the cleaned `clean` column.
data

Unnamed: 0,screen_name,text,created_at,clean
0,CuomoSucks,The 11 accusers are likely to Sue Cuomo for se...,2021-08-13 13:39:11,accusers likely sue cuomo sexual harassment li...
1,CuomoSucks,The specter of criminal charges against Cuomo ...,2021-08-13 13:15:48,specter criminal charges cuomo real county dis...
2,CuomoSucks,Not Out of the Woods. Cuomo’s legal mess is l...,2021-08-13 11:51:02,woods cuomo legal mess likely drag years see t...
3,DrToboggan4,@News12LI How about we ask her about what she ...,2021-08-13 13:39:09,ask knew much knew cuomo behaviors
4,factscontext,"@MarkYoungTruth If you haven’t checked lately,...",2021-08-13 13:39:03,checked lately cuomo resigned party leaders to...
...,...,...,...,...
1746,pbsaurus,@TIMESUPNOW Thanks for propping up predators l...,2021-08-13 06:09:44,thanks propping predators like cuomo party pow...
1747,Jire_,I watched Andrew Cuomo's rebuttal to the accus...,2021-08-13 06:08:30,watched andrew cuomo rebuttal accusations man ...
1748,Louis22364155,@robreiner You mean Cuomo,2021-08-13 06:07:50,mean cuomo
1749,Sandy94677285,Cuomo Has $18 Million in Campaign Cash. What C...,2021-08-13 06:06:50,cuomo million campaign cash via
