# Data Wrangling

## Importing Modules

In [94]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK data packages used below: the stop-word corpus and the
# 'punkt' tokenizer data (presumably what word_tokenize relies on).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anna_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anna_\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

## Loading the Data

In [77]:
# Read each of the four CSV exports into its own DataFrame.
active19, active20, lazy19, lazy20 = map(
    pd.read_csv,
    ("active19.csv", "active20.csv", "lazy19.csv", "lazy20.csv"),
)

Once the text is loaded, we start with text normalization. Text normalization includes:
- converting all letters to lower or upper case
- converting numbers into words or removing numbers
- removing punctuation, accent marks and other diacritics
- removing extra whitespace
- expanding abbreviations
- removing stop words, sparse terms, and particular words
- text canonicalization

In [78]:
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans('', '', string.punctuation)


def _basic_clean(text):
    """Lowercase, drop digit runs, delete punctuation, then trim both ends."""
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(translator)
    return text.strip()


# Derive 'clean_text' from the raw 'Text' column of every dataset.
for frame in (active19, active20, lazy19, lazy20):
    frame['clean_text'] = frame.Text.apply(_basic_clean)

In [61]:
# Stop-word set with 'not' excluded.
# The set is built here from the NLTK list so this cell does not depend on SW
# having been defined by an earlier kernel session, and discard() keeps the
# cell safe to re-run (remove() raises KeyError once 'not' is already gone).
# NOTE(review): any later `SW = set(stopwords.words('english'))` starts again
# from the full list; confirm the stop-word removal step sees the reduced set.
SW = set(stopwords.words('english'))
SW.discard('not')
print('Reduced number of stopwords', len(SW))

Reduced number of stopwords 178


In [83]:
# Keep letters only: each run of characters outside A-Z/a-z becomes one space.
non_letters = re.compile(r'[^A-Za-z]+')
for frame in (active19, active20, lazy19, lazy20):
    frame['clean_text'] = frame['clean_text'].apply(
        lambda text: non_letters.sub(' ', text)
    )

In [84]:
# Load NLTK's English stop words, report how many there are, then exclude
# 'not' so that it is not filtered out of the cleaned text.
# This assignment replaces whatever SW held before, so the exclusion has to be
# applied here to still be in effect when stop words are removed.
english_stopwords = set(stopwords.words('english'))
print('Number of english stopwords', len(english_stopwords))
SW = english_stopwords - {'not'}

Number of english stopwords 179


In [85]:
# Print the stop-word set currently held in SW for a visual check.
print(SW)

{'whom', 'with', 'few', 's', 'doing', 'under', 'they', 'll', 'to', 'its', "needn't", 'now', 'shouldn', 'into', 'if', 'mightn', 'after', 't', 'themselves', 'it', 'those', 'wouldn', 'wasn', 'against', 'this', 'than', 'on', 'as', 'nor', 'should', 'you', 'are', 'of', "wouldn't", 'which', "wasn't", 'then', 'other', "it's", 'can', 'ours', 'just', "she's", 'don', 'in', 'myself', 'above', 'how', 'once', 'itself', "couldn't", 'when', 'yourselves', 'your', 'an', 'ourselves', 'these', 'no', 'between', "should've", "didn't", 'she', 'hers', 'himself', 'so', 'before', 'hadn', "you'll", 'o', 'mustn', "mustn't", 'is', 'do', "you've", "hasn't", 'only', 'i', 'be', 'been', 'who', "aren't", 'or', 'has', 'her', 'for', 'why', 'herself', 'shan', 'most', 'had', 'what', 'him', 'isn', 'm', 'does', 'there', "don't", 'ain', 'some', 'yours', "doesn't", 'hasn', 'from', 'did', 'very', 'too', "isn't", 'by', "you'd", 'didn', "hadn't", 'a', 'we', 'below', 'such', 'will', 'have', 'because', 'same', 'couldn', 'ma', 'agai

In [96]:
def remove_stop_words(x):
    """Return `x` with every token found in the global `SW` set removed.

    The text is split with NLTK's `word_tokenize`; the tokens that survive
    are joined back together with single spaces.
    """
    kept = []
    for token in word_tokenize(x):
        if token not in SW:
            kept.append(token)
    return ' '.join(kept)

In [99]:
# Strip stop words from the cleaned text of each dataset.
for frame in (active19, active20, lazy19, lazy20):
    frame['clean_text'] = frame['clean_text'].apply(remove_stop_words)

In [100]:
# Preview the first 50 rows of active20, raw and cleaned text side by side.
active20.head(50)

Unnamed: 0,Text,Date,Name,Location,clean_text
0,😂 Hundred percent intentional. Better to ask f...,2020-04-08,prattprattpratt,,hundred percent intentional better ask forgive...
1,I know I’m gonna see it! I’m turning my Twitte...,2020-04-07,prattprattpratt,,know gon na see turning twitter page alanyangs...
2,"Why am I crying? Oh, it’s just my friend John ...",2020-04-06,prattprattpratt,,crying oh friend john literally making world b...
3,That’s gonna open a wormhole GTFOOT!!! https:/...,2020-03-27,prattprattpratt,,gon na open wormhole gtfoot httpstcokufpiwt
4,Sure he was a writer and producer on “Parks an...,2020-03-27,prattprattpratt,,sure writer producer parks rec yes peabody emm...
5,Exciting news✨ #PixarOnward is ON DIGITAL NOW ...,2020-03-20,prattprattpratt,,exciting news pixaronward digital us available...
6,"I came downstairs, saw Katherine crying her ey...",2020-03-18,prattprattpratt,,came downstairs saw katherine crying eyes thou...
7,Gary Cooper in “The Westerner” https://t.co/mk...,2020-03-16,prattprattpratt,,gary cooper westerner httpstcomkjifqxmeq
8,I liked that https://t.co/lXGtDzIciV,2020-03-14,prattprattpratt,,liked httpstcolxgtdziciv
9,I’m so proud of my darling on the success of h...,2020-03-13,prattprattpratt,,proud darling success book smartly delayed res...


## Files for Future Use

In [101]:
# Persist the cleaned frames as CSV for later analysis.
# NOTE(review): reset_index() adds the old index as a column and to_csv()
# also writes the row index by default, so each file carries both - confirm
# that downstream readers expect this layout.
for frame, path in (
    (active20, "active20-clean.csv"),
    (active19, "active19-clean.csv"),
    (lazy20, "lazy20-clean.csv"),
    (lazy19, "lazy19-clean.csv"),
):
    frame.reset_index().to_csv(path)