# Data Wrangling

## Uploading Modules

In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')
import emoji

## Loading the Data

In [2]:
active19 = pd.read_csv("active19.csv")
active20 = pd.read_csv("active20.csv")
lazy19 = pd.read_csv("lazy19.csv")
lazy20 = pd.read_csv("lazy20.csv")

After a text is obtained, we start with text normalization. Text normalization includes:
- converting all letters to lower or upper case
- converting numbers into words or removing numbers
- removing punctuations, accent marks and other diacritics
- removing white spaces
- expanding abbreviations
- removing stop words, sparse terms, and particular words
- text canonicalization

In [3]:
# Convert text to lowercase
active19['clean_text']=active19.Text.apply(lambda x: x.lower())
active20['clean_text']=active20.Text.apply(lambda x: x.lower())
lazy19['clean_text']=lazy19.Text.apply(lambda x: x.lower())
lazy20['clean_text']=lazy20.Text.apply(lambda x: x.lower())

# Remove numbers
active19['clean_text']=active19['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
active20['clean_text']=active20['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))

# Remove punctuation
translator = str.maketrans('', '', string.punctuation)
active19['clean_text']=active19['clean_text'].apply(lambda x: x.translate(translator))
active20['clean_text']=active20['clean_text'].apply(lambda x: x.translate(translator))
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: x.translate(translator))
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: x.translate(translator))

# Remove whitespaces
active19['clean_text']=active19['clean_text'].apply(lambda x: x.strip())
active20['clean_text']=active20['clean_text'].apply(lambda x: x.strip())
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: x.strip())
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: x.strip())

In [4]:
# set 'SW' as english stopwords from NLTK and count them
SW = set(stopwords.words('english'))
# remove 'not' from set SW
SW.remove('not')
# add 'amp' that shows up in tweets after '&' as in '&amp' from set SW
SW.add('amp')
print('Number of english stopwords', len(SW))

Number of english stopwords 179


In [5]:
# view english stopwords
print(SW)

{'were', 'through', 'such', 'to', 'that', 'my', 'has', 'from', 'was', 're', 'didn', 'on', 'have', 'with', 'why', 'ours', 'ourselves', 'how', 'and', 'any', 'up', 'more', 'this', "needn't", 'she', 'a', 'doesn', 'aren', 'once', 'had', 'doing', 'myself', 'yours', 'hers', 'shan', 'yourself', 'each', 'being', 'their', 'until', 'of', 'out', 'd', 'them', 'mustn', 'couldn', 'wouldn', 'needn', 'are', 'will', 'amp', 'into', "she's", 'those', 'over', 'll', 'after', 'wasn', 'herself', 'i', 'down', "aren't", 'weren', 'whom', 'where', 'or', 'other', 'no', "don't", "it's", "wasn't", "couldn't", 'then', 'above', "you'd", 'should', 'itself', 'having', 'we', 'he', 'him', 'your', 'does', 'shouldn', 'while', 'some', 'too', "should've", "weren't", 'nor', 'the', 'both', 'own', 'haven', 'than', "isn't", "won't", "wouldn't", "you'll", 'been', 'who', "shan't", 'won', 'be', 'am', 'between', 'for', 'its', 'only', 'y', 'same', "haven't", 'below', 'do', 'further', 'off', 'her', 'at', 'but', 'you', 'don', 'yourselve

In [6]:
def remove_stop_words(x):
    words = word_tokenize(x) # make a list of words
    useful_words = [w for w in words if w not in SW]    # remove stopwords
    return (' '.join(useful_words))

In [7]:
# remove stopwords:
active19['clean_text'] = active19['clean_text'].apply(lambda x: remove_stop_words(x))
active20['clean_text'] = active20['clean_text'].apply(lambda x: remove_stop_words(x))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: remove_stop_words(x))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: remove_stop_words(x))

In [8]:
# convert emoji to text:
active19['clean_text'] = active19['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
active20['clean_text'] = active20['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))

In [9]:
# leave only alpha:
#active19['clean_text'] = active19['clean_text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))
#active20['clean_text'] = active20['clean_text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))
#lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))
#lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))

In [10]:
active20[0:10]

Unnamed: 0,Text,Date,Name,Location,clean_text
0,😂 Hundred percent intentional. Better to ask f...,2020-04-08,prattprattpratt,,face_with_tears_of_joy hundred percent intenti...
1,I know I’m gonna see it! I’m turning my Twitte...,2020-04-07,prattprattpratt,,know ’ gon na see ’ turning twitter page alany...
2,"Why am I crying? Oh, it’s just my friend John ...",2020-04-06,prattprattpratt,,crying oh ’ friend john literally making world...
3,That’s gonna open a wormhole GTFOOT!!! https:/...,2020-03-27,prattprattpratt,,’ gon na open wormhole gtfoot httpstcokufpiwt
4,Sure he was a writer and producer on “Parks an...,2020-03-27,prattprattpratt,,sure writer producer “ parks rec ” yes ’ peabo...
5,Exciting news✨ #PixarOnward is ON DIGITAL NOW ...,2020-03-20,prattprattpratt,,exciting newssparkles pixaronward digital us a...
6,"I came downstairs, saw Katherine crying her ey...",2020-03-18,prattprattpratt,,came downstairs saw katherine crying eyes thou...
7,Gary Cooper in “The Westerner” https://t.co/mk...,2020-03-16,prattprattpratt,,gary cooper “ westerner ” httpstcomkjifqxmeq
8,I liked that https://t.co/lXGtDzIciV,2020-03-14,prattprattpratt,,liked httpstcolxgtdziciv
9,I’m so proud of my darling on the success of h...,2020-03-13,prattprattpratt,,’ proud darling success book smartly delayed r...


## Files for Future Use

In [11]:
#save dataframes as csv for future analysis
active20.reset_index().to_csv("active20-clean.csv")
active19.reset_index().to_csv("active19-clean.csv")
lazy20.reset_index().to_csv("lazy20-clean.csv")
lazy19.reset_index().to_csv("lazy19-clean.csv")