# Data Wrangling

## Uploading Modules

In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')
import emoji

## Loading the Data

In [2]:
active19 = pd.read_csv("active19.csv")
active20 = pd.read_csv("active20.csv")
lazy19 = pd.read_csv("lazy19.csv")
lazy20 = pd.read_csv("lazy20.csv")

After a text is obtained, we start with text normalization. Text normalization includes:
- converting all letters to lower or upper case
- converting numbers into words or removing numbers
- removing punctuations, accent marks and other diacritics
- removing white spaces
- expanding abbreviations
- removing stop words, sparse terms, and particular words
- text canonicalization

In [3]:
# Convert text to lowercase
active19['clean_text']=active19.Text.apply(lambda x: x.lower())
active20['clean_text']=active20.Text.apply(lambda x: x.lower())
lazy19['clean_text']=lazy19.Text.apply(lambda x: x.lower())
lazy20['clean_text']=lazy20.Text.apply(lambda x: x.lower())

# Remove numbers
active19['clean_text']=active19['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
active20['clean_text']=active20['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))

# Remove punctuation
translator = str.maketrans('', '', string.punctuation)
active19['clean_text']=active19['clean_text'].apply(lambda x: x.translate(translator))
active20['clean_text']=active20['clean_text'].apply(lambda x: x.translate(translator))
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: x.translate(translator))
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: x.translate(translator))

# Remove whitespaces
active19['clean_text']=active19['clean_text'].apply(lambda x: x.strip())
active20['clean_text']=active20['clean_text'].apply(lambda x: x.strip())
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: x.strip())
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: x.strip())

In [4]:
# set 'SW' as english stopwords from NLTK and count them
SW = set(stopwords.words('english'))
# remove 'not' from set SW
SW.remove('not')
SW.add('amp') # add 'amp' that shows up in tweets after '&' as in '&amp' from set SW
SW.add('»') #left after punctuation removal
SW.add('”') #left after punctuation removal
SW.add('“') #left after punctuation removal
SW.add('’ ') #left after punctuation removal
SW.add('im')
SW.add("I'm")
SW.add('gon na')
print('Number of english stopwords', len(SW))

Number of english stopwords 186


In [5]:
# view english stopwords
print(SW)

{'about', 'whom', 'of', 'each', 'he', 'these', 'haven', 'and', 'as', 'yourself', 'him', 'me', 'yourselves', "mightn't", 'is', 'why', 'we', "won't", 'your', 'hadn', 'theirs', 'has', 'didn', 'don', 'but', "hadn't", "weren't", '»', 'some', 'off', 'they', 'ours', 'had', 'with', 'before', 'further', 'all', '“', 'do', 'only', 'through', "shan't", 'their', 'wasn', 'where', 'his', 'same', 'its', 'himself', 'too', 'yours', 'were', 'very', 'themselves', 'being', 'such', 'now', "it's", 'any', 'hasn', 'to', 'nor', 'd', "aren't", 'gon na', 'if', 'most', 'again', 'our', 'from', 'which', "wasn't", 'then', 'those', 'after', "mustn't", 'can', 'isn', 'should', 'she', 'by', 'ourselves', 'weren', 'so', 'into', "that'll", 'her', 'when', 'did', 'doesn', '’ ', "don't", 'an', 're', 'mustn', 'that', '”', 'are', 'it', 'against', 'amp', 's', 'was', 'what', 'in', 'under', 'here', 'at', 'above', 'during', 'shan', 'down', "doesn't", 'once', 'my', 'wouldn', 'both', 'than', 've', 'hers', 'am', 'm', 'just', 't', 'betw

In [6]:
def remove_stop_words(x):
    words = word_tokenize(x) # make a list of words
    useful_words = [w for w in words if w not in SW]    # remove stopwords
    return (' '.join(useful_words))

In [7]:
# remove stopwords:
active19['clean_text'] = active19['clean_text'].apply(lambda x: remove_stop_words(x))
active20['clean_text'] = active20['clean_text'].apply(lambda x: remove_stop_words(x))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: remove_stop_words(x))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: remove_stop_words(x))

In [8]:
# lemmatize
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer=WordNetLemmatizer()
#lemmatizer.lemmatize(word)

active19['clean_text'] = active19['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))
active20['clean_text'] = active20['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anna_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# convert emoji to text:
active19['clean_text'] = active19['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
active20['clean_text'] = active20['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))

In [10]:
# remove ' and gon na:
active19['clean_text'] = active19['clean_text'].str.replace(r'gon na', 'going')
active20['clean_text'] = active20['clean_text'].str.replace(r'gon na', 'going')
lazy19['clean_text'] = lazy19['clean_text'].str.replace(r'gon na', 'going')
lazy20['clean_text'] = lazy20['clean_text'].str.replace(r'gon na', 'going')

In [11]:
active20.head()

Unnamed: 0,Text,Date,Name,Location,clean_text
0,😂 Hundred percent intentional. Better to ask f...,2020-04-08,prattprattpratt,,face_with_tears_of_joy hundred percent intenti...
1,I know I’m gonna see it! I’m turning my Twitte...,2020-04-07,prattprattpratt,,know ’ going see ’ turning twitter page alanya...
2,"Why am I crying? Oh, it’s just my friend John ...",2020-04-06,prattprattpratt,,crying oh ’ friend john literally making world...
3,That’s gonna open a wormhole GTFOOT!!! https:/...,2020-03-27,prattprattpratt,,’ going open wormhole gtfoot httpstcokufpiwt
4,Sure he was a writer and producer on “Parks an...,2020-03-27,prattprattpratt,,sure writer producer parks rec yes ’ peabody e...


## Files for Future Use

In [12]:
#save dataframes as csv for future analysis
active20.reset_index().to_csv("active20-clean.csv")
active19.reset_index().to_csv("active19-clean.csv")
lazy20.reset_index().to_csv("lazy20-clean.csv")
lazy19.reset_index().to_csv("lazy19-clean.csv")