# Data Wrangling

## Uploading Modules

In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')
import emoji

## Loading the Data

In [2]:
active19 = pd.read_csv("active19.csv")
active20 = pd.read_csv("active20.csv")
lazy19 = pd.read_csv("lazy19.csv")
lazy20 = pd.read_csv("lazy20.csv")

After a text is obtained, we start with text normalization. Text normalization includes:
- converting all letters to lower or upper case
- converting numbers into words or removing numbers
- removing punctuations, accent marks and other diacritics
- removing white spaces
- expanding abbreviations
- removing stop words, sparse terms, and particular words
- text canonicalization

In [3]:
# Convert text to lowercase
active19['clean_text']=active19.Text.apply(lambda x: x.lower())
active20['clean_text']=active20.Text.apply(lambda x: x.lower())
lazy19['clean_text']=lazy19.Text.apply(lambda x: x.lower())
lazy20['clean_text']=lazy20.Text.apply(lambda x: x.lower())

# Remove numbers
active19['clean_text']=active19['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
active20['clean_text']=active20['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: re.sub(r'\d+', '', x))

# Remove punctuation
translator = str.maketrans('', '', string.punctuation)
active19['clean_text']=active19['clean_text'].apply(lambda x: x.translate(translator))
active20['clean_text']=active20['clean_text'].apply(lambda x: x.translate(translator))
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: x.translate(translator))
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: x.translate(translator))

# Remove whitespaces
active19['clean_text']=active19['clean_text'].apply(lambda x: x.strip())
active20['clean_text']=active20['clean_text'].apply(lambda x: x.strip())
lazy19['clean_text']=lazy19['clean_text'].apply(lambda x: x.strip())
lazy20['clean_text']=lazy20['clean_text'].apply(lambda x: x.strip())

In [4]:
# set 'SW' as english stopwords from NLTK and count them
SW = set(stopwords.words('english'))
# remove 'not' from set SW
SW.remove('not')
SW.add('amp') # add 'amp' that shows up in tweets after '&' as in '&amp' from set SW
SW.add('»') #left after punctuation removal
SW.add('”') #left after punctuation removal
SW.add('“') #left after punctuation removal
SW.add('’ ') #left after punctuation removal
SW.add('im')
SW.add("I'm")
SW.add('gon na')
print('Number of english stopwords', len(SW))

Number of english stopwords 186


In [5]:
# view english stopwords
print(SW)

{'but', 'his', 'there', '»', 'to', 'that', 'or', "you'd", 'isn', 'out', 'than', 'very', "won't", 'her', 'most', 'll', 'are', 'did', 'over', "wouldn't", 'yourself', 'again', 'after', 'under', 'just', 'gon na', 'at', 'who', "that'll", "you've", 've', 'weren', 'more', 'too', 'those', 'and', 'which', "doesn't", 'haven', 'other', 'during', '’ ', 'she', 'what', "she's", "it's", "mustn't", 'do', 'we', 'against', "you're", 'him', 'above', 'themselves', 'up', 'both', 'been', 'they', "shan't", "shouldn't", 'amp', 'is', 'can', 'being', 'needn', 'off', 'm', 'wasn', "hasn't", 'mightn', 'myself', "needn't", 'y', 'ma', 'me', 'where', 'o', 'down', 'mustn', 'does', 'should', 'as', "mightn't", 'a', 'same', 'doesn', 'shan', '“', 'own', 'in', "wasn't", 'ours', 'with', 'won', "I'm", 'have', 'its', 'until', 't', 'for', "don't", 'each', 'was', 's', "you'll", 'you', 'into', 'it', 'before', 'few', 'has', 'don', "isn't", 'im', 'then', 'the', 'whom', 'through', 'were', 'so', 'further', "couldn't", 'because', 'al

In [6]:
def remove_stop_words(x):
    words = word_tokenize(x) # make a list of words
    useful_words = [w for w in words if w not in SW]    # remove stopwords
    return (' '.join(useful_words))

In [7]:
# remove stopwords:
active19['clean_text'] = active19['clean_text'].apply(lambda x: remove_stop_words(x))
active20['clean_text'] = active20['clean_text'].apply(lambda x: remove_stop_words(x))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: remove_stop_words(x))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: remove_stop_words(x))

In [8]:
# lemmatize
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer=WordNetLemmatizer()
#lemmatizer.lemmatize(word)

active19['clean_text'] = active19['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))
active20['clean_text'] = active20['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: lemmatizer.lemmatize(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anna_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# convert emoji to text:
active19['clean_text'] = active19['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
active20['clean_text'] = active20['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
lazy19['clean_text'] = lazy19['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))
lazy20['clean_text'] = lazy20['clean_text'].apply(lambda x: emoji.demojize(x, use_aliases=False, delimiters=('', '')))

In [10]:
# remove ' and gon na:
active19['clean_text'] = active19['clean_text'].str.replace(r'gon na', 'going')
active20['clean_text'] = active20['clean_text'].str.replace(r'gon na', 'going')
lazy19['clean_text'] = lazy19['clean_text'].str.replace(r'gon na', 'going')
lazy20['clean_text'] = lazy20['clean_text'].str.replace(r'gon na', 'going')

active19['clean_text'] = active19['clean_text'].str.replace(r'’ ', '')
active20['clean_text'] = active20['clean_text'].str.replace(r'’ ', '')
lazy19['clean_text'] = lazy19['clean_text'].str.replace(r'’ ', '')
lazy20['clean_text'] = lazy20['clean_text'].str.replace(r'’ ', '')

In [11]:
lazy20.head()

Unnamed: 0,Text,Date,Name,Location,search terms,clean_text
0,Look honors bio did too but tbh I didn’t reall...,2020-04-11,HappyFeminist,"New Jersey, USA",homebody,look honors bio tbh really pay attention
1,IT. IS. A. VIRUS. \n\nOur high school sex ed c...,2020-04-11,HappyFeminist,"New Jersey, USA",homebody,virus high school sex ed class taught us diffe...
2,.@NickBollettieri webinar @MyUTR is absolutely...,2020-04-09,Riske4rewards,,homebody,nickbollettieri webinar myutr absolutely price...
3,Join me tomorrow at 2pm PT for my live UTR All...,2020-04-06,Riske4rewards,,homebody,join tomorrow pm pt live utr access series web...
4,my current fav hobby is browsing https://t.co/...,2020-04-05,Riske4rewards,,homebody,current fav hobby browsing httpstcovewmjay hea...


## Files for Future Use

In [12]:
#save dataframes as csv for future analysis
active20.reset_index().to_csv("active20-clean.csv")
active19.reset_index().to_csv("active19-clean.csv")
lazy20.reset_index().to_csv("lazy20-clean.csv")
lazy19.reset_index().to_csv("lazy19-clean.csv")