## Pre-processing Notebook

In [1]:
import pandas as pd
import string
import re
import nltk

In [8]:
# Load the raw tweet export; the relative path means 'tweets.csv' must sit in the working directory
df = pd.read_csv('tweets.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [9]:
# Dropping duplicates (rebinding the name rather than mutating the frame in place)
df = df.drop_duplicates()

In [11]:
# Getting tweet text: the 'text' column is the input to the cleaning steps that follow
tweets = df['text']

In [12]:
# Preview of the first 5 tweets
tweets.head(5)

0    lysoling da fuck outta everything bc i aint ca...
1                1 Coronavirus confirmed in Washington
2    Hmmmmm, could the #coronavirus soon be the new...
3    At 11 million people, #Wuhan is larger than Ne...
4    Wuhan is cutting off inflows and outflows of t...
Name: text, dtype: object

In [14]:
# Removing punctuation marks and digits
def remove_punct(text):
    """Strip ASCII punctuation and digit runs from a tweet.

    Parameters
    ----------
    text : object
        The raw tweet. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN) are treated as an empty
        tweet instead of being turned into the literal words "None"/"nan".

    Returns
    -------
    str
        ``text`` without any character from ``string.punctuation`` and
        without any digits.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # A translation table deletes every punctuation character in one pass.
    without_punct = str(text).translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', without_punct)

# Pass the function itself; the wrapping lambda added nothing
tweets1 = tweets.apply(remove_punct)

In [15]:
# Removing emojis
def remove_emoji(x):
    """Return ``x`` with every non-ASCII character dropped (emojis included)."""
    # Code points below 128 are exactly the characters ASCII can encode.
    return "".join(ch for ch in x if ord(ch) < 128)

# Apply the non-ASCII filter to every tweet
tweets2 = tweets1.apply(remove_emoji)

In [16]:
# Removing links
def remove_link(x):
    """Delete every run of non-whitespace characters that starts with 'http'."""
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", x)

# Strip links from every tweet
tweets3 = tweets2.apply(remove_link)

In [17]:
# Removing end of line characters
def replace_n(x):
    """Return ``x`` with each newline character replaced by a single space."""
    pieces = x.split('\n')
    return ' '.join(pieces)

# Flatten every tweet onto a single line
tweets4 = tweets3.apply(replace_n)

In [18]:
# Tokenization of the tweets
def tokenization(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The pieces returned by ``re.split``. A separator at the very start
        or end of ``text`` produces an empty-string token at that position.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.split(r'\W+', text)

# Lower-case each tweet first, then split it into tokens
tweets5 = tweets4.map(str.lower).map(tokenization)

In [19]:
# English stopwords
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand — confirm
sw = nltk.corpus.stopwords.words('english')

In [20]:
# Removing stopwords
def remove_stopwords(text):
    """Return the tokens of ``text`` that do not appear in the module-level ``sw``."""
    kept = []
    for token in text:
        if token in sw:
            continue
        kept.append(token)
    return kept
    
# Filter the stopwords out of every token list
tweets6 = tweets5.apply(remove_stopwords)

In [21]:
# Removing empty strings
def remove_empty_strings(x):
    """Return a new list holding the items of ``x`` that are not equal to ""."""
    return [token for token in x if token != ""]

tweets6 = tweets6.apply(remove_empty_strings)

# After preprocessing the tweets, snapshot of first 20
tweets6.head(20)

0     [lysoling, da, fuck, outta, everything, bc, ai...
1                  [coronavirus, confirmed, washington]
2       [hmmmmm, could, coronavirus, soon, new, plague]
3     [million, people, wuhan, larger, new, york, ci...
4     [wuhan, cutting, inflows, outflows, transport,...
5     [china, quarantines, wuhan, chinese, governmen...
6     [little, evidence, coronavirus, epidemic, cont...
7     [totally, control, one, person, coming, china,...
8                                [got, coronavirus, yo]
9     [received, briefing, today, case, asked, healt...
10    [new, coronavirus, wuhan, chinaand, upcoming, ...
11                          [insert, coronavirus, joke]
12    [question, left, long, zombies, appear, china,...
13                      [ya, okay, thanks, coronavirus]
14    [anyone, else, remember, movie, gwynethpaltrow...
15         [wuhan, announces, transportation, lockdown]
16    [bbcworld, jamestgallagher, endchina, coronavi...
17    [im, building, squad, zombie, apocalypse, 

In [22]:
# Lemmatization
lem = nltk.WordNetLemmatizer()

def lemmatizer(text):
    """Return a list with ``lem.lemmatize`` applied to each token of ``text``."""
    return list(map(lem.lemmatize, text))

# Lemmatized version of the cleaned token lists
tweets7 = tweets6.apply(lemmatizer)

In [23]:
# Stemming
ps = nltk.PorterStemmer()

def stemming(text):
    """Return a list with ``ps.stem`` applied to each token of ``text``."""
    stems = []
    for token in text:
        stems.append(ps.stem(token))
    return stems

# Stemmed version of the cleaned token lists (built from tweets6, not from the lemmatized tweets7)
tweets72 = tweets6.apply(stemming)

In [24]:
# Creating separate day name, month name, day of month, hour, minutes and seconds from one column

from datetime import datetime


def _format_timestamp(x, fmt):
    """Format a POSIX timestamp with the strftime pattern ``fmt``.

    Parameters
    ----------
    x : object
        A value accepted by ``float`` (number or numeric string), the
        literal string 'False', or a missing value.
    fmt : str
        A ``strftime`` format string.

    Returns
    -------
    str
        The formatted component, or the string 'NaN' when ``x`` is the
        string 'False' or is null according to ``pd.isnull``.
    """
    if x == 'False' or pd.isnull(x):
        return 'NaN'
    # fromtimestamp converts to the local timezone of the machine running this.
    return datetime.fromtimestamp(float(x)).strftime(fmt)


def time_change1(x):
    """Full weekday name ("%A") of timestamp ``x``, or 'NaN' if unavailable."""
    return _format_timestamp(x, "%A")


def time_change2(x):
    """Full month name ("%B") of timestamp ``x``, or 'NaN' if unavailable."""
    return _format_timestamp(x, "%B")


def time_change3(x):
    """Zero-padded day of the month ("%d") of timestamp ``x``, or 'NaN' if unavailable."""
    return _format_timestamp(x, "%d")


def time_change4(x):
    """Zero-padded 24-hour hour ("%H") of timestamp ``x``, or 'NaN' if unavailable."""
    return _format_timestamp(x, "%H")


def time_change5(x):
    """Zero-padded minute ("%M") of timestamp ``x``, or 'NaN' if unavailable."""
    return _format_timestamp(x, "%M")


def time_change6(x):
    """Zero-padded second ("%S") of timestamp ``x``, or 'NaN' if unavailable."""
    return _format_timestamp(x, "%S")
    

# Each output column is 'created_at' run through one of the time_change helpers
for column, extractor in [
    ('day', time_change1),
    ('month', time_change2),
    ('date', time_change3),
    ('hour', time_change4),
    ('minutes', time_change5),
    ('seconds', time_change6),
]:
    df[column] = df['created_at'].apply(extractor)

In [25]:
# All columns in the dataset, including the date/time columns derived from 'created_at'
df.columns

Index(['account_created_at', 'account_lang', 'bbox_coords', 'coords_coords',
       'country', 'country_code', 'created_at', 'description',
       'display_text_width', 'ext_media_expanded_url', 'ext_media_t.co',
       'ext_media_type', 'ext_media_url', 'favorite_count', 'favourites_count',
       'followers_count', 'friends_count', 'geo_coords', 'hashtags',
       'is_quote', 'is_retweet', 'lang', 'listed_count', 'location',
       'media_expanded_url', 'media_t.co', 'media_type', 'media_url',
       'mentions_screen_name', 'mentions_user_id', 'name', 'place_full_name',
       'place_name', 'place_type', 'place_url', 'profile_background_url',
       'profile_banner_url', 'profile_expanded_url', 'profile_image_url',
       'profile_url', 'protected', 'quote_count', 'quoted_created_at',
       'quoted_description', 'quoted_favorite_count', 'quoted_followers_count',
       'quoted_friends_count', 'quoted_location', 'quoted_name',
       'quoted_retweet_count', 'quoted_screen_name', 'quo

In [26]:
# Gather the raw text, both normalised token lists, the date/time parts and the place name.
# The token Series are passed as bare arrays (.values), so they are placed by position.
new = pd.DataFrame({
    'tweet': df['text'],
    'lemmatizer': tweets7.values,
    'stemmer': tweets72.values,
    'day': df['day'],
    'month': df['month'],
    'date': df['date'],
    'hour': df['hour'],
    'minutes': df['minutes'],
    'seconds': df['seconds'],
    'location': df['place_name'],
})

In [27]:
# Copy the processed columns from `new` onto df, in the same order as before
for col in ('lemmatizer', 'stemmer', 'day', 'month', 'date', 'hour', 'minutes', 'seconds'):
    df[col] = new[col]

In [29]:
def new_coordinates(x):
    """Collapse a bounding-box string into a single "a, b" midpoint string.

    The text between the first two characters and the last character of
    ``x`` is split on commas. The result is
    ``"<mean of parts 5 and 7>, <mean of parts 0 and 2>"``.

    Parameters
    ----------
    x : object
        The bounding-box cell value.

    Returns
    -------
    str
        The midpoint string, or 'NaN' when ``x`` is missing, is not a
        string, is too short, does not have '-' as its third character, or
        holds fewer than 8 comma-separated parts. The short/non-string/
        too-few-parts cases previously raised IndexError or TypeError.
    """
    # Non-strings (including float NaN for missing cells) carry no box.
    if not isinstance(x, str) or len(x) < 3:
        return 'NaN'
    # NOTE(review): this keeps only boxes whose first value is negative, so a
    # box starting with a non-negative number is discarded — confirm intended.
    if x[2] != '-':
        return 'NaN'
    parts = x[2:-1].split(',')
    if len(parts) < 8:
        return 'NaN'
    # presumably parts 0-3 are longitudes and 4-7 latitudes of the box corners — TODO confirm
    first = (float(parts[5]) + float(parts[7])) / 2
    second = (float(parts[0]) + float(parts[2])) / 2
    return ", ".join([str(first), str(second)])

# Midpoint string for every bounding box
df['coordinates'] = df['bbox_coords'].apply(new_coordinates)

In [30]:
# Persist the enriched frame; with pandas' default settings the index is written as the first column
df.to_csv('tweets_preprocessed.csv')