In [1]:
import pandas as pd
import numpy as np
import nltk

In [3]:
df = pd.read_csv("data/tweets.csv")
df.head()

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",True,timkaine,2016-09-27T23:45:00,,,,False,...,,,,,,,http://twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/t0...,{'media': [{'display_url': 'pic.twitter.com/t0...
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,True,POTUS,2016-09-27T23:26:40,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [{'id_str': '1536791610', 'n...",
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",False,,2016-09-27T23:08:41,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Q3...,{'media': [{'display_url': 'pic.twitter.com/Q3...
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,False,,2016-09-27T22:30:27,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [], 'symbols': [], 'urls': [...",


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6444 entries, 0 to 6443
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       6444 non-null   int64  
 1   handle                   6444 non-null   object 
 2   text                     6444 non-null   object 
 3   is_retweet               6444 non-null   bool   
 4   original_author          722 non-null    object 
 5   time                     6444 non-null   object 
 6   in_reply_to_screen_name  208 non-null    object 
 7   in_reply_to_status_id    202 non-null    float64
 8   in_reply_to_user_id      208 non-null    float64
 9   is_quote_status          6444 non-null   bool   
 10  lang                     6444 non-null   object 
 11  retweet_count            6444 non-null   int64  
 12  favorite_count           6444 non-null   int64  
 13  longitude                12 non-null     float64
 14  latitude                

# Preprocessing

In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import re
from num2words import num2words
from nltk.tokenize import TweetTokenizer
# nltk.download('stopwords') # Find all stopwords
# nltk.download('wordnet') # Used for lemmatisation
# nltk.download('omw-1.4')  # Additional language support 
# nltk.download('averaged_perceptron_tagger_eng')  # For part-of-speech tagging

In [6]:
process_df = df.copy() # Make a copy of the original dataframe

# 1) lowercase
process_df["text"] = process_df["text"].str.lower()

tweets = list(process_df["text"])
stop_words = stopwords.words("english")
# Some extra stop words
stop_words.extend(["ive", "im"])
lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer() # Tokenizes the tweet
filtered_tweets = []

# Function to map NLTK POS tag to WordNet POS
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

for text in tweets:
    text_ls = tknzr.tokenize(text)
    filtered_text = []
    for t in text_ls:
        # 2) Remove Punctuations and Apostrophes
        t = re.sub(r'[^A-Za-z0-9]', '', t)
        # 3) Get rid of stopwords. 4) Removes the links at the end of each tweet. 5) Remove Single Characters.
        if (t not in stop_words) and ("https" not in t) and (len(t) > 1):
            # 6) Turn numbers into words. 7) Lemmatisation.
            if t.isdigit(): # If t is a number
                t = num2words(int(t))
            else:
                lemmatized_t = lemmatizer.lemmatize(t, get_wordnet_pos(t))
            filtered_text.append(t)
    
    filtered_tweets.append(filtered_text)

In [7]:
# Putting everything back together
filtered_tweets_sentence = []
for tweet in filtered_tweets:
    join_sentence = " ".join(tweet)
    filtered_tweets_sentence.append(join_sentence)

process_df["text"] = filtered_tweets_sentence

In [8]:
process_df.head()

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_type,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities
0,780925634159796224,HillaryClinton,question election put plans action make life b...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...
1,780916180899037184,HillaryClinton,last night donald trump said paying taxes smar...,True,timkaine,2016-09-27T23:45:00,,,,False,...,,,,,,,http://twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/t0...,{'media': [{'display_url': 'pic.twitter.com/t0...
2,780911564857761793,HillaryClinton,couldnt proud hillaryclinton vision command la...,True,POTUS,2016-09-27T23:26:40,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [{'id_str': '1536791610', 'n...",
3,780907038650068994,HillaryClinton,stand together theres nothing cant make sure y...,False,,2016-09-27T23:08:41,,,,False,...,,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Q3...,{'media': [{'display_url': 'pic.twitter.com/Q3...
4,780897419462602752,HillaryClinton,candidates asked theyd confront racial injusti...,False,,2016-09-27T22:30:27,,,,False,...,,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [], 'symbols': [], 'urls': [...",


In [9]:
process_df.to_csv("data/processed_tweet.csv", index=False)