In [56]:
import pandas as pd
import numpy as np


import re
import string

#NLTK
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

#Date-time
from datetime import datetime
import dateutil.parser as dparser


In [2]:
# Load the raw #DonaldTrump tweet export.
# The path is a raw string because it contains backslash sequences (\C, \P, \h)
# that are not valid escapes in a regular string literal; the resulting path is
# byte-for-byte the same as before, without the invalid-escape warning.
# lineterminator='\n' is kept from the original call (presumably because tweet
# text contains other line-break characters - confirm against the data).
trump_raw = pd.read_csv(r"D:\Codes\PROJECT 2\hashtag_donaldtrump.csv",
                        lineterminator='\n')


In [3]:
# Load the raw #JoeBiden tweet export.
# The path is a raw string because it contains backslash sequences (\C, \P, \h)
# that are not valid escapes in a regular string literal; the resulting path is
# byte-for-byte the same as before, without the invalid-escape warning.
# lineterminator='\n' is kept from the original call (presumably because tweet
# text contains other line-break characters - confirm against the data).
biden_raw = pd.read_csv(r"D:\Codes\PROJECT 2\hashtag_joebiden.csv",
                        lineterminator='\n')
                 

In [18]:
# Build the cleaned Trump-hashtag frame in a single chain:
#   1. drop the columns the author identified as having very many NaN values,
#   2. drop every row that is missing user_name, source or user_location.
trump = (
    trump_raw
    .drop(columns=['user_description', 'long', 'lat', 'city', 'continent',
                   'state', 'state_code'])
    .dropna(subset=['user_name', 'source', 'user_location'])
)

# Report the resulting shape and the number of NaN cells still present.
print(f"This data has (rows x col){trump.shape}")
print(f"total NaN values in this dataFrame: {trump.isna().sum().sum()}")


This data has (rows x col)(675765, 14)
total NaN values in this dataFrame: 233165


In [19]:
# Build the cleaned Biden-hashtag frame in a single chain:
#   1. drop the columns the author identified as having very many NaN values,
#   2. drop every row that is missing user_name, source or user_location.
biden = (
    biden_raw
    .drop(columns=['user_description', 'long', 'lat', 'city', 'continent',
                   'state', 'state_code'])
    .dropna(subset=['user_name', 'source', 'user_location'])
)

# Report the resulting shape and the number of NaN cells still present.
print(f"This data has (rows x col){biden.shape}")
print(f"total NaN values in this dataFrame: {biden.isna().sum().sum()}")

This data has (rows x col)(542952, 14)
total NaN values in this dataFrame: 189262


In [20]:
# Tag every row with the hashtag file it came from ('T' = trump frame,
# 'B' = biden frame) so the two sources can be told apart later.
for frame, label in ((trump, 'T'), (biden, 'B')):
    frame['candidate'] = label

In [21]:
# Peek at the first five free-text user locations.
trump.loc[:, 'user_location'].head(5)

0    Philadelphia, PA / Miami, FL
2                        Portland
4                   Washington DC
5               Perris,California
6                      Powell, TN
Name: user_location, dtype: object

In [50]:
# START HERE for the location dataset: stack the Trump and Biden hashtag frames
# into one DataFrame. pd.concat replaces DataFrame.append, which is deprecated
# since pandas 1.4 and removed in pandas 2.0; with default arguments the result
# is the same (trump rows first, original row labels kept).
tweets_loc = pd.concat([trump, biden])

# A tweet that used both hashtags appears in both files; keep only the first
# occurrence of each tweet_id (the trump copy, since trump rows come first).
tweets_loc = tweets_loc.drop_duplicates(subset='tweet_id')

# Boolean mask selecting the tweets whose country is the USA.
US = tweets_loc['country'] == 'United States of America'

# USA-only subset, taken while the 'country' column still exists.
tweets_USA = tweets_loc[US]

# 'country' is not used past this point, so drop it from both frames.
# reset_index() (without drop=True) renumbers the rows 0..n-1 and keeps the old
# row labels in a new 'index' column.
tweets_loc = tweets_loc.drop(columns=['country']).reset_index()
tweets_USA = tweets_USA.drop(columns=['country']).reset_index()

(301329, 15)

## Preprocessing the tweets

In [53]:
# Load the previously saved token lists (all tweets, and USA-only tweets).
# Raw strings are used because the Windows paths contain backslash sequences
# (\C, \P, \p) that are not valid escapes in a regular string literal; the
# resulting paths are unchanged.
processed = pd.read_csv(r"D:\Codes\PROJECT 2\processed.csv")
processed_USA = pd.read_csv(r"D:\Codes\PROJECT 2\processed_usa.csv")

# Show the column dtypes of the USA file as the cell output.
processed_USA.dtypes

Unnamed: 0     int64
0             object
dtype: object

### ATTENTION: The code below creates the processed CSV files (as written, it saves processed_usa.csv). If you already have the processed files imported, you do not need to run this part of the notebook; leave it as a markdown cell.

def _tweet_nlp_tools():
    """Return the (stemmer, stopword set, tokenizer) triple, built on first use.

    The three objects are created once and cached on the function object so
    that calling process_tweet for every row of a large frame does not rebuild
    them for each tweet.
    """
    tools = getattr(_tweet_nlp_tools, "_cache", None)
    if tools is None:
        tools = (
            PorterStemmer(),
            # A frozenset gives constant-time membership tests.
            frozenset(stopwords.words('english')),
            TweetTokenizer(preserve_case=False, strip_handles=True,
                           reduce_len=True),
        )
        _tweet_nlp_tools._cache = tools
    return tools


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens, with stock tickers, a leading
        "RT", hyperlinks, '#' signs, @handles, English stopwords and
        punctuation tokens removed

    """
    stemmer, stopwords_english, tokenizer = _tweet_nlp_tools()

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" at the start of the tweet
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; \S+ stops at the first whitespace so that any text
    # following the link on the same line is kept (the previous pattern used
    # '.*' and deleted everything up to the end of the line)
    tweet = re.sub(r'https?://\S+', '', tweet)
    # keep hashtag words, only remove the '#' sign itself
    tweet = re.sub(r'#', '', tweet)

    # tokenize: lower-cases, strips @handles and shortens repeated characters
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so the second check is a substring test:
    # a token is dropped when it occurs anywhere inside that string.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean
    
# Run process_tweet over every USA tweet and save the token lists to disk.
# (This is a very resource intensive task. Run at your own risk.)

# Map each tweet's position (0, 1, 2, ...) to its processed token list.
x = dict(enumerate(map(process_tweet, tweets_USA['tweet'])))

# One Series entry per tweet, indexed by position.
s = pd.Series(x)

type(s)

s.to_csv(r"D:\Codes\PROJECT 2\processed_usa.csv")

## Adaptation
+ Create a new column holding the number of days remaining until the election, to gauge the relevance of each tweet


In [54]:

# Whole days remaining between each tweet's creation time and election day.
# The timestamps are parsed in one vectorised call instead of a per-row
# dateutil loop; this is faster and does not depend on the frame having a
# 0..n-1 row index.
election = pd.Timestamp(2020, 11, 3)
created_at = pd.to_datetime(tweets_USA['created_at'])

# .dt.days keeps only the whole-day component, like timedelta.days did before.
countdown = (election - created_at).dt.days
tweets_USA['countdown'] = countdown

In [55]:
# Account age in whole days at the moment each tweet was created.
# Both timestamp columns are parsed in one vectorised call each instead of a
# per-row dateutil loop; this is faster and does not depend on the frame
# having a 0..n-1 row index.
joined = pd.to_datetime(tweets_USA['user_join_date'])
account = pd.to_datetime(tweets_USA['created_at'])

# .dt.days keeps only the whole-day component, like timedelta.days did before.
tweets_USA['age'] = (account - joined).dt.days

# The raw timestamp columns and the old row labels ('index') are no longer
# needed. NOTE: this cell reads the columns it drops, so it cannot be re-run
# without first rebuilding tweets_USA.
tweets_USA = tweets_USA.drop(columns=['created_at', 'index', 'user_join_date'])

In [58]:
# Show the first five rows, including the new countdown and age columns.
tweets_USA.head(n=5)

Unnamed: 0,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_followers_count,user_location,collected_at,candidate,countdown,age
0,1.316529e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666500.0,El Sol Latino News,elsollatinonews,1860.0,"Philadelphia, PA / Miami, FL",2020-10-21 00:00:00,T,18,3340
1,1.316529e+18,"#Trump: As a student I used to hear for years,...",2.0,1.0,Twitter Web App,8436472.0,snarke,snarke,1185.0,Portland,2020-10-21 00:00:00.746433060,T,18,4798
2,1.316529e+18,You get a tie! And you get a tie! #Trump ‘s ra...,4.0,3.0,Twitter for iPhone,47413800.0,Rana Abtar - رنا أبتر,Ranaabtar,5393.0,Washington DC,2020-10-21 00:00:01.492866121,T,18,4139
3,1.316529e+18,@CLady62 Her 15 minutes were over long time ag...,2.0,0.0,Twitter for Android,1138416000.0,Farris Flagg,FarrisFlagg,2363.0,"Perris,California",2020-10-21 00:00:01.866082651,T,18,2812
4,1.316529e+18,@DeeviousDenise @realDonaldTrump @nypost There...,0.0,0.0,Twitter for iPhone,9.007611e+17,Stacey Gulledge 🇺🇸 Patriot ♥️ KAG 🙏 👮‍♀️♥️,sm_gulledge,766.0,"Ohio, USA",2020-10-21 00:00:02.612515712,T,18,1147
