# Creating Features

In this notebook, we create the following features for the Clinton and Trump tweets:
* Topic based on the first hashtag
* Presence of a URL, mention or hashtag
* Days before the election, weekday, hour of the day
* Who is mentioned in a tweet

In [1]:
# Data Structures
import numpy  as np
import pandas as pd
import csv
import pickle

# Corpus Processing
import re
import nltk.corpus
from unidecode                        import unidecode
from nltk.tokenize                    import word_tokenize
from nltk                             import SnowballStemmer

In [2]:
# Display full cell contents (e.g. the whole tweet text) instead of truncating wide columns
pd.set_option('display.max_colwidth', None)

# Clinton Tweets
## Open Data Set With Readability Scores

In [3]:
# Relative path of the Clinton tweets file (with readability scores)
path_clinton = '../Data/tweets_clinton_20151109_20161109_readability.json'

# Parse the column-oriented JSON document into a DataFrame
with open(path_clinton, encoding="utf8") as clinton_file:
    tweets_clinton = pd.read_json(
        clinton_file,
        orient='columns',
        convert_axes=True,
        dtype=True,
        lines=False,
    )

### Topic based on hash tag

In [4]:
path_categories = '../Topics/topic_categories.csv'

# Build the hashtag -> topic category lookup from the first two CSV columns.
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly. Blank lines (which csv.reader yields as
# empty lists) are skipped instead of raising IndexError.
# NOTE(review): if the file has a header row it ends up as an ordinary entry - confirm.
with open(path_categories, encoding = 'utf8', newline = '') as f:
    reader = csv.reader(f)
    hashtag_cats = {row[0]: row[1] for row in reader if row}

In [5]:
def get_topic(hashtags):
    """Return the topic category of a tweet's first hashtag.

    Parameters
    ----------
    hashtags : sequence
        The hashtags of one tweet; only the first element is looked at.

    Returns
    -------
    str
        ``hashtag_cats[hashtags[0]]``, or 'no topic' when the sequence is empty.

    Raises
    ------
    KeyError
        If the first hashtag has no entry in ``hashtag_cats``.
    """
    # Guard clause: tweets without hashtags have no topic
    if len(hashtags) == 0:
        return 'no topic'

    first_hashtag = hashtags[0]
    return hashtag_cats[first_hashtag]

In [6]:
# Label every Clinton tweet with the category of its first hashtag
tweets_clinton['topic_hashtag'] = tweets_clinton['hashtags'].apply(get_topic)

tweets_clinton['topic_hashtag']

0            no topic
1            no topic
2            no topic
3            no topic
4            no topic
            ...      
5997         no topic
5998         no topic
5999         no topic
6000    miscellaneous
6001         no topic
Name: topic_hashtag, Length: 6002, dtype: object

## Presence of Url, Mention, Hash Tag

In [7]:
# Matches URLs that start with http(s)://, www. or a bare domain followed by a slash (unchanged).
url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
# A mention is an @handle at the very start of the text or directly after whitespace or one
# of the listed punctuation characters; group 1 is the handle including the '@'.
# The hyphen is escaped on purpose: written as `\.-–` it formed a character range from '.'
# to the en dash, which let letters and digits count as the preceding character, so that
# e-mail addresses such as someone@example.com were reported as mentions.
mention_pattern = r"(?:^|[\s.\-–()^=+\[\{\]\}|\\:;\"',<>\/?~`])([@][\w_-]+)"
# A hashtag is a #tag at the very start of the text or directly after a non-word character;
# group 1 is the tag including the '#'.
hashtag_pattern = r"(?:^|\W)([#][\w_-]+)"

In [8]:
# Template of the presence flags: every flag starts out as False
presence_urlmentionhashtag = dict.fromkeys(
    ('presence url', 'presence mention', 'presence hashtag'),
    False,
)

In [9]:
def find_url_mention_hashtag(tweet):
    """Flag whether a tweet text contains a URL, a mention and/or a hashtag.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Anything that is not a string (e.g. NaN for a missing
        text) is treated as containing none of the three features instead of
        making ``re.search`` raise a TypeError.

    Returns
    -------
    pandas.Series
        A copy of ``presence_urlmentionhashtag`` in which 'presence url',
        'presence mention' and 'presence hashtag' are set to True when
        ``url_pattern``, ``mention_pattern`` or ``hashtag_pattern`` is found.
    """
    presence = presence_urlmentionhashtag.copy()

    if not isinstance(tweet, str):
        return pd.Series(presence)

    # Output column -> regular expression that switches it on
    patterns = (
        ('presence url', url_pattern),
        ('presence mention', mention_pattern),
        ('presence hashtag', hashtag_pattern),
    )
    for column, pattern in patterns:
        if re.search(pattern, tweet):
            presence[column] = True

    return pd.Series(presence)

In [10]:
# One row of url / mention / hashtag flags per Clinton tweet
tweets_clinton_presence = tweets_clinton.apply(
    lambda row: find_url_mention_hashtag(row['text']),
    axis = 1,
)

In [11]:
# Spot-check five rows; the fixed random_state makes the sample repeatable
tweets_clinton_presence.sample(n = 5, random_state = 75)

Unnamed: 0,presence url,presence mention,presence hashtag
1027,True,False,True
5138,True,False,False
2814,True,True,False
365,True,False,False
3347,False,False,False


In [12]:
# Attach the presence flags to the tweets: left join on the row index of both frames.
# NOTE: not idempotent - running this cell again on the already merged frame makes
# pandas add '_x' / '_y' suffixed duplicates of the flag columns.
tweets_clinton = tweets_clinton.merge(tweets_clinton_presence, how = 'left', left_index = True, right_index = True)

## Features Related to Date and Time
### Days Before Election

In [13]:
election_date = pd.Timestamp('2016-11-08T12')

def to_days_before_election(time_stamp):
    """Return the signed number of calendar days between a tweet and election day.

    Both the tweet time and ``election_date`` are floored to midnight before
    subtracting, so every tweet posted on November 8, 2016 gives 0, every tweet
    posted on November 7, 2016 gives -1, and so on. Subtracting the raw
    timestamps from the noon reference made the result depend on the time of
    day (a tweet on the morning of election day came out as -1).

    Parameters
    ----------
    time_stamp : pandas.Timestamp or datetime-like
        Creation time of the tweet.

    Returns
    -------
    int
        Negative before election day, 0 on election day, positive afterwards.
    """
    tweet_day = pd.Timestamp(time_stamp).floor('D')
    election_day = election_date.floor('D')
    return (tweet_day - election_day).days

In [14]:
# 'days before election' is the signed day offset of a tweet relative to the election:
# November 8, 2016 is meant to be encoded as 0,
# the day before it (November 7, 2016) as -1, etc
tweets_clinton['days before election'] = tweets_clinton['created_at'].apply(to_days_before_election)

### Day of the Week

In [15]:
# Template row: one boolean indicator per weekday, all switched off
days_of_the_week = dict.fromkeys(
    (
        'day monday',
        'day tuesday',
        'day wednesday',
        'day thursday',
        'day friday',
        'day saturday',
        'day sunday',
    ),
    False,
)

def day_of_week(timestamp):
    """One-hot encode the weekday of a timestamp.

    Parameters
    ----------
    timestamp : pandas.Timestamp
        Object exposing ``dayofweek`` (0 = Monday ... 6 = Sunday).

    Returns
    -------
    pandas.Series or None
        A copy of ``days_of_the_week`` with the matching weekday set to True.
        If ``dayofweek`` is none of 0..6, all flags stay False. None is
        returned when a ValueError is raised while building the result.
    """
    # Weekday number -> indicator column
    column_by_weekday = {
        0: 'day monday',
        1: 'day tuesday',
        2: 'day wednesday',
        3: 'day thursday',
        4: 'day friday',
        5: 'day saturday',
        6: 'day sunday',
    }

    try:
        flags = days_of_the_week.copy()
        column = column_by_weekday.get(timestamp.dayofweek)

        if column is not None:
            flags[column] = True

        return pd.Series(flags)
    except ValueError:
        return None
In [16]:
# One row of weekday indicator flags per Clinton tweet
tweets_clinton_day_of_week = tweets_clinton.apply(
    lambda row: day_of_week(row['created_at']),
    axis = 1,
)

In [17]:
# Spot-check five rows; the fixed random_state makes the sample repeatable
tweets_clinton_day_of_week.sample(n = 5, random_state = 75)

Unnamed: 0,day monday,day tuesday,day wednesday,day thursday,day friday,day saturday,day sunday
1027,False,False,False,True,False,False,False
5138,False,False,True,False,False,False,False
2814,True,False,False,False,False,False,False
365,False,True,False,False,False,False,False
3347,False,False,False,False,True,False,False


In [18]:
# Merge 'day of the week' dataframe with tweets dataframe (left join on the row index).
# NOTE: not idempotent - running this cell again on the already merged frame makes
# pandas add '_x' / '_y' suffixed duplicates of the weekday columns.
tweets_clinton = tweets_clinton.merge(tweets_clinton_day_of_week, how = 'left', left_index = True, right_index = True)

### Hour of the Day

In [19]:
# Hour component of every tweet's creation time
tweets_clinton['hour of the day'] = tweets_clinton['created_at'].apply(lambda created: created.hour)

## Mentioned in Tweet

In [20]:
# drops every token that appears in listOfWords (ie. stopwords) from a tokenized list.
def removeWords(listOfTokens, listOfWords):
    """Return the tokens of ``listOfTokens`` that are not in ``listOfWords``, in order."""
    kept = []
    for token in listOfTokens:
        if token in listOfWords:
            continue
        kept.append(token)
    return kept

# stems every word of a tokenized list
def applyStemming(listOfTokens, stemmer):
    """Return ``stemmer.stem(token)`` for every token, in the original order."""
    stems = []
    for token in listOfTokens:
        stems.append(stemmer.stem(token))
    return stems

# collects the words of at most 2 characters (nothing is removed here;
# the caller uses the returned list to filter them out afterwards)
def twoLetters(listOfTokens):
    """Return the tokens whose length is 2 or less, in the original order."""
    return [token for token in listOfTokens if len(token) <= 2]

In [21]:
def processCorpus(corpus, language):
    """Clean, tokenize, filter and stem every document of a corpus, in place.

    Parameters
    ----------
    corpus : list of str
        The documents. Each string is REPLACED by its list of stemmed tokens,
        so the list passed in is modified.
    language : str
        Language name handed to the NLTK stopword list and ``SnowballStemmer``.

    Returns
    -------
    list of list of str
        The same list object as ``corpus``, now holding the token lists.
    """
    # A set answers membership like the list does for string tokens,
    # without scanning all stopwords for every token
    stopwords = set(nltk.corpus.stopwords.words(language))
    param_stemmer = SnowballStemmer(language)

    # enumerate supplies the position directly; looking it up with
    # corpus.index(document) rescanned the list for every single document
    for index, document in enumerate(corpus):
        text = document.replace(u'\ufffd', '8')   # Replaces the Unicode replacement character U+FFFD with '8'
        text = text.replace(',', '')              # Removes commas
        text = text.rstrip('\n')                  # Removes trailing line breaks only
        text = text.casefold()                    # Makes all letters lowercase

        # Raw strings keep the patterns identical while avoiding invalid escape sequences.
        # NOTE(review): r'\W_' only matches a non-word character directly followed by '_';
        # if "strip all special characters" was intended the pattern would have to be
        # r'[\W_]+' - left as is because that would change the resulting tokens.
        text = re.sub(r'\W_', ' ', text)
        text = re.sub(r"\S*\d\S*", " ", text)     # removes numbers and words concatenated with numbers IE h4ck3r, BR-381
        text = re.sub(r'http\S+', '', text)       # removes URLs with http
        text = re.sub(r'www\S+', '', text)        # removes URLs with www

        listOfTokens = word_tokenize(text)
        twoLetterWord = twoLetters(listOfTokens)  # tokens of at most 2 characters, to be dropped below

        listOfTokens = removeWords(listOfTokens, stopwords)
        listOfTokens = removeWords(listOfTokens, twoLetterWord)

        listOfTokens = applyStemming(listOfTokens, param_stemmer)

        corpus[index] = listOfTokens

    return corpus

In [22]:
# Plain Python list of the tweet texts, as required by processCorpus
corpus_clinton = tweets_clinton['text'].tolist()

In [23]:
# Language used for the stopword list and the stemmer
language = 'english'
# processCorpus overwrites the list it receives and returns that same list, so after
# this call corpus_clinton and corpus_clinton_tokenized are one object holding token lists
corpus_clinton_tokenized = processCorpus(corpus_clinton, language)

In [24]:
# Glue the tokens of every tweet back into one space-separated string
# and pass it through unidecode
corpus_clinton_joined = [
    unidecode(" ".join(tokens))
    for tokens in corpus_clinton_tokenized
]

In [25]:
# Series of the cleaned tweet strings; built from a list, so it gets the default 0..n-1 index
corpus_clinton_series = pd.Series(corpus_clinton_joined)

In [26]:
# Keyword lists used to flag who a tweet talks about. They are compared with the
# whitespace-separated words of the cleaned (casefolded, stemmed) tweets, so every
# entry is lowercase and several look like stemmed spellings (e.g. 'hillari', 'berni').
mention_trump = ['trump', 'donald', 'nevertrump', 'realdonaldtrump']
# NOTE(review): 'noclintoninwheveragain' looks like a misspelled hashtag - confirm against the data
mention_clinton = ['clinton', 'hillari', 'hillary', '-hillari', 'crookedhillari', 'noclintoninwheveragain', 'hillaryclinton']

# Other candidates in the Democratic and Republican primaries
# NOTE(review): common names such as 'paul', 'bush' or 'carson' may also match unrelated people - verify
mention_dem_candidate = ['bernie', 'sanders', 'berni', 'sander']
mention_rep_candidate = ['kasich', 'cruz', 'rubio', 'carson', 'bush', 'gilmore', 'gilmor', 'santorum', 'christie', 'christi', 'fiorina', 'paul', 'huckabee', 'huckabe']

mention_obama = ['barack', 'obama']

In [27]:
# Template of the mention flags: every flag starts out as False
mentions_dict = dict.fromkeys(
    (
        'mention trump',
        'mention clinton',
        'mention dem candidate',
        'mention rep candidate',
        'mention obama',
    ),
    False,
)

In [28]:
def extract_mentions(tokenized_tweet):
    """Flag which people or groups are named in a cleaned tweet.

    Parameters
    ----------
    tokenized_tweet : str
        Cleaned tweet text; it is split on whitespace and every word is
        compared with the ``mention_*`` keyword lists.

    Returns
    -------
    pandas.Series or None
        A copy of ``mentions_dict`` in which a flag is True when at least one
        word of the tweet is in the corresponding keyword list. None is
        returned when a ValueError is raised while building the result.
    """
    # Output column -> keywords that switch it on
    keyword_lists = (
        ('mention trump', mention_trump),
        ('mention clinton', mention_clinton),
        ('mention dem candidate', mention_dem_candidate),
        ('mention rep candidate', mention_rep_candidate),
        ('mention obama', mention_obama),
    )

    try:
        mentions = mentions_dict.copy()
        words = tokenized_tweet.split()

        for column, keywords in keyword_lists:
            if any(word in keywords for word in words):
                mentions[column] = True

        return pd.Series(mentions)

    except ValueError:
        return None

In [29]:
# One row of mention flags per cleaned Clinton tweet
tweets_clinton_mentions = corpus_clinton_series.apply(extract_mentions)

In [30]:
# Spot-check five rows; the fixed random_state makes the sample repeatable
tweets_clinton_mentions.sample(n = 5, random_state = 75)

Unnamed: 0,mention trump,mention clinton,mention dem candidate,mention rep candidate,mention obama
1027,True,False,False,True,False
5138,False,True,False,False,False
2814,False,True,False,False,False
365,False,False,False,False,False
3347,False,False,False,False,False


In [31]:
# Attach the mention flags to the tweets: left join on the row index of both frames.
# The mention rows carry the default 0..n-1 index of corpus_clinton_series, so this
# presumably relies on tweets_clinton having that same positional index - verify.
# NOTE: not idempotent - running this cell again on the already merged frame makes
# pandas add '_x' / '_y' suffixed duplicates of the mention columns.
tweets_clinton = tweets_clinton.merge(tweets_clinton_mentions, how = 'left', left_index = True, right_index = True)

## Save Dataframe

In [32]:
# Output file for the Clinton tweets including the features added above
json_path = '../Data/tweets_clinton_20151109_20161109_readability_extrafeatures.json'

# Save full dataframe as column-oriented JSON
tweets_clinton.to_json(json_path, orient = 'columns')

***

# Trump Tweets
## Open Data Set With Readability Scores and Hash Tags

In [33]:
# Relative path of the Trump tweets file (with readability scores)
path_trump = '../Data/tweets_trump_20151109_20161109_readability.json'

# Parse the column-oriented JSON document into a DataFrame
with open(path_trump, encoding="utf8") as trump_file:
    tweets_trump = pd.read_json(
        trump_file,
        orient='columns',
        convert_axes=True,
        dtype=True,
        lines=False,
    )

In [34]:
# Clean, tokenize and stem the Trump tweets
# (processCorpus overwrites corpus_trump and returns that same list)
corpus_trump = tweets_trump['text'].tolist()
corpus_trump_tokenized = processCorpus(corpus_trump, language)

# Glue the tokens of every tweet back into one space-separated string
# and pass it through unidecode
corpus_trump_joined = [
    unidecode(" ".join(tokens))
    for tokens in corpus_trump_tokenized
]

corpus_trump_series = pd.Series(corpus_trump_joined)

### Topic based on hash tag

In [35]:
# Label every Trump tweet with the category of its first hashtag
tweets_trump['topic_hashtag'] = tweets_trump['hashtags'].apply(get_topic)

In [36]:
# Display the topic assigned to each Trump tweet
tweets_trump['topic_hashtag']

0       no topic
1       no topic
2       no topic
3       no topic
4       no topic
          ...   
4964    no topic
4965    no topic
4966    no topic
4967    no topic
4968       media
Name: topic_hashtag, Length: 4969, dtype: object

## Presence of Url, Mention, Hash Tag

In [37]:
# One row of url / mention / hashtag flags per Trump tweet
tweets_trump_presence = tweets_trump.apply(
    lambda row: find_url_mention_hashtag(row['text']),
    axis = 1,
)

In [38]:
# Attach the presence flags to the tweets: left join on the row index of both frames.
# NOTE: not idempotent - running this cell again on the already merged frame makes
# pandas add '_x' / '_y' suffixed duplicates of the flag columns.
tweets_trump = tweets_trump.merge(tweets_trump_presence, how = 'left', left_index = True, right_index = True)

## Features Related to Date and Time
### Days Before Election

In [39]:
# 'days before election' is the signed day offset of a tweet relative to the election:
# November 8, 2016 is meant to be encoded as 0,
# the day before it (November 7, 2016) as -1, etc
tweets_trump['days before election'] = tweets_trump['created_at'].apply(to_days_before_election)

### Day of the Week

In [40]:
# One row of weekday indicator flags per Trump tweet
tweets_trump_day_of_week = tweets_trump.apply(
    lambda row: day_of_week(row['created_at']),
    axis = 1,
)

In [41]:
# Spot-check five rows; the fixed random_state makes the sample repeatable
tweets_trump_day_of_week.sample(n = 5, random_state = 75)

Unnamed: 0,day monday,day tuesday,day wednesday,day thursday,day friday,day saturday,day sunday
4334,False,False,False,False,True,False,False
1816,False,False,False,False,True,False,False
4049,True,False,False,False,False,False,False
867,False,False,True,False,False,False,False
2149,False,False,False,False,True,False,False


In [42]:
# Merge 'day of the week' dataframe with tweets dataframe (left join on the row index).
# NOTE: not idempotent - running this cell again on the already merged frame makes
# pandas add '_x' / '_y' suffixed duplicates of the weekday columns.
tweets_trump = tweets_trump.merge(tweets_trump_day_of_week, how = 'left', left_index = True, right_index = True)

### Hour of the Day

In [43]:
# Hour component of every tweet's creation time
tweets_trump['hour of the day'] = tweets_trump['created_at'].apply(lambda created: created.hour)

## Mentioned in Tweet

In [44]:
# One row of mention flags per cleaned Trump tweet
tweets_trump_mentions = corpus_trump_series.apply(extract_mentions)

In [45]:
# Attach the mention flags to the tweets: left join on the row index of both frames.
# The mention rows carry the default 0..n-1 index of corpus_trump_series, so this
# presumably relies on tweets_trump having that same positional index - verify.
# NOTE: not idempotent - running this cell again on the already merged frame makes
# pandas add '_x' / '_y' suffixed duplicates of the mention columns.
tweets_trump = tweets_trump.merge(tweets_trump_mentions, how = 'left', left_index = True, right_index = True)

## Save Dataframe

In [46]:
# Output file for the Trump tweets including the features added above
json_path = '../Data/tweets_trump_20151109_20161109_readability_extrafeatures.json'

# Save full dataframe as column-oriented JSON
tweets_trump.to_json(json_path, orient = 'columns')