# TF-IDF Analysis

##### The purpose of this notebook is to conduct tf-idf analysis to determine what terms appear most frequently for each candidate

In [7]:
import pandas as pd
import json
import nltk
from nltk.tokenize import word_tokenize

Read in master dataframe and separate into two dataframes, one for each city

In [8]:
df = pd.read_csv('all_queries_with_data.csv',index_col=False)
bf = df[df.city=='Boston']
nf = df[df.city=='NYC']

In [9]:
bf.head()

Unnamed: 0,Query,Source,Date Added,(Num Occurances),city,seed_query,candidate,gender
0,jon santiago races 2021,autocomplete,2021-04-27,,Boston,jon santiago race,jon santiago,M
2,john barros wiki,autocomplete,2021-04-27,,Boston,john barros,john barros,M
6,boston mayoral election 2017,autocomplete,2021-04-27,,Boston,,,
26,michelle wu for mayor,autocomplete,2021-04-27,,Boston,michelle wu for mayor,michelle wu,W
31,john barros mayor elections 2018,autocomplete,2021-04-27,,Boston,john barros,john barros,M


Helper functions to calculate tf-idf from Eni's 315 notebook

In [10]:
#tf-idf
def tf(term, doc, normalize=True):
    """Helper function to calculate term frequency.
    """
    doc = doc.lower().split()
    term = term.lower()
    if normalize:
        return doc.count(term) / float(len(doc))
    else:
        return doc.count(term) / 1.0
    
# Test the function

#print(tf('the', corpus['a']))
#print(tf('the', corpus['a'], False))

In [11]:
from math import log

def idf(term, corpus):
    """Helper function to calculate inverse document frequency.
    """
    # Find all queries that contain the term
    num_docs_with_term = len([True for candidate_queries in corpus if term.lower() in candidate_queries.lower()])

    # tf-idf calc involves multiplying against a tf value less than 0, so it's
    # necessary to return a value greater than 1 for consistent scoring.
    # (Multiplying two values less than 1 returns a value less than each of
    # them.)

    try:
        return 1.0 + log(float(len(corpus)) / num_docs_with_term)
    except ZeroDivisionError:
        return 1.0
 
# Testing the function
#print(idf('the', corpus.values()))
#print(idf('a', corpus.values()))

In [12]:
def tf_idf(term, doc, corpus):
    """Helper function to calculate tf-idf score."""
    return tf(term, doc) * idf(term, corpus)

Our documents in this case will be a long string that is all the queries about that candidates joined together with a space.

In [13]:
def make_candidate_docs(df, candidate):
    '''takes dataframe and candidate, returns list of queries about that candidate combined into one long string'''
    cand_queries = df[df['candidate']==candidate]['Query']
    
    return ' '.join(cand_queries)   

In [14]:
#making the documents

boston = {}
for candidate in bf.candidate.unique():
    #we need to ignore the case of the nan, which is from queries that aren't about a candidate, like 'nyc mayoral election'
    if str(candidate) == 'nan': continue
    boston[candidate] = make_candidate_docs(bf, candidate)

nyc = {}
for candidate in nf.candidate.unique():
    if str(candidate) == 'nan': continue
    nyc[candidate] = make_candidate_docs(nf, candidate)

In [15]:
boston['jon santiago'][:150]

'jon santiago races 2021 dr jon santiago massachusetts jon santiago campaign manager jon santiago mayoral election 2019 candidates jon santiago covid 1'

In [16]:
nyc['andrew yang'][:150]

'andrew yang twitter chicken soup andrew yang policies andrew yang early childhood education andrew yang education net worth andrew yang mayor new york'

Next, we need to make the corpus. This will be all the queries for each city combined into one long string. We want this to be city-specific in case there is some difference between city.

In [17]:
#the corpus will be all the queries for each city combined
def make_corpus(city):
    '''takes a city and returns a list of queries from candidates in city, each as one big string'''
    if city== 'Boston': df = bf
    else: df = nf
    doc_list = []
    city = df[df.city==city]
    for candidate in city.candidate.unique():
        doc_list.append(' '.join(city[city.candidate==candidate]['Query']))
    
    return doc_list

bos_corpus = make_corpus('Boston')
nyc_corpus = make_corpus('NYC')

In [21]:
bos_corpus[0][:250]

'jon santiago races 2021 dr jon santiago massachusetts jon santiago campaign manager jon santiago mayoral election 2019 candidates jon santiago covid 19 dr jon santiago boston medical center jon santiago endorsements jon santiago nationality jon santi'

The terms that we want to search are going to be unigrams from all queries, so that we can give all a fair chance and see which ones show up the most, not just looking for gendered words.

In [22]:
#generate a list of unigrams 
from nltk.corpus import stopwords
import string
stop = set(stopwords.words('english') + list(string.punctuation))

In [23]:
#we want to filter out unigrams that are part of candidate names, since these will likely be top queries for almost all cands
#let's make simple string of all candidate names so we can easily check if a unigram is inside it
import numpy as np
cand_names = list(df.candidate.unique())
cand_names.append('isaac wright jr.') #some queries have a period in his name, so we want to exclude that
cand_names.remove(np.nan)
cand_names = ' '.join(cand_names)
cand_names

'jon santiago isaac wright jr john barros scott stringer art chang andrew yang curtis sliwa aaron foldenauer eric adams ray mcguire dianne morales kathryn garcia michelle wu shaun donovan annisa essaibi george dana depelteau andrea campbell paperboy prince kim janey michael bianchi maya wiley fernando mateo isaac wright jr.'

Next let's generate all unigrams from the corpus. We want to exclude all unigrams that are just numbers and that are not part of a candidates name

In [24]:
def make_unigrams(corpus):
    all_docs= ' '.join(corpus)
  #  [all_docs.extend(doc) for doc in corpus]
    unigrams = set(word_tokenize(all_docs))
    return [i for i in unigrams if i not in stop and not i.isnumeric() and i.lower() not in cand_names] #there are some that are just years

bos_unigrams = make_unigrams(bos_corpus)
nyc_unigrams = make_unigrams(nyc_corpus)

In [25]:
print(bos_unigrams[:10])
print(nyc_unigrams[:10])

['worth', 'summary', 'mattapan', 'deal', 'district', 'speech', 'sign', 'manager', 'partnerships', 'endorsements']
['worth', 'images', 'summary', '3rd', 'bitcoin', 'district', 'resigned', 'funds', 'pba', 'stepfather']


In [26]:
print(len(bos_unigrams))
print(len(nyc_unigrams))

444
842


### 1. Final calculations: Boston

To calculate all the final tf-idf scores, we'll iterate through all the candidates and then calculate the tf-idf scores for each unigram in that city's corpus, storing all information in a dictionary

In [27]:
bos_dct = {}
for candidate in bf.candidate.unique():
    if str(candidate) == 'nan': continue
    bos_dct[candidate] = []
    for term in bos_unigrams:
        bos_dct[candidate].append(tf_idf(term, boston[candidate], bos_corpus))

Now we can store this in a dataframe for further analysis, the candidates will be the columns and the terms will be the first column, then the values in each cell the tf-idf score for that term for that candidate.

In [28]:
columns = ['term']
columns.extend(list(bos_dct.keys()))

bos_dct['term'] = bos_unigrams
bos_tfidf = pd.DataFrame(bos_dct,columns=columns)

In [29]:
bos_tfidf

Unnamed: 0,term,jon santiago,john barros,michelle wu,annisa essaibi george,dana depelteau,andrea campbell,kim janey,michael bianchi
0,worth,0.000319,0.000258,0.002112,0.011291,0.001594,0.000523,0.009371,0.000697
1,summary,0.000000,0.000000,0.000253,0.000000,0.000066,0.000144,0.000000,0.000000
2,mattapan,0.000000,0.000000,0.000000,0.000000,0.000000,0.000088,0.000000,0.000000
3,deal,0.000000,0.000000,0.000686,0.000000,0.000000,0.000000,0.000000,0.000000
4,district,0.001118,0.000000,0.000000,0.000000,0.000000,0.000058,0.000000,0.015884
...,...,...,...,...,...,...,...,...,...
439,wikipedia,0.000000,0.063423,0.009660,0.000000,0.000000,0.021055,0.012035,0.000000
440,texas,0.000048,0.000000,0.000000,0.000000,0.000159,0.000000,0.000000,0.000000
441,ballotpedia,0.000061,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
442,views,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.045802,0.000000


Now let's look at which terms have the highest tf-idf scores for each candidate. We'll use the nlargest function in pandas to grab the 20 terms with the highest tf-idf scores for each candidate, storing this in a dictionary

In [30]:
bos_top_dict = {} 
for candidate in bos_tfidf.columns[1:]:
    mini = bos_tfidf.nlargest(20,candidate, 'all')
    bos_top_dict[candidate] = mini['term'].unique()
    print(candidate, ' --> done!')

jon santiago  --> done!
john barros  --> done!
michelle wu  --> done!
annisa essaibi george  --> done!
dana depelteau  --> done!
andrea campbell  --> done!
kim janey  --> done!
michael bianchi  --> done!


In [31]:
bos_top_dict

{'jon santiago': array(['wife', 'alexandra', 'photographer', 'campaign', 'manager',
        'endorsements', 'age', 'high', 'boston', 'mayor', 'md', 'school',
        'globe', 'twitter', 'facebook', 'married', 'representative',
        'state', 'instagram', 'medical'], dtype=object),
 'john barros': array(['wife', 'boston', 'wikipedia', 'political', 'party', 'mayor',
        'restaurant', 'bio', 'website', 'family', 'campaign', 'wiki',
        'high', 'bc', 'age', 'manager', 'globe', 'barroso', 'twitter',
        'massachusetts'], dtype=object),
 'michelle wu': array(['husband', 'family', 'boston', 'mother', 'siblings', 'mayor',
        'live', 'instagram', 'announcement', 'salary', 'accomplishments',
        'child', 'collection', 'signature', 'campaign', 'care', 'policies',
        'twitter', 'staff', 'events'], dtype=object),
 'annisa essaibi george': array(['mayoral', 'election', 'race', 'elections', 'partnership',
        'partners', 'worth', 'background', 'endorsement', 'COVID', '

### 2. Final calculations: NYC

The following code is all the same as above, but just for NYC instead of Boston

In [32]:
nyc_dct = {}
for candidate in nf.candidate.unique():
    if str(candidate) == 'nan': continue
    nyc_dct[candidate] = []
    for term in nyc_unigrams:
        nyc_dct[candidate].append(tf_idf(term, nyc[candidate], nyc_corpus))

In [33]:
columns = ['term']
columns.extend(list(nyc_dct.keys()))

nyc_dct['term'] = nyc_unigrams
nyc_tfidf = pd.DataFrame(nyc_dct,columns=columns)
nyc_tfidf

Unnamed: 0,term,isaac wright jr,scott stringer,art chang,andrew yang,curtis sliwa,aaron foldenauer,eric adams,ray mcguire,dianne morales,kathryn garcia,shaun donovan,paperboy prince,maya wiley,fernando mateo
0,worth,0.006847,0.001358,0.002452,0.051808,0.031965,0.019089,0.002257,0.008338,0.001958,0.0029,0.001947,0.019004,0.023581,0.007239
1,images,0.002414,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.002771,0.000000
2,summary,0.000000,0.000000,0.000000,0.002531,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
3,3rd,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008317,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
4,bitcoin,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.009785,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
837,views,0.000000,0.000000,0.000000,0.000716,0.000677,0.000000,0.006091,0.000000,0.000000,0.0000,0.003823,0.000000,0.000000,0.004737
838,tucker,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000070,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
839,reopening,0.000000,0.000000,0.000000,0.001266,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000
840,apk,0.000000,0.000000,0.025514,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000


In [35]:
nyc_top_dict

{'isaac wright jr': array(['wife', 'daughter', 'life', 'still', 'story', 'happened',
        'settlement', 'lawyer', 'case', 'married', 'attorney', 'sunshine',
        'prosecutor', 'friend', 'book', 'pictures', 'mayor', 'today',
        'jamal', 'released', 'true'], dtype=object),
 'scott stringer': array(['nyc', 'mayor', 'campaign', 'commercial', 'relations', 'york',
        'new', 'news', 'kimball', 'jean', 'comptroller', 'jobs', 'wife',
        'father', 'race', 'mother', 'married', 'kid', 'conference',
        'ethnicity'], dtype=object),
 'art chang': array(['changes', 'changer', 'everything', 'album', 'world', 'change',
        'mayor', 'lives', 'morgan', 'changing', 'apk', 'elections',
        'election', 'windows', 'jp', 'changed', 'nyc', 'driver', 'quotes',
        'mayoral', 'race'], dtype=object),
 'andrew yang': array(['mayor', 'worth', 'net', 'nyc', 'cartoon', 'wife', 'ubi', 'tweet',
        'york', 'israel', 'new', 'basic', 'universal', 'income', 'reddit',
        'twitt