<h2>Appendix 8 - Location Inference</h2>

This program cross-references tweet location fields with a gazetteer (geographical dictionary) to assign a state abbreviation to each tweet where possible. The need for this information, the contents of the gazetteer, and the reasoning for choosing this rule-based approach over a machine learning system are included in the Assigning Location section of our study.

In [1]:
import pandas as pd
import numpy as np
import string 

In [2]:
# Read in gazetteer
states = pd.read_excel("states_cities.xlsx")

# Blank cells are read as NaN, which has no string methods, so they are turned
# into empty strings before the upper-casing and splitting below.
# fillna + reassignment is used instead of Series.replace(np.NaN, ..., inplace=True):
# an in-place edit made through attribute access is not guaranteed to reach the
# parent frame, and the np.NaN alias no longer exists in NumPy 2.
for column in ("cities_abbreviated", "USA"):
    states[column] = states[column].fillna("")

# Convert the first five columns to upper case
for column in states.columns[:5]:
    states[column] = states[column].map(str.upper)

# Cities in the gazetteer come in comma-separated lists for each state
# Each of these lists is recognised by Python as one String
# Convert each String into a list, to be iterated over later.
# Entries are stripped of surrounding whitespace and blank entries are dropped:
# a blank entry would otherwise count as a substring of every tweet location.
# Whole-column assignment is used rather than writing lists into cells one by one.
for column in ("cities", "cities_abbreviated"):
    states[column] = states[column].map(
        lambda cell: [entry.strip() for entry in cell.split(",") if entry.strip()]
    )

In [3]:
# Import processed twitter data set.
# The matching code later in this file reads the "location" column of this
# frame by row label.
tweet_locations = pd.read_excel("2sotu_processed.xlsx")

In [4]:
# Add empty column to imported tweet fields, to be populated with matched states later.
# The empty string is the "no state assigned yet" marker that the matching
# loop tests for after each check.
tweet_locations["state"] = ""

In [5]:
# Checks for tokens in the tweet location field that match US state abbreviations, with some exceptions.
def check_abbreviations(n):
    """Assign a state to tweet ``n`` if its location contains a state abbreviation.

    The location is split into whitespace-separated tokens and compared token
    by token, so an abbreviation embedded in a longer word is not a match
    (e.g. "woodlands" does not return "LA"). Abbreviations listed in
    ``false_flags`` are never matched.

    Reads the module-level ``states`` gazetteer, ``false_flags`` and
    ``tweet_locations``. Writes the first matching abbreviation, in gazetteer
    row order, to ``tweet_locations.at[n, "state"]``. The row is left untouched
    when nothing matches or when the location is not a string.

    NOTE(review): matching is case-sensitive and the gazetteer is upper-cased
    when loaded, so locations are presumably upper case already - confirm.

    Args:
        n: Row label of the tweet in ``tweet_locations``.
    """
    # Read the location from the row itself instead of relying on a
    # module-level loop variable happening to hold the right value.
    location = tweet_locations.at[n, "location"]
    if not isinstance(location, str):
        return

    # split() with no argument also separates on tabs, newlines and repeated
    # spaces; a set makes each membership test a constant-time lookup.
    tokens = set(location.split())

    for abbreviation in states["state"]:
        if abbreviation not in false_flags and abbreviation in tokens:
            tweet_locations.at[n, "state"] = abbreviation
            break

# List of state abbreviations that are also two-letter words, to be ignored by check_abbreviations()
# "LA" is also omitted, to avoid mismatches between Louisiana and Los Angeles
false_flags = ["HI","IN","ME","OH","OK","OR","LA"]

# Checks for the full name of a state in the tweet location field
def check_full_names(n):
    """Assign a state to tweet ``n`` if its location contains a state's full name.

    Names are matched as substrings of the location. The first match in
    gazetteer row order wins, unless that name is itself contained in a longer
    name that also matched. In that case the shorter name is passed over, so a
    name that only occurs as part of a longer one does not take the match
    purely because of row order.

    Reads the module-level ``states`` gazetteer and ``tweet_locations``. Writes
    the matched state's abbreviation to ``tweet_locations.at[n, "state"]``. The
    row is left untouched when nothing matches or when the location is not a
    string.

    Args:
        n: Row label of the tweet in ``tweet_locations``.
    """
    # Read the location from the row itself instead of relying on a
    # module-level loop variable happening to hold the right value.
    location = tweet_locations.at[n, "location"]
    if not isinstance(location, str):
        return

    # (full name, abbreviation) pairs found in the location, in gazetteer order.
    # Blank names are skipped: an empty string is a substring of everything.
    matches = [
        (full_name, abbreviation)
        for full_name, abbreviation in zip(states["full_name"], states["state"])
        if isinstance(full_name, str) and full_name.strip() and full_name in location
    ]

    for full_name, abbreviation in matches:
        # Pass over a name that sits inside a longer matched name; the longest
        # match is never passed over, so any match still assigns a state.
        if any(full_name != other and full_name in other for other, _ in matches):
            continue
        tweet_locations.at[n, "state"] = abbreviation
        break

# Checks for the full name of a city in the tweet location field
def check_cities(n):
    """Assign a state to tweet ``n`` if its location contains one of the state's cities.

    City names are matched as substrings of the location. States are tried in
    gazetteer row order and the first state with a matching city wins.

    Reads the module-level ``states`` gazetteer (whose "cities" cells are lists
    of city names) and ``tweet_locations``. Writes the matched state's
    abbreviation to ``tweet_locations.at[n, "state"]``. The row is left
    untouched when nothing matches or when the location is not a string.

    Args:
        n: Row label of the tweet in ``tweet_locations``.
    """
    # Read the location from the row itself instead of relying on a
    # module-level loop variable happening to hold the right value.
    location = tweet_locations.at[n, "location"]
    if not isinstance(location, str):
        return

    for abbreviation, cities in zip(states["state"], states["cities"]):
        # Blank entries are ignored: an empty string is a substring of every
        # location and would assign this state to every tweet that gets here.
        if any(
            isinstance(city, str) and city.strip() and city in location
            for city in cities
        ):
            tweet_locations.at[n, "state"] = abbreviation
            break

# Checks for tokens in the tweet location field that match US city abbreviations
def check_city_abbreviations(n):
    """Assign a state to tweet ``n`` if a location token is one of the state's city abbreviations.

    The location is split into whitespace-separated tokens and each city
    abbreviation must equal a whole token. States are tried in gazetteer row
    order and the first state with a matching abbreviation wins.

    Reads the module-level ``states`` gazetteer (whose "cities_abbreviated"
    cells are lists of abbreviations) and ``tweet_locations``. Writes the
    matched state's abbreviation to ``tweet_locations.at[n, "state"]``. The row
    is left untouched when nothing matches or when the location is not a string.

    Args:
        n: Row label of the tweet in ``tweet_locations``.
    """
    # Read the location from the row itself instead of relying on a
    # module-level loop variable happening to hold the right value.
    location = tweet_locations.at[n, "location"]
    if not isinstance(location, str):
        return

    # split() with no argument never yields empty tokens, so an empty
    # abbreviation can no longer match a location with repeated spaces.
    tokens = set(location.split())

    for abbreviation, city_abbreviations in zip(states["state"], states["cities_abbreviated"]):
        # Surrounding whitespace is stripped because a token cannot contain any.
        candidates = (candidate.strip() for candidate in city_abbreviations)
        # Disregard empty city abbreviation lists, written as "-" in the
        # original gazetteer file, and blank entries left by empty cells.
        if any(candidate in tokens for candidate in candidates if candidate not in ("", "-")):
            tweet_locations.at[n, "state"] = abbreviation
            break

# Checks for terms for the United States in the tweet location field
def check_USA_terms(n):
    """Mark tweet ``n`` as "USA" if its location contains a term for the United States.

    Every non-blank entry of the gazetteer's "USA" column is tried as a
    substring of the location. The previous fixed count of five rows is gone,
    so a shorter gazetteer no longer raises and later terms are not ignored.

    Reads the module-level ``states`` gazetteer and ``tweet_locations``. Writes
    the literal "USA" to ``tweet_locations.at[n, "state"]`` on a match. The row
    is left untouched when nothing matches or when the location is not a string.

    Args:
        n: Row label of the tweet in ``tweet_locations``.
    """
    # Read the location from the row itself instead of relying on a
    # module-level loop variable happening to hold the right value.
    location = tweet_locations.at[n, "location"]
    if not isinstance(location, str):
        return

    # Blank cells are skipped: an empty string is a substring of every location
    # and would label every remaining tweet as "USA".
    if any(
        isinstance(term, str) and term.strip() and term in location
        for term in states["USA"]
    ):
        tweet_locations.at[n, "state"] = "USA"

# Iterate through tweet location fields, calling the check functions to assign a matching state where possible.
# The checks run in the order listed below; the first one that fills in the
# "state" field ends the search for that tweet, reducing function calls.
location_checks = (
    check_abbreviations,
    check_full_names,
    check_cities,
    check_city_abbreviations,
    check_USA_terms,
)

# Iterating over the index (rather than range(len(...))) passes real row labels
# to .at and to the check functions.
for row in tweet_locations.index:
    # Bound at module level so that code reading the global `tweet_location`
    # sees the current row's value.
    tweet_location = tweet_locations.at[row, "location"]

    # Skip missing locations (non-string cells) and empty strings
    if not isinstance(tweet_location, str) or tweet_location == "":
        continue

    for check in location_checks:
        check(row)
        if tweet_locations.at[row, "state"] != "":
            break


In [6]:
# Save twitter location fields and new state field information.
# The context manager writes and closes the workbook when the block exits; it
# replaces the explicit writer.save() call, which was removed in pandas 2.0.
# The sheet name is passed by keyword because passing it positionally is deprecated.
with pd.ExcelWriter('03sotu_with_states.xlsx') as writer:
    tweet_locations.to_excel(writer, sheet_name='Sheet1')