# Estimating Internal Migration

This notebook accompanies the paper ***Using Twitter to track internal migration before and during the COVID-19 pandemic in the UK*** by Yikang Wang, Chen Zhong and Carmen Cabrera-Arnau.

Code segments have been simplified to exclude database queries previously integrated within the code.

## Libraries

In [None]:
import geopandas as gp
import pandas as pd
import numpy as np
from datetime import datetime
from shapely import geometry
import os
import requests
import math
import re
import socket
from tqdm import tqdm

## Twitter Geocoding

### Lookup Table

Twitter place attributes are matched to officially defined UK administrative divisions using a lookup table.

In [None]:
# Some unofficial place names in the Twitter place attributes
lookup_dict = {
    'Bristol': 'Bristol, City of',
    'Dundee': 'Dundee City',
    'Edinburgh': 'City of Edinburgh',
    'Glasgow': 'Glasgow City',
    'Kingston upon Hull': 'Kingston upon Hull, City of',
    'Herefordshire': 'Herefordshire, County of',
    'North East': 'North East (England)',
    'North West': 'North West (England)',
    'South East': 'South East (England)',
    'South West': 'South West (England)',
    'East Midlands': 'East Midlands (England)',
    'Saint Helier': 'Jersey',
    'Sale': 'Trafford',
    'Huddersfield': 'Kirklees',
    'West Bromwich': 'Sandwell',
    'Ashton-under-Lyne': 'Tameside'
}

# UK LAD to LAU to ITL lookup data
# Data source: https://geoportal.statistics.gov.uk/documents/1191f4fc8e06433b9196103eac198d56/about
lookup_df = pd.read_csv('LAD21_LAU121_ITL321_ITL221_ITL121_UK_LU.csv')

# Set of official LAD names.
# This must be defined BEFORE larger_than_LAD_set, which subtracts it; otherwise
# the cell raises NameError on a fresh kernel.
LAD_set = set(lookup_df.LAD21NM)

# Set of places with a scale greater than the LAD scale
larger_than_LAD_set = set.union(set(lookup_df.LAU121NM),
                                set(lookup_df.ITL321NM),
                                set(lookup_df.ITL221NM),
                                set(lookup_df.ITL121NM)) - LAD_set


def _split_compound_name(name):
    """Return the parts of a compound name such as 'A, B and C'.

    Names that do not contain ' and ' are not treated as compound and yield
    an empty list, so callers can iterate over the result unconditionally.
    """
    if ' and ' not in name:
        return []
    return re.split(' and |, ', name)


# Split LAD names to build a place-to-LAD lookup table,
# e.g. each component of 'A, B and C' maps to the full LAD name 'A, B and C'.
# Iterating in sorted order keeps the result deterministic between runs.
LAD_split_dict = {}
for loc in sorted(LAD_set):
    for part in _split_compound_name(loc):
        LAD_split_dict[part] = loc
lookup_dict.update(LAD_split_dict)

# UK ward to LAD lookup data
# Data source: https://geoportal.statistics.gov.uk/documents/ward-to-local-authority-district-december-2021-lookup-in-the-united-kingdom/about
ward_dict = pd.read_csv('WD21_LAD21_UK_LU_provisional.csv').set_index('WD21NM').LAD21NM.to_dict()
lookup_dict.update(ward_dict)

# Split ward names to build a place-to-LAD lookup table
# (each component of a compound ward name maps to that ward's LAD)
ward_split_dict = {}
for loc in ward_dict:
    for part in _split_compound_name(loc):
        ward_split_dict[part] = ward_dict[loc]
lookup_dict.update(ward_split_dict)

# Delete duplicate place names: a key that is itself an official name
# (LAD or larger) must not be redirected to another place.
lookup_dict = {
    name: target
    for name, target in lookup_dict.items()
    if name not in LAD_set and name not in larger_than_LAD_set
}

In [None]:
# Match place names to administrative divisions using the lookup table
def match(location):
    """Classify a place name against the lookup tables.

    Returns 1 when the name is an LAD name, 2 when it is the name of a
    division larger than an LAD, and 0 when it cannot be matched. Names found
    in ``lookup_dict`` are classified by the name they are mapped to.
    """
    if location in LAD_set:
        return 1  # matched on LAD scale
    if location in larger_than_LAD_set:
        return 2  # matched on a scale greater than LAD
    if location not in lookup_dict:
        return 0  # not matched
    # Alias: classify the name it points to instead
    return match(lookup_dict[location])


# Change unofficial place names to official names using the lookup table
def change_place_name(place):
    """Translate a place name into the name used by the lookup tables.

    A name present in ``lookup_dict`` is replaced by its mapped value; a name
    that ``match`` already recognises is returned unchanged; anything else
    yields an empty string.
    """
    if place not in lookup_dict:
        # Not an alias: keep it only if it is already a recognised name
        return place if match(place) > 0 else ''
    return lookup_dict[place]

In [None]:
''' Example '''
# Read Twitter place attributes data
df = pd.read_csv('twitter_place_attributes.csv')

# Change place names and match with the lookup table.
# 'locality' is '' for places the lookup table cannot resolve; those rows are
# the ones later sent to the geocoding API.
df['locality'] = df['location'].apply(change_place_name)
# Column name fixed from the misspelled 'locatity_match', which left a stray
# column in the saved CSV alongside the real 'locality_match'.
df['locality_match'] = df['locality'].apply(match)

### Bing Map API Geocoding

In [None]:
# Bing Map Geocoding API key. Read from the environment so that the key is not
# stored in the notebook; falls back to '' (the original placeholder).
API_key = os.environ.get('BING_MAPS_API_KEY', '')

BING_LOCATIONS_URL = "https://dev.virtualearth.net/REST/v1/Locations"
BBOX_COLUMNS = ['bbox0', 'bbox1', 'bbox2', 'bbox3']

# Make sure the bounding-box columns exist even if no row ever needs one;
# the bounding-box intersection step reads them unconditionally.
for column in BBOX_COLUMNS:
    if column not in df.columns:
        df[column] = np.nan

for i in tqdm(range(df.shape[0])):
    # If the place is already matched with the lookup table, then skip API geocoding
    if df.loc[i, 'locality'] != '':
        continue

    place = df.loc[i, 'location']
    if not isinstance(place, str) or not place.strip():
        continue  # missing place name: nothing to geocode

    # Let requests URL-encode the query so that names containing spaces, '&',
    # '#', etc. do not corrupt the request.
    try:
        response = requests.get(
            BING_LOCATIONS_URL,
            params={'q': place + ' UK', 'o': 'json', 'key': API_key},
            timeout=30,
        )
    except requests.RequestException:
        continue  # best effort, like a non-200 answer: leave the row unmatched
    if response.status_code != 200:
        continue

    # Parse the payload once instead of on every access
    resource_sets = response.json().get('resourceSets') or [{}]
    result_set = resource_sets[0]
    resources = result_set.get('resources') or []
    if result_set.get('estimatedTotal', 0) == 0 or not resources:
        continue
    resource = resources[0]
    address = resource['address']
    address_parts = address.get('formattedAddress', '').split(', ')

    # If the returned locality is not matched with the lookup table, try to match the returned address
    if 'locality' in address:
        locality = address['locality']
        first_candidate = 0
    else:
        # No locality field: start from the first address component and only
        # fall back to the remaining ones.
        locality = address_parts[0]
        first_candidate = 1
    if match(locality) == 0:
        # When nothing matches, locality ends up as the last address component
        for candidate in address_parts[first_candidate:]:
            locality = candidate
            if match(locality) > 0:
                break

    # If the returned locality is not matched on the LAD scale, save the bounding box
    bbox = resource.get('bbox')
    if match(locality) != 1 and bbox is not None and len(bbox) == 4:
        df.loc[i, BBOX_COLUMNS] = bbox

    # Change to official locality names.
    # NOTE(review): the original referenced `special_loc_dict`, which is not
    # defined in this notebook; `lookup_dict` is the unofficial-to-official
    # name table built above — confirm this is the intended table.
    df.loc[i, 'locality'] = lookup_dict.get(locality, locality)

# Match with the lookup table
df['locality_match'] = df['locality'].apply(match)

### Bounding Box Intersection Geocoding

In [None]:
def IOU(se, max_lat_span=2, min_overlap=0.6):
    """Assign a bounding box to the LAD polygon that covers most of it.

    Despite the name, the score is not a true intersection-over-union: for
    every LAD geometry intersecting the box it is
    ``area(LAD ∩ box) / area(box)``.

    Parameters
    ----------
    se : sequence of four numbers
        Bounding box read as ``(y1, x1, y2, x2)``.
        NOTE(review): presumably (south lat, west lon, north lat, east lon) as
        returned by the geocoding API — confirm.
    max_lat_span : number, default 2
        Boxes with ``y2 - y1`` larger than this are rejected as too coarse.
    min_overlap : float, default 0.6
        The best score must exceed this for a match to be returned.

    Returns
    -------
    str
        The ``LAD21NM`` of the best-scoring LAD, or ``''`` when the box is
        missing, too large, not covered well enough, or any error occurs.

    Notes
    -----
    Reads the module-level GeoDataFrame ``LAD`` at call time.
    """
    try:
        bboxlist = list(se)
        y1, x1, y2, x2 = bboxlist[0], bboxlist[1], bboxlist[2], bboxlist[3]
        if np.isnan(y1) or y2 - y1 > max_lat_span:
            return ''
        poly2 = geometry.box(x1, y1, x2, y2)
        # Only LADs that touch the box can have a non-zero score
        inter_se = LAD['geometry'].intersects(poly2)
        temp_df = LAD[inter_se]
        IOU_se = temp_df['geometry'].intersection(poly2).area / poly2.area
        # An empty or all-NaN score series compares False here and falls through
        if IOU_se.max() > min_overlap:
            return LAD.loc[IOU_se.idxmax(), 'LAD21NM']
    except Exception:
        # Best effort: a malformed box yields "no match". Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate so a
        # long-running apply can still be interrupted.
        return ''
    return ''

In [None]:
''' Example '''
# Load the LAD boundary geometries
# Data source: https://geoportal.statistics.gov.uk/maps/local-authority-districts-december-2021-uk-bgc
LAD = gp.read_file('Local_Authority_Districts_(May_2021)_UK_BGC.geojson')

# Geocode every row from its stored bounding box
df.loc[:, 'locality_IOU'] = df.loc[:, ['bbox0', 'bbox1', 'bbox2', 'bbox3']].apply(IOU, axis=1)

# Classify the bounding-box result with the lookup table
df['locality_IOU_match'] = df['locality_IOU'].map(match)

# Rows whose bounding-box result is an LAD take that result as their locality;
# all other rows keep the locality they already had
df['locality'] = df['locality'].where(df['locality_IOU_match'] != 1, df['locality_IOU'])
df['locality_match'] = df['locality'].map(match)

# Persist the geocoding result
df.to_csv('twitter_place_attributes_LAD.csv', encoding='utf-8')

## User Localization

In [None]:
def user_localization(userid, counts_df=None, threshold=0.65):
    """Estimate a user's home locality from per-locality tweet counts.

    Parameters
    ----------
    userid : scalar
        User to localize.
    counts_df : DataFrame, optional
        Table with columns ``userid``, ``locality`` and ``counts``. Defaults
        to the module-level ``twitter_LAD_count_df``.
    threshold : float, default 0.65
        The most frequent locality must account for strictly more than this
        share of the user's counted tweets.

    Returns
    -------
    The locality with the highest count when the user has more than one
    counted tweet and that locality exceeds ``threshold``; otherwise ``''``.
    """
    if counts_df is None:
        counts_df = twitter_LAD_count_df
    user_rows = counts_df[counts_df['userid'] == userid]
    total = user_rows['counts'].sum()
    if not total > 1:
        return ''  # unknown user or a single tweet: not enough evidence
    if user_rows['counts'].max() / total > threshold:
        # Pick the max-count row explicitly instead of assuming the table is
        # sorted by count in descending order.
        top_position = user_rows['counts'].to_numpy().argmax()
        return user_rows['locality'].iloc[top_position]
    return ''

In [None]:
''' Example '''
# Read Twitter place geocoding result and keep only places matched on the LAD scale.
# (The original assigned this frame to `twitter_place_LAD` but then used the
# undefined name `loc_LAD`.)
loc_LAD = pd.read_csv('twitter_place_attributes_LAD.csv')[['location', 'locality', 'locality_match']]
loc_LAD = loc_LAD[loc_LAD.locality_match == 1].reset_index(drop=True)
# One row per place name, so the merge below cannot duplicate tweets
loc_LAD = loc_LAD[['location', 'locality']].drop_duplicates(subset='location')

for year in [2019, 2020, 2021]:
    for month in range(1, 13):  # calendar months 01..12 (range(13) also asked for month 00)
        month_tag = str(year) + '-' + str(month).zfill(2)

        # Read monthly Twitter data
        twitter_df = pd.read_csv('data/uk' + month_tag + '.csv')
        # Normalise user ids to integers; they may arrive as strings or floats.
        # NOTE(review): going through float loses precision for ids above 2**53 — confirm ids stay below that.
        twitter_df['userid'] = twitter_df['userid'].apply(lambda x: round(float(x)))
        twitter_df['userid'] = twitter_df['userid'].astype('int')

        # Match with Twitter geocoding result; tweets from places without an LAD are dropped
        twitter_LAD = twitter_df.merge(loc_LAD, how='left', on='location')[['userid', 'locality']]
        twitter_LAD = twitter_LAD[twitter_LAD.locality.notna()]

        # Number of geocoded tweets per user; users with a single tweet are not localized
        user_LAD = pd.DataFrame(twitter_LAD.userid.value_counts()).reset_index()
        user_LAD.columns = ['userid', 'counts']
        user_LAD = user_LAD[user_LAD.counts > 1]

        # Number of tweets per (user, LAD) pair; read by user_localization()
        twitter_LAD_count_df = twitter_LAD.value_counts().reset_index()
        twitter_LAD_count_df.columns = ['userid', 'locality', 'counts']

        # If all Tweets from the user came from the same LAD, use it as the user's location
        user_loc_pair = twitter_LAD.drop_duplicates()
        user_loc_count = user_loc_pair.userid.value_counts()
        user_oneloc = user_loc_count[user_loc_count == 1].reset_index()
        user_oneloc.columns = ['userid', 'oneloc']
        user_oneloc = user_oneloc.merge(user_loc_pair, how='left', on='userid')[['userid', 'locality']]
        user_oneloc.columns = ['userid', 'home']
        user_LAD = user_LAD.merge(user_oneloc, how='left', on='userid')

        # For other users, if at least 65% of Tweets come from one LAD, use it as the user's location.
        # Users with exactly two tweets are skipped: they either already have a
        # home (both tweets in one LAD) or are split 50/50 and cannot pass 65%.
        # A mask is used instead of relying on the rows being sorted by count.
        pending = user_LAD['home'].isna() & (user_LAD['counts'] > 2)
        user_LAD.loc[pending, 'home'] = user_LAD.loc[pending, 'userid'].apply(user_localization)

        # Generate a user-home csv file for each month; contains userid and estimated home location
        user_LAD.to_csv('data/uk' + month_tag + '-userhome.csv', index=False, encoding='utf-8')

## User Migration Analysis

In [None]:
def migration(year1, month1, year2, month2):
    '''
    Count users whose estimated home differs between two months.

    Loads the user-home csv files of both months, keeps the users that have a
    home in each of them, and returns one row per (origin, destination) pair
    with columns origin_LAD, destination_LAD, flow, month_start and month_end.
    '''
    start_tag = str(year1) + '-' + str(month1).zfill(2)
    end_tag = str(year2) + '-' + str(month2).zfill(2)

    # Per-month user-home tables
    homes_start = pd.read_csv('data/uk' + start_tag + '-userhome.csv')
    homes_end = pd.read_csv('data/uk' + end_tag + '-userhome.csv')

    # Pair both months per user and drop users lacking a home in either month
    paired = homes_start.merge(homes_end, how='left', on='userid')
    has_both_homes = paired.home_x.notna() & paired.home_y.notna()
    paired = paired[has_both_homes].reset_index(drop=True)

    # Users whose home changed
    movers = paired[paired.home_x != paired.home_y]

    # Aggregate movers per origin/destination pair; the first counted column
    # after the two group keys is used as the flow size
    pair_counts = movers.groupby(['home_x', 'home_y']).count().reset_index()
    flow_df = pair_counts.iloc[:, :3].copy()
    flow_df.columns = ['origin_LAD', 'destination_LAD', 'flow']
    flow_df['month_start'] = start_tag
    flow_df['month_end'] = end_tag

    return flow_df

In [None]:
''' Example '''
# Flows of users whose estimated home changed between June 2019 and June 2020
df = migration(2019, 6, 2020, 6)
df.to_csv('migration_flow_2019_06_2020_06.csv', encoding='utf-8', index=False)