In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import datetime
import pandas as pd
import plotly.express as px

from ast import literal_eval
from utilities.utils import shared_dir, display_dict, figures_dir, env_bool, plot_bihistogram
from utilities.data_processing.processing_utils import get_geo, get_tweet_geo, get_all_geo, format_geojson_all_types, \
    get_user_location_tweets, handle_user_coords, format_geojson, get_tweet_place_tweets, get_lat_lon, \
    get_entity_locations_tweets, get_entity_coords, format_geojson_entities, get_location, stack_df, display_rules, \
    display_rules_text, display_overlaid_histogram, save_for_annotation, get_geo_df, to_geojson

In [None]:
# Show full cell contents when rendering frames instead of truncating long
# values (None removes the column-width limit).
pd.options.display.max_colwidth = None

### Load geospatial data

Loads geospatial data from raw tweets.

In [None]:
# Build the working frame from the 'tweets' source and report its size and
# column index before previewing the first rows.
geo_df = get_geo_df('tweets')
print(
    f'Geospatial Dataset Size: {len(geo_df)}',
    f'Geospatial Dataset Columns: {geo_df.columns}',
    sep='\n',
)
geo_df.head()

### Format data

Basic data formatting.

In [None]:
# Drop the columns that are not needed downstream, then give the remaining
# location columns descriptive names. Done as a single non-mutating chain so
# the frame returned by the loader is left untouched and the cell reads as
# one step.
#
# Order matters: the raw 'user_location' column has to be dropped before
# 'value' is renamed to take over that name.
#
# NOTE: like the original, this cell is not re-runnable on its own output
# (the dropped columns no longer exist); re-run the load cell first.
geo_df = (
    geo_df
    .drop(columns=[
        'key',
        'test',
        'entities_places',
        'place_id',
        'entities',
        'user_location',
        'author_id',
        '_id',
    ])
    .rename(columns={'geo': 'tweet_location', 'place_data': 'tweet_place', 'value': 'user_location'})
)
geo_df.head()

In [None]:
def _parse_literal_column(column: pd.Series) -> pd.Series:
    """Turn a column of stringified Python literals into Python objects.

    Missing values are replaced by the text '{}', every value is converted to
    its string form, and each string is parsed with ``ast.literal_eval``.
    An empty string maps to an empty dict. The result is an object-dtype
    Series.

    Raises whatever ``literal_eval`` raises (e.g. ``ValueError`` or
    ``SyntaxError``) when a value is not a valid Python literal.
    """
    return (
        column
        .fillna(value='{}')
        .astype(str)
        .apply(lambda text: {} if text == '' else literal_eval(text))
        .astype(object)
    )


# The three location columns all receive the same treatment.
for _column in ('user_location', 'tweet_place', 'tweet_location'):
    geo_df[_column] = _parse_literal_column(geo_df[_column])

print(f'Size of all geospatial data: {len(get_geo(geo_df))}')
print(f'Size of tweet-specific data: {len(get_tweet_geo(geo_df))}')

# Preview rows where exactly one of tweet_place / tweet_location is non-empty.
_place_only = (geo_df.tweet_place != {}) & (geo_df.tweet_location == {})
_location_only = (geo_df.tweet_place == {}) & (geo_df.tweet_location != {})
geo_df[_place_only | _location_only].head()

### Convert places to coordinates

In [None]:
all_type_geo = get_all_geo(geo_df)

# Keep only entries whose first element is present and contains no None,
# then write them out as the 'all_locations' GeoJSON (overriding any
# previous export).
data = [
    entry
    for entry in format_geojson_all_types(all_type_geo)
    if not (entry[0] is None or None in entry[0])
]
to_geojson(data, 'all_locations', override=True)
all_type_geo.head()

In [None]:
# Subset of tweets selected by get_user_location_tweets; used here only for
# inspection of the user-location handling.
geo = get_user_location_tweets(geo_df)
if len(geo) > 0:
    # Guard: .iloc[0] raises IndexError when the subset is empty.
    display_dict(geo.iloc[0].user_location)
geo['lat_lon'] = geo.user_location.progress_apply(handle_user_coords)

# Coordinates for every row of the full frame; rows with an empty
# user_location dict get None instead of being passed to the handler.
geo_df['user_coords'] = geo_df['user_location'].progress_apply(lambda x: handle_user_coords(x) if x != {} else None)

# Export a small sample for visual inspection. The sample size is capped at
# the frame length (sample() raises ValueError when n exceeds the population)
# and the seed is fixed so the exported file is reproducible across runs.
data = format_geojson(geo_df.sample(n=min(20, len(geo_df)), random_state=42), key='user_coords')
data = [x for x in data if x[0] is not None and None not in x[0]]
# NOTE(review): unlike the 'all_locations' export, no override=True is passed
# here — confirm whether an existing file should be replaced.
to_geojson(data, label='user_coords')
geo[['lat_lon']].head()

In [None]:
# Subset of tweets selected by get_tweet_place_tweets; its tweet_place column
# is replaced by the result of get_lat_lon for inspection below.
geo = get_tweet_place_tweets(geo_df)
geo['tweet_place'] = geo['tweet_place'].progress_apply(get_lat_lon)

# Same conversion on the full frame, stored in a separate column.
geo_df['tweet_coords'] = geo_df['tweet_place'].progress_apply(get_lat_lon)

# Export a sample for visual inspection. The sample size is capped at the
# frame length (sample() raises ValueError when n exceeds the population)
# and the seed is fixed so the exported file is reproducible across runs.
data = format_geojson(geo_df.sample(n=min(200, len(geo_df)), random_state=42), key='tweet_coords')
data = [x for x in data if x[0] is not None and None not in x[0]]
to_geojson(data, label='tweet_locations')

# Preview only rows whose converted value is present and non-empty.
geo = geo[~geo['tweet_place'].isnull()]
geo[geo['tweet_place'].map(len) != 0].head()

In [None]:
# Subset of tweets selected by get_entity_locations_tweets, with coordinates
# derived from its entity_locations column for inspection below.
geo = get_entity_locations_tweets(geo_df)
geo['entity_coords'] = geo['entity_locations'].progress_apply(get_entity_coords)

# NOTE(review): geo_df['entity_locations'] is not created in any visible
# cell; presumably get_entity_locations_tweets adds it to geo_df — confirm.
geo_df['entity_coords'] = geo_df['entity_locations'].progress_apply(get_entity_coords)

# Export a small sample for visual inspection. The sample size is capped at
# the frame length (sample() raises ValueError when n exceeds the population)
# and the seed is fixed so the exported file is reproducible across runs.
data = format_geojson_entities(geo_df.sample(n=min(20, len(geo_df)), random_state=42))
data = [x for x in data if x[0] is not None and None not in x[0]]
to_geojson(data, label='entity_locations')
geo.head()

In [None]:
# Resolve one location record per row from the coordinate columns built above.
geo_df['location'] = geo_df.progress_apply(get_location, axis=1)

# Discard rows for which no location could be resolved.
print(f'Size of un-formatted geospatial data: {len(geo_df)}')
geo_df = geo_df[geo_df.location.notnull()].copy()
print(f'Size of formatted geospatial data: {len(geo_df)}')

# Each location record carries a 'type' and a 'locations' entry; split them
# into separate columns (the type must be read before 'location' is replaced).
geo_df['location_type'] = geo_df['location'].apply(lambda record: record['type'])
geo_df['location'] = geo_df['location'].apply(lambda record: record['locations'])

# The source columns are no longer needed once the location is resolved.
consumed_columns = [
    'tweet_location',
    'user_data',
    'tweet_place',
    'user_location',
    'entity_locations',
    'user_coords',
    'tweet_coords',
    'entity_coords',
]
geo_df.drop(columns=consumed_columns, inplace=True)

geo_df[geo_df.location_type != 'user'].head()

### Stack for multiple possible locations

In [None]:
# Stack the frame on the 'location' column via the project helper, dropping
# the helper columns 'location_old' and 'level_1' and indexing by 'id'.
# Only the first returned value is kept.
# NOTE(review): presumably this yields one row per candidate location, as the
# section heading suggests — confirm against stack_df.
geo_df, _ = stack_df(geo_df, 'location', drop=['location_old', 'level_1'], set_index='id')
# Preview rows whose location did not come from the user profile.
geo_df[geo_df.location_type != 'user'].head()

### Format tweet time into seconds since the year 2000

In [None]:
# Reference instant: midnight UTC on 2000-01-01. Kept as a local scalar
# rather than broadcast into a scratch column of the frame.
epoch = datetime.datetime(2000, 1, 1, tzinfo=datetime.timezone.utc)

# utc=True yields timezone-aware UTC timestamps whether the input is already
# parsed, ISO text with a trailing 'Z'/offset, or naive text (taken as UTC),
# so the subtraction below never mixes naive and aware values.
# NOTE(review): assumes created_at values are ISO-8601-style timestamps, as
# the previous explicit format implied — confirm against the loader.
created_at = pd.to_datetime(geo_df['created_at'], utc=True)

# Seconds elapsed between the reference instant and each tweet's creation.
geo_df['time'] = (created_at - epoch).dt.total_seconds()
geo_df.drop(['created_at'], axis=1, inplace=True)
geo_df.head()

### Graph locations by type

In [None]:
# Histogram of row counts per location_type, saved under the figures
# directory as both HTML and JSON before being displayed inline.
fig = px.histogram(data_frame=geo_df, x='location_type')
fig.write_html(f'{figures_dir}/data_processing/location_type.html')
fig.write_json(f'{figures_dir}/data_processing/location_type.json')
fig.show()

### Display comparison between raw tweets and geospatial tweets by rule distribution

This step is optional because of its significant runtime; it runs only when `env_bool('DISPLAY_RULES')` is true.

In [14]:
# Only runs when env_bool('DISPLAY_RULES') is truthy.
if env_bool('DISPLAY_RULES'):
    # Raw tweets are loaded from the shared directory.
    # NOTE(review): unpickling can execute arbitrary code; only load pickle
    # files produced by this project.
    tweets = pd.read_pickle(f'{shared_dir}/tweets.pickle')
    # One colour per dataset: colors[0] for raw tweets, colors[1] for geospatial.
    colors = (px.colors.qualitative.D3[0], px.colors.qualitative.D3[4])
    # Copies are passed so the helper cannot alter the frames used below.
    _, rules_df, rules_stack = display_rules(tweets.copy(), color=colors[0], filename='tweets')
    display_rules_text(rules_df)
    _, geo_rules_df, geo_stack = display_rules(geo_df.copy(), color=colors[1], filename='geospatial')
    # Side-by-side views of the two datasets using the results gathered above.
    display_overlaid_histogram(tweets, geo_df, colors, (rules_stack, geo_stack))
    plot_bihistogram(dfs=[rules_df, geo_rules_df], key='rule', labels=['RAW', 'GEO'], filename='rules_bihistogram')

### Save tweets for annotation

Optional: runs only when `env_bool('SAVE_FOR_ANNOTATION')` is true.

In [15]:
# Only runs when env_bool('SAVE_FOR_ANNOTATION') is truthy; hands the
# processed frame to the project's annotation export helper.
if env_bool('SAVE_FOR_ANNOTATION'):
    save_for_annotation(geo_df)

### Save to file for further use

In [None]:
# Persist the processed frame to the shared directory as geospatial.pickle.
geo_df.to_pickle(f'{shared_dir}/geospatial.pickle')