In [1]:
import json
import pandas as pd
from pandas.io.json import json_normalize
import pymongo
from pymongo import MongoClient
import numpy as np

# Path to the geotagged tweets file, relative to the notebook's working directory.
# NOTE(review): no cell visible in this notebook reads `file_name` (the tweets are
# loaded from MongoDB instead) — confirm whether it is still needed.
file_name = "../geotagged_tweets_20160812-0912.jsons"

In [2]:
# Flattened column labels kept from each nested tweet sub-document.
user_cols = [
    'user.name',
    'user.id_str',
    'user.location',
    'user.followers_count',
]

place_cols = [
    'place.id',
    'place.place_type',
    'place.name',
    'place.full_name',
    'place.country_code',
    'place.bounding_box.coordinates',
]

entities_cols = [
    'entities.hashtags',
    'entities.urls',
    'entities.user_mentions',
    'entities.symbols',
]

# Top-level tweet fields that need no flattening.
other_cols = [
    'created_at',
    'text',
    'lang',
    'timestamp_ms',
]

# Final column order: top-level fields first, then user, place and entities.
useful_cols = [*other_cols, *user_cols, *place_cols, *entities_cols]

In [3]:
# Open a client with default connection settings (the error recorded for this
# cell shows it targeting localhost:27017, so a local MongoDB server must be up).
client = MongoClient()
db = client["twitter"]
collection = db["inventory"]

# find() without a filter yields every document; materialise them all into one
# DataFrame with a row per document.
data = pd.DataFrame(list(collection.find()))

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused, Timeout: 30s, Topology Description: <TopologyDescription id: 5f620b468d102fd8ec3782db, topology_type: Single, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused')>]>

In [None]:
def _flatten_nested_field(frame, field, keep_cols):
    """Expand a column of nested dicts into flat, prefixed columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the nested column.
    field : str
        Name of the column whose values are dicts (or NaN/None when absent).
    keep_cols : list of str
        Flattened labels to return, each already prefixed with ``field + '.'``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``frame`` (sharing its index) with exactly
        ``keep_cols`` as columns. A label that never occurs in the data comes
        back as an all-NaN column instead of raising ``KeyError``.
    """
    # Anything that is not a dict (NaN / None for a missing sub-document) becomes
    # an empty dict so it still produces a row, filled with NaN.
    records = [rec if isinstance(rec, dict) else {} for rec in frame[field]]

    # pd.json_normalize is the public entry point since pandas 1.0; the
    # pandas.io.json.json_normalize path is deprecated.
    flat = pd.json_normalize(records, errors='ignore')
    flat.columns = [field + '.' + str(col) for col in flat.columns]

    # The later pd.concat(axis=1) aligns on the index, so tie the flattened rows
    # to the source frame's index explicitly rather than relying on both being 0..n-1.
    flat.index = frame.index
    return flat.reindex(columns=keep_cols)


user = _flatten_nested_field(data, 'user', user_cols)
print("User fields flattened")

place = _flatten_nested_field(data, 'place', place_cols)
print("Place fields flattened")

entities = _flatten_nested_field(data, 'entities', entities_cols)
print("Entities fields flattened")

In [None]:
# Inspect the flattened `place` frame (display only; nothing is modified).
place

In [None]:
# Swap the three nested columns for their flattened counterparts, then keep
# only the columns of interest, in `useful_cols` order.
# Note: this cell rebinds `data`, so it can only be run once per load.
data = (
    pd.concat(
        [data.drop(['user', 'place', 'entities'], axis=1), user, place, entities],
        axis=1,
        sort=False,
    )
    .loc[:, useful_cols]
)
data.head()

In [None]:
# Inspect the column labels currently held by `data`.
data.columns

In [None]:
# Non-null `created_at` count per country code, most frequent first.
(
    data.groupby("place.country_code")["created_at"]
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Restrict to English-language tweets placed in the US: other languages won't
# work with English-language libraries, and 'und' tweets seem to contain only
# hashtags, @-mentions and emojis.
# reset_index() is called without drop=True, so the previous row labels are
# kept in a new `index` column.
data = (
    data.loc[data["lang"].eq("en") & data["place.country_code"].eq("US")]
    .drop(columns=['lang', 'place.country_code'])
    .reset_index()
)
data

In [None]:
# Print the complete (untruncated) table of non-null `created_at` counts per
# place name, most frequent first.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(
        data.groupby("place.full_name")["created_at"]
        .count()
        .sort_values(ascending=False)
    )

In [None]:
# Full state name -> two-letter code for the 50 states plus the District of
# Columbia, in alphabetical order and grouped by initial letter.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

In [None]:
# Sanity check on the mapping size: the literal defines 51 entries
# (50 states plus the District of Columbia).
len(us_state_abbrev)

In [None]:
# Names of the form "<something>, USA": capture the part before ", USA"
# (single-column frame, column label 0; NaN where the pattern does not match).
fullnamestates = data["place.full_name"].str.extract('(.*), USA')

# Prefer a trailing ", XX" two-letter code; where there is none, fall back to
# translating the captured full name through `us_state_abbrev`
# (names missing from the mapping stay NaN).
twoletterstates = (
    data["place.full_name"].str.extract(', ([A-Z]{2})$')[0]
    .combine_first(fullnamestates[0].map(us_state_abbrev))
    .to_frame()
)

In [None]:
# Inspect the derived two-letter state codes (single column labelled 0).
twoletterstates

In [None]:
# Rows per state code, most frequent first; value_counts() leaves out NaN
# (rows with no resolved state) by default.
twoletterstates[0].value_counts()

In [None]:
# Show only the rows where the "<name>, USA" pattern matched.
# Indexing a DataFrame with a boolean DataFrame merely masks values (like
# `where`), so `fullnamestates[fullnamestates.notna()]` returned every row with
# its NaNs intact; dropna() actually removes the unmatched rows.
fullnamestates.dropna()