In [1]:
import pandas as pd
import pickle
import nltk
import re
import string
import numpy as np

from pprint import pprint


In [2]:
# Load the previously pickled tweet DataFrame from the working directory into `df`.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('xl.pkl', 'rb') as picklefile:
    df = pickle.load(picklefile)

### Clean up text of tweets and identify retweets versus original 

#### Keep punctuation for VADER sentiment analysis

In [3]:
df.head(2)

Unnamed: 0,id,created_at,location,text,screen_name,retweet_count,favorite_count,followers_count,friends_count,hashtags_text,description,tweet_reply_to,search_term,artist
0,873619804468506624,Sat Jun 10 19:16:07 +0000 2017,,RT @Adele: I need to share with you all that I...,gueth_andre,56525,0,4,88,[],,,@adele,Adele
1,873599554175860736,Sat Jun 10 17:55:39 +0000 2017,"Manaus, Brasil",RT @Adele: Because of it though... I'm treatin...,MirianR88152620,36982,0,2,11,[],lauriemirian/twitter,,@adele,Adele


In [4]:
def remove_retweet_url(tweet):
    """Remove links and the leading ``RT @user:`` marker from a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with surrounding whitespace stripped first, every
        ``http://`` / ``https://`` link removed, and, when the text starts
        with ``RT``, the leading ``RT @...:`` prefix removed. Whitespace left
        behind by the removals is kept, so punctuation and spacing inside the
        tweet are untouched.
    """
    # str.strip() returns a new string, so the result has to be re-bound;
    # otherwise padded retweets never satisfy startswith('RT') below.
    tweet = tweet.strip()
    tweet = re.sub(r"(https?\://)\S+", "", tweet)
    if tweet.startswith('RT'):
        # Anchored and non-greedy: stop at the first colon so later colons in
        # the tweet body (e.g. "Note: ..." or "10:30") are not swallowed.
        tweet = re.sub(r"^RT\s@.*?:", "", tweet, count=1)
    return tweet

In [5]:
# Store the cleaned tweet text (links and retweet prefix removed) in a new column.
df['clean'] = df['text'].apply(remove_retweet_url)

In [6]:
def countRetweet(row):
    """Flag a retweet: 1 when the first space-delimited token of row['text'] is 'RT', else 0."""
    first_token = row['text'].partition(' ')[0]
    return int(first_token == 'RT')

In [7]:
def countOriginal(row):
    """Flag an original tweet: 0 when the first space-delimited token of row['text'] is 'RT', else 1."""
    head, _, _ = row['text'].partition(' ')
    return 0 if head == 'RT' else 1

In [8]:
# Row-wise flag: 1 for retweets, 0 for original tweets.
df['retweet'] = df.apply(countRetweet, axis='columns')

In [9]:
# Row-wise flag: 1 for original tweets, 0 for retweets.
df['original_tweet'] = df.apply(countOriginal, axis='columns')

In [10]:
len(df)

216992

### Location Retrieval

#### Strip punctuation for location analysis. Build a dictionary of lat/long coordinates for the top 2500 locations. The Google Geocoding API accepts 2500 calls per day. 

In [11]:
def lower_alpha_num(corpus):
    """Lower-case every item of ``corpus``, drop digit-bearing tokens, and join the pieces.

    ``corpus`` is iterated item by item and the processed items are
    concatenated with no separator. When ``corpus`` is a list of strings,
    each whole word containing a digit is removed from each string. When
    ``corpus`` is a single string the items are its characters, so the net
    effect is a lower-cased copy with the digit characters removed.

    Returns the concatenated string.
    """
    digit_token = re.compile(r"""\w*\d\w*""")
    cleaned = [digit_token.sub('', str.lower(piece)) for piece in corpus]
    return ''.join(cleaned)

In [12]:
def remove_punct(corpus):
    """Delete every ``string.punctuation`` character from the items of ``corpus``.

    ``corpus`` is iterated item by item (character by character for a single
    string) and the cleaned items are concatenated with no separator.

    Returns the concatenated string.
    """
    # Character class built from all ASCII punctuation, escaped for regex use.
    strip_chars = re.compile('[%s]' % re.escape(string.punctuation))
    pieces = [strip_chars.sub('', piece) for piece in corpus]
    return ''.join(pieces)

In [13]:
# Keep only the rows of df whose location is not NaN, deep-copied so the
# column assignments that follow work on an independent frame.
from copy import deepcopy
df_location = deepcopy(df[df['location'].notnull()])

In [14]:
# Lower-case each location string and drop its digits.
df_location['location'] = df_location['location'].apply(lower_alpha_num)

In [15]:
# Remove punctuation characters from each location string.
df_location['location'] = df_location['location'].apply(remove_punct)

In [17]:
df_location.head(5)

Unnamed: 0,id,created_at,location,text,screen_name,retweet_count,favorite_count,followers_count,friends_count,hashtags_text,description,tweet_reply_to,search_term,artist,clean,retweet,original_tweet
1,873599554175860736,Sat Jun 10 17:55:39 +0000 2017,manaus brasil,RT @Adele: Because of it though... I'm treatin...,MirianR88152620,36982,0,2,11,[],lauriemirian/twitter,,@adele,Adele,Because of it though... I'm treating myself t...,1,0
2,873599531861950464,Sat Jun 10 17:55:33 +0000 2017,manaus brasil,RT @Adele: The piano mics fell on to the piano...,MirianR88152620,37033,0,2,11,[],lauriemirian/twitter,,@adele,Adele,"The piano mics fell on to the piano strings, ...",1,0
5,873330450462330880,Sat Jun 10 00:06:19 +0000 2017,miami fl,RT @Adele: HUGE song @tinietempah love it love...,CalvoRicky,2142,0,104596,71,[],"El marlin lastimosamente se ahogó , siempre la...",,@adele,Adele,HUGE song @tinietempah love it love it!!,1,0
6,873330445030748160,Sat Jun 10 00:06:18 +0000 2017,miami fl,RT @Adele: JT!!,CalvoRicky,3358,0,104596,71,[],"El marlin lastimosamente se ahogó , siempre la...",,@adele,Adele,JT!!,1,0
7,873330436138840064,Sat Jun 10 00:06:16 +0000 2017,miami fl,RT @Adele: This Sam Smith song is so so good h...,CalvoRicky,2773,0,104596,71,[],"El marlin lastimosamente se ahogó , siempre la...",,@adele,Adele,This Sam Smith song is so so good xx,1,0


#### Get a list of the top 2480 locations

In [1]:
# locations = [i for i in df_location_final.location.value_counts()[:2480].index]
# locations

### Call the Google Geocoding API

In [38]:
import urllib
import json
from urllib.parse import urlparse
import urllib.parse
import urllib.request
import time

In [39]:
import geocoder
import time

In [40]:
googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'

def get_coordinates(locations, from_sensor=False):
    '''Geocode each location string and return the ones that could be resolved.

    Parameters
    ----------
    locations : iterable of str
        Free-text location queries.
    from_sensor : bool, optional
        Sent to the API as the ``sensor`` parameter ("true"/"false").

    Returns
    -------
    dict
        Maps each UTF-8 encoded query (``bytes``) to a tuple
        ``(latitude, longitude, region)``, where ``region`` is the
        ``long_name`` of the first address component of the first result.
        Queries with no result, or whose request failed with an HTTP error,
        are left out of the dictionary.
    '''
    location_dict = {}
    for query in locations:
        # Keys are kept as UTF-8 bytes so the returned dictionary has the same
        # shape as before.
        query = query.encode('utf-8')
        params = {
            'address': query,
            'sensor': "true" if from_sensor else "false"
        }

        url = googleGeocodeUrl + urllib.parse.urlencode(params)
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(url) as json_response:
                response = json.loads(json_response.read())
        except urllib.error.HTTPError as err:
            # The status lives on err.code; comparing the exception object
            # itself to 400 is always False. A 400 means the query was
            # rejected, so it is skipped quietly; anything else is reported.
            if err.code != 400:
                print('geocode request for %r failed with HTTP %s' % (query, err.code))
            continue
        finally:
            # Pause after every request, successful or not, to stay within
            # the API rate limit.
            time.sleep(1)

        results = response.get('results')
        if not results:
            # Nothing found: leave the query out of the result.
            continue

        best = results[0]
        location = best['geometry']['location']
        latitude, longitude = location['lat'], location['lng']
        region = best['address_components'][0]['long_name']
        location_dict[query] = latitude, longitude, region

    return location_dict
    

In [25]:
# Load the cached dictionary from dict.pkl into `d`.
# NOTE(review): presumably the saved output of get_coordinates — confirm.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('dict.pkl', 'rb') as picklefile:
    d = pickle.load(picklefile)

In [26]:
def convert(data):
    """Recursively decode UTF-8 ``bytes`` found in dicts and tuples to ``str``.

    ``bytes`` are decoded, dict keys and values are converted recursively,
    and tuple elements are converted recursively. Any other object,
    including a list, is returned unchanged.
    """
    if isinstance(data, bytes):
        return data.decode('utf-8')
    if isinstance(data, dict):
        return {convert(key): convert(value) for key, value in data.items()}
    if isinstance(data, tuple):
        return tuple(convert(element) for element in data)
    return data

In [27]:
# Normalise the loaded dictionary: convert() decodes bytes keys/values to str
# and returns other objects unchanged.
d = convert(d)

#### After pulling the lat/long from the Google Maps API, I used the Python `geocoder` wrapper to build a dictionary mapping lat/long to city, state, country, county

In [28]:
def reverse_geo(d):
    '''Reverse-geocode the coordinates stored in ``d``.

    ``d`` maps a key to a sequence whose first two items are latitude and
    longitude. Each pair is passed to ``geocoder.google(..., method='reverse')``.

    Returns a dict with the same keys, each mapped to the tuple
    (lat, lng, city, state_long, country_long, county, state) read from the
    geocoder result. Sleeps one second after every lookup.
    '''
    reverse_dict = {}
    # Iterate over a snapshot of the items.
    for place, coords in list(d.items()):
        lat_lng = [coords[0], coords[1]]
        g = geocoder.google(lat_lng, method='reverse')
        reverse_dict[place] = (
            g.lat, g.lng, g.city, g.state_long, g.country_long, g.county, g.state,
        )
        time.sleep(1)
    return reverse_dict

In [29]:
# reverse_dict = reverse_geo(d)

In [19]:
# Load the cached reverse-geocoding dictionary from reverse_dict.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('reverse_dict.pkl', 'rb') as picklefile:
    reverse_dict = pickle.load(picklefile)

In [23]:
list(reverse_dict.items())[0:4]

[('london',
  (51.5072996,
   -0.1280232,
   'London',
   'England',
   'United Kingdom',
   'Greater London',
   'England')),
 ('united states',
  (37.0891604,
   -95.7131979,
   'Independence',
   'Kansas',
   'United States',
   'Montgomery County',
   'KS')),
 ('london england',
  (51.5072996,
   -0.1280232,
   'London',
   'England',
   'United Kingdom',
   'Greater London',
   'England')),
 ('el qaliobia egypt',
  (30.3274395,
   31.2159308,
   None,
   'Al Qalyubia Governorate',
   'Egypt',
   'Toukh',
   'Al Qalyubia Governorate'))]

In [27]:
# with open('df_location_final.pkl', 'rb') as picklefile:
#     df_location_final = pickle.load(picklefile)

In [32]:
# Names of the columns added to the location frame. The position of each name
# is used as the index into the tuples stored in reverse_dict.
location_keys = [
    'latitude',
    'longitude',
    'city',
    'state',
    'country',
    'county',
    'postal_state',
]


In [33]:
def get_location(row, reverse_dict, index):
    """Look up one component of the reverse-geocoded record for a location.

    Parameters
    ----------
    row : hashable
        Key into ``reverse_dict`` (the cleaned location string).
    reverse_dict : dict
        Maps a location key to an indexable record of location components.
    index : int
        Position of the wanted component inside the record.

    Returns
    -------
    object
        The stored component, ``''`` when the stored component is ``None``,
        or ``None`` when the lookup raises ``KeyError`` (unknown location).
    """
    try:
        value = reverse_dict[row][index]
    except KeyError:
        # Unknown location: return None explicitly.
        return None
    return '' if value is None else value

#### Add columns to dataframe with city, state, county etc...

In [34]:
# One new column per entry of location_keys; the entry's position selects the
# matching component of the reverse_dict record for each row's location.
for i, item in enumerate(location_keys):
    df_location[item] = df_location['location'].apply(
        get_location, args=(reverse_dict, i)
    )

In [35]:
df_location.head(5) 

Unnamed: 0,id,created_at,location,text,screen_name,retweet_count,favorite_count,followers_count,friends_count,hashtags_text,...,clean,retweet,original_tweet,latitude,longitude,city,state,country,county,postal_state
1,873599554175860736,Sat Jun 10 17:55:39 +0000 2017,manaus brasil,RT @Adele: Because of it though... I'm treatin...,MirianR88152620,36982,0,2,11,[],...,Because of it though... I'm treating myself t...,1,0,-3.11903,-60.0217,Manaus,Amazonas,Brazil,Manaus,AM
2,873599531861950464,Sat Jun 10 17:55:33 +0000 2017,manaus brasil,RT @Adele: The piano mics fell on to the piano...,MirianR88152620,37033,0,2,11,[],...,"The piano mics fell on to the piano strings, ...",1,0,-3.11903,-60.0217,Manaus,Amazonas,Brazil,Manaus,AM
5,873330450462330880,Sat Jun 10 00:06:19 +0000 2017,miami fl,RT @Adele: HUGE song @tinietempah love it love...,CalvoRicky,2142,0,104596,71,[],...,HUGE song @tinietempah love it love it!!,1,0,25.7616,-80.1919,Miami,Florida,United States,Miami-Dade County,FL
6,873330445030748160,Sat Jun 10 00:06:18 +0000 2017,miami fl,RT @Adele: JT!!,CalvoRicky,3358,0,104596,71,[],...,JT!!,1,0,25.7616,-80.1919,Miami,Florida,United States,Miami-Dade County,FL
7,873330436138840064,Sat Jun 10 00:06:16 +0000 2017,miami fl,RT @Adele: This Sam Smith song is so so good h...,CalvoRicky,2773,0,104596,71,[],...,This Sam Smith song is so so good xx,1,0,25.7616,-80.1919,Miami,Florida,United States,Miami-Dade County,FL


In [36]:
# with open('location_df.pkl', 'wb') as picklefile:
#     pickle.dump(df_location, picklefile)

In [2]:
# with open('df_location_final.pkl', 'rb') as picklefile:
#     df_location_final = pickle.load(picklefile)

In [37]:
df_location.head(5)

Unnamed: 0,id,created_at,location,text,screen_name,retweet_count,favorite_count,followers_count,friends_count,hashtags_text,...,clean,retweet,original_tweet,latitude,longitude,city,state,country,county,postal_state
1,873599554175860736,Sat Jun 10 17:55:39 +0000 2017,manaus brasil,RT @Adele: Because of it though... I'm treatin...,MirianR88152620,36982,0,2,11,[],...,Because of it though... I'm treating myself t...,1,0,-3.11903,-60.0217,Manaus,Amazonas,Brazil,Manaus,AM
2,873599531861950464,Sat Jun 10 17:55:33 +0000 2017,manaus brasil,RT @Adele: The piano mics fell on to the piano...,MirianR88152620,37033,0,2,11,[],...,"The piano mics fell on to the piano strings, ...",1,0,-3.11903,-60.0217,Manaus,Amazonas,Brazil,Manaus,AM
5,873330450462330880,Sat Jun 10 00:06:19 +0000 2017,miami fl,RT @Adele: HUGE song @tinietempah love it love...,CalvoRicky,2142,0,104596,71,[],...,HUGE song @tinietempah love it love it!!,1,0,25.7616,-80.1919,Miami,Florida,United States,Miami-Dade County,FL
6,873330445030748160,Sat Jun 10 00:06:18 +0000 2017,miami fl,RT @Adele: JT!!,CalvoRicky,3358,0,104596,71,[],...,JT!!,1,0,25.7616,-80.1919,Miami,Florida,United States,Miami-Dade County,FL
7,873330436138840064,Sat Jun 10 00:06:16 +0000 2017,miami fl,RT @Adele: This Sam Smith song is so so good h...,CalvoRicky,2773,0,104596,71,[],...,This Sam Smith song is so so good xx,1,0,25.7616,-80.1919,Miami,Florida,United States,Miami-Dade County,FL


### Group data by artist, location and tweets

In [38]:
# Count the non-null values of every other column per artist / coordinates / place group.
grouped = df_location.groupby(
    ['artist', 'latitude', 'longitude', 'city', 'state', 'country']
).count()


In [39]:
# Turn the group keys back into columns, keep them plus the `id` count, and
# order the groups from the largest count to the smallest.
grouped = (
    grouped.reset_index()[
        ['artist', 'latitude', 'longitude', 'city', 'state', 'country', 'id']
    ]
    .sort_values(by='id', ascending=False)
)

In [40]:
# The `id` count is the number of tweets in each group; rename it and renumber the rows.
grouped = (
    grouped
    .rename(columns={'id': 'tweets'})
    .reset_index(drop=True)
)

In [42]:
grouped.head(10)

Unnamed: 0,artist,latitude,longitude,city,state,country,tweets
0,Adele,51.5073,-0.128023,London,England,United Kingdom,1898
1,Adele,37.0892,-95.7132,Independence,Kansas,United States,1122
2,Adele,30.3274,31.2159,,Al Qalyubia Governorate,Egypt,1028
3,Adele,55.378,-3.43594,Moffat,Scotland,United Kingdom,844
4,Adele,-14.1764,-52.0122,,Mato Grosso,Brazil,842
5,Radiohead,51.5073,-0.128023,London,England,United Kingdom,814
6,Adele,36.1699,-115.139,Las Vegas,Nevada,United States,800
7,Radiohead,23.6496,-102.56,,Zacatecas,Mexico,748
8,Adele,52.3555,-1.17147,Kilsby,England,United Kingdom,738
9,Radiohead,19.4327,-99.1324,México D.F.,Ciudad de México,Mexico,715


#### During checks on the data, I discovered that Independence, Kansas was returned for users who only specified "United States". Let's remove it. 

In [50]:
# Drop only the Independence, Kansas rows. Matching on the city name alone
# would also discard every other city called "Independence".
grouped_test = grouped[
    ~((grouped['city'] == 'Independence') & (grouped['state'] == 'Kansas'))
]

#### write to csv for plotting

In [49]:
#grouped_test.to_csv('location_world_backup.csv', index=False)