In [32]:
import pandas as pd
import math
import numpy as np
from mobility_helper import *
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import time
from geopy.geocoders import Nominatim,Bing
import datetime as dt


# 1. Load the data

Contains subset of the original schema. Already removed useless columns for mobility pattern at preprocessing:
1. id
2. userID
3. createdAt
4. longitude
5. latitude
6. placeID
7. placeLatitude
8. placeLongitude
9. sourceName
10. userLocation

In [None]:
src = '../twitter-swisscom/twex_mobility_corrected.tsv'
columns = ['ID', 'userID', 'createdAt', 'longitude', 'latitude', 'placeID','placeLatitude','placeLongitude','sourceName', 'userLocation']
dtypes = {'ID': 'int', 'createdAt': 'str', 'longitude': 'float', 'latitude': 'float', 'placeID': 'str', 'placeLatitude': 'float', 'placeLongitude': 'float', 'sourceName': 'str', 'userLocation': 'str'}

chunk_size = 10**6
df = pd.read_csv(src, sep='\t', names=columns, na_values=['\\N'],dtype=dtypes, encoding='utf-8')

# 2. Cleaning

In [None]:
"""
df = pd.DataFrame()
for chunk in tweet_chunks:
    df = chunk.copy()
    del chunk
    break
"""

In [None]:
#tweets with null latitude
print(len(df[pd.isnull(df.latitude)]))
#tweets with null longitude
print(len(df[pd.isnull(df.longitude)]))
#tweets with both null latitude and null longitude
no_coord = np.logical_and(pd.isnull(df.latitude), pd.isnull(df.longitude))
tw_wo_coord = df[no_coord]
print(len(tw_wo_coord))
#if one coordinate is null the other one too

In [None]:
#check place coordinate existance when no original coordinate
has_place_coord = tw_wo_coord[~np.logical_and(pd.isnull(tw_wo_coord.placeLatitude), pd.isnull(tw_wo_coord.placeLongitude))]
len(has_place_coord)

In [None]:
#Can always replace by place coordinates when no original coordinates provided.
#Do replacement
df['latitude'] = df.apply(replaceLatitude, axis=1)
df['longitude'] = df.apply(replaceLongitude, axis=1)
#tweets without coordinate
no_coord = np.logical_and(pd.isnull(df.latitude), pd.isnull(df.longitude))
len(df[no_coord])

In [None]:
#no more NaN coordinates.
#delete place coordinates columns
df = df.drop(['placeID', 'placeLatitude', 'placeLongitude'], axis=1)
df.head()

In [None]:
#check tweets with no timestamps
print(len(df[pd.isnull(df.createdAt)]))

In [None]:
#Nice. 
#Want coordinates in area of Switzerland as well as  part of neighboring countries. 
#This area is approximately defined in the following  coordinate intervals:
# - latitude: [45, 48]
# - longitude: [4,13]
#Remove geographical outliers
print(len(df))
lat_inbound = ((45 <= df.latitude) & (df.latitude <= 48))
long_inbound = ((4 <= df.longitude) & (df.longitude <= 13))
inbound = np.logical_and(lat_inbound, long_inbound)
print(len(df[~inbound]))
df = df[inbound]

In [None]:
#Check sourceName possibilities.
if 0:
    df.sourceName.unique()

In [None]:
if 0:
    keywords = ['iOS', 'App', 'app', 'Android', 'APP', 'application', 'Phone', 'iPhone']
    is_portable = lambda s: any(k in s for k in keywords)
    df['sourceName'] = df.apply(lambda row: 'portable' if is_portable(row.sourceName) else row.sourceName, axis=1)                            

In [None]:
if 0:
    df.sourceName.unique()

In [None]:
#no real additional information from those values. (Desktop/portable)
#drop sourceName
df = df.drop(['sourceName'], axis=1)
df.head()

# 3. Data Exploration/Analysis

In [2]:
#Check distribution of coordinates latitude, longitude
main_cities = {
    'Zurich': [47.36667, 8.55000],
    'Geneva':[46.2,6.1667],
    'Basel': [47.5667,7.6],
    'Bern' : [46.9167,7.4667],
    'Lausanne': [46.5333,6.6667],
    'Luzern': [47.0833,8.2667],
    'Sion': [46.2333,7.35],
    'Varese': [45.8176,8.8264],
    'Mulhouse': [47.75, 7.3333],
    'Annecy': [45.9,6.1167],
    'Annemasse': [46.1944, 6.2377],
    'Pontarlier': [46.9035,6.3554],
    'Aoste': [45.5833, 5.6]
}

In [None]:
g = sns.jointplot(x=df.longitude, y=df.latitude, kind="hex", color="k");
for city, coord in main_cities.items():
    g.ax_joint.scatter(coord[1], coord[0], marker='o', c='r', s=5)
    g.ax_joint.annotate(city, xy=(coord[1], coord[0]), xytext=(coord[1], coord[0]))
plt.show()

In [None]:
#UserLocation is not reliable information. 
#Takes too much resource to find a way to filter useless and helphful information.
#we reverse query our coordinates
df = df.drop(['userLocation'], axis=1)
df.head()

In [None]:
count_users = pd.DataFrame(df.groupby(['userID']).size())
count_users.columns = ['count']
count_users = count_users.sort_values(by=['count'], ascending=True)
count_users = count_users.reset_index()

In [None]:
#plot distribution of tweets per user (add legend)
count_users.plot(y='count', use_index=True)
plt.show()

In [None]:
#5% of users have more than 70% of tweets. 
#we remove users with a number of tweets below a certain threshold.
threshold = 100
before = len(count_users)
count_users = count_users[count_users['count'] >= threshold]
after = len(count_users)
print('Percentage of user loss: ',(before-after)*100/before)

In [None]:
ids_to_keep = list(count_users.userID)
tweets = df[df.userID.isin(ids_to_keep)]
len(tweets)

In [None]:
#check types
tweets.dtypes

In [None]:
#convert createdAt to date
tweets['createdAt'] = pd.DatetimeIndex(tweets['createdAt'])
#tweets['createdAt'] = pd.to_datetime(tweets['createdAt'])
print(tweets.dtypes)

In [None]:
tweets = tweets.reset_index(drop=True)
tweets.head()

# 4. Mobility Pattern

In [None]:
#location too precise. Need to compare areas/spots between them. Lower coordinate precision.
#https://en.wikipedia.org/wiki/Decimal_degrees
#precision to 3 decimals: equateur => 80m 
#precision to 2 decimals: equateur => 780m 

tweets['areaLat'] = tweets.apply(lambda r: float('%.2f' % r.latitude), axis=1)
tweets['areaLong'] = tweets.apply(lambda r: float('%.2f' % r.longitude), axis=1)
tweets.head()

In [None]:
#Indicate if tweet during work hour or not
#set work hours being 8:00-18:00
tweets['@workHour'] = (tweets['createdAt'].dt.hour <= 18) & (tweets['createdAt'].dt.hour >= 8)
tweets.head()

## 4.1 Home and Work Place Extraction

### 4.1.1 Work place detection

In [None]:
at_work = tweets[tweets['@workHour']]
at_work.head()

In [None]:
def place_24hdiff(name, most_freq, place):
    if place == "work":
        tw = at_work[(at_work.userID == name) & (at_work.areaLong == most_freq[1]) & (at_work.areaLat == most_freq[0])]
    else:
        tw = nat_work[(nat_work.userID == name) & (nat_work.areaLong == most_freq[1]) & (nat_work.areaLat == most_freq[0])]
    #return true if day distance >=1        
    for i1, row1 in tw.iterrows():
        for i2, row2 in tw.iterrows():
            d = row1.createdAt - row2.createdAt
            if abs(d.days) >= 1:
                return True
    return False

def most_freq_coord(group, threshold, place):
    lat_long = list(zip(group.areaLat, group.areaLong))
    c = Counter(lat_long)
    most_freq = list(c)[0]
    n = c[most_freq]
    while not place_24hdiff(group.name, most_freq, place):
        c.pop(most_freq, 0)
        if len(c) == 0:
            #no places with valid diff: return last one with frequence 0 which will be deleted with threshold
            return pd.Series({'freqLat': most_freq[0], 'freqLong': most_freq[1], 'frequence': 0})
        else:
            most_freq = list(c)[0]
            n = c[most_freq]
    #out of while loop: means we have a valid place (1 day diff)
    return pd.Series({'freqLat': most_freq[0], 'freqLong': most_freq[1], 'frequence': n})

In [None]:
threshold = 10

In [None]:
work_freq = at_work.groupby(['userID']).apply(lambda g : most_freq_coord(g,threshold, "work"))
print(len(work_freq))
work_freq.head()

In [None]:
#apply threshold on frequence
len(work_freq[work_freq.frequence < threshold])

In [None]:
#we would loose 950 users that tweeted less than threshold tweets from their potential work
#save index of to_remove users
work_to_remove = work_freq[work_freq.frequence < threshold].index.values.tolist()
len(work_to_remove)

### 4.1.2 Home detection

In [None]:
nat_work = tweets[~tweets['@workHour']]
nat_work.head()

In [None]:
threshold = 10

In [None]:
home_freq = nat_work.groupby(['userID']).apply(lambda g: most_freq_coord(g, threshold, "home"))
print(len(home_freq))
home_freq.head()

In [None]:
#apply threshold on frequence
len(home_freq[home_freq.frequence < threshold])

In [None]:
#we would loose 843 users that tweeted less than 10 tweets from their potential home
#save index of to_remove users
home_to_remove = home_freq[home_freq.frequence < threshold].index.values.tolist()
len(home_to_remove)

### 4.1.3 Data Merge: Home | Work

In [None]:
#for now do not remove places with frequence = 1
#we do not separate analysis over the years
count_users = count_users.set_index(['userID'])

In [None]:
#join 3 dataframes: work_freq, home_freq, count_users
users = pd.merge(work_freq, home_freq,how='inner', left_index=True, right_index=True, )
columns = ['workLat', 'workLong', 'workTweets', 'homeLat', 'homeLong', 'homeTweets']
users = pd.merge(users, count_users, how='inner', left_index=True, right_index=True, suffixes=('_x', '_y'))
columns.append('#tweets')
users.columns = columns
users.head()

In [None]:
users['userID'] = users.index.values
users = users.reset_index(drop=True)
#users = users[users.columns[-1:]+users.columns[:-1]]
users.head()

In [None]:
#remove users that did not validate conditions on work/home tweet frequence threshold
to_remove = home_to_remove + work_to_remove
print(len(users))
users = users[~users.userID.isin(to_remove)]
print(len(users))

In [None]:
#save dataframe
users.to_csv('users_merged.csv', index=False)

## 4.2 Home-Work route evaluation

### 4.2.1 Administrative locations detection

In [59]:
users = pd.read_csv('users_merged.csv', sep=',')
users.head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,#tweets,userID
0,47.08,8.27,25.0,47.08,8.27,73.0,100,701522231
1,46.81,8.22,22.0,46.81,8.22,78.0,100,1525719553
2,46.8,7.15,72.0,46.8,7.15,17.0,100,565009197
3,46.0,8.75,76.0,46.0,8.75,14.0,100,407885922
4,45.8,8.88,28.0,45.8,8.88,72.0,101,23280133


- Country  

In [60]:
geolocator = Nominatim()
countries = []
cache = {}
for i, row in users.iterrows():
    w = 0
    h = 0
    if cache.get((row.homeLat, row.homeLong)):
        home = cache[(row.homeLat, row.homeLong)]
        h = home.address.split(', ')[-1]
    else:
        home = geolocator.reverse(str(row.homeLat)+", "+str(row.homeLong))#.address.split(', ')[-1]
        cache[(row.homeLat, row.homeLong)] = home
        h = home.address.split(', ')[-1]
        time.sleep(0.5)
    if cache.get((row.workLat, row.workLong)):
        work = cache[(row.workLat, row.workLong)]
        w = work.address.split(', ')[-1]
    else:
        work = geolocator.reverse(str(row.workLat)+", "+str(row.workLong))#.address.split(', ')[-1]
        cache[(row.workLat, row.workLong)] = work
        w = work.address.split(', ')[-1]
        time.sleep(0.5)
    if w == "Svizra":
        w = "Suisse"
    if h == "Svizra":
        h = "Suisse"
    countries.append((h,w))

In [61]:
countries[:10]

[('Suisse', 'Suisse'),
 ('Suisse', 'Suisse'),
 ('Suisse', 'Suisse'),
 ('Italia', 'Italia'),
 ('Italia', 'Italia'),
 ('France', 'France'),
 ('Suisse', 'Suisse'),
 ('Italia', 'Italia'),
 ('France', 'France'),
 ('Suisse', 'Suisse')]

In [62]:
#add countries to users
c = pd.DataFrame(countries, columns=['homeCountry', 'workCountry'])
users = pd.concat([users, c], axis=1)
users.head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,#tweets,userID,homeCountry,workCountry
0,47.08,8.27,25.0,47.08,8.27,73.0,100,701522231,Suisse,Suisse
1,46.81,8.22,22.0,46.81,8.22,78.0,100,1525719553,Suisse,Suisse
2,46.8,7.15,72.0,46.8,7.15,17.0,100,565009197,Suisse,Suisse
3,46.0,8.75,76.0,46.0,8.75,14.0,100,407885922,Italia,Italia
4,45.8,8.88,28.0,45.8,8.88,72.0,101,23280133,Italia,Italia


- Cantons

In [63]:
#detect Cantons
def get_cantons(row):
    if row.homeCountry == "Suisse":
        homecant = cache[(row.homeLat, row.homeLong)].address.split(', ')[-6]
    else:
        homecant = "-"
    if row.workCountry == "Suisse":
        workcant = cache[(row.workLat, row.workLong)].address.split(', ')[-6]
    else:
        workcant = "-"
    return (homecant, workcant)

#add cantons to users
cantons = list(users.apply(get_cantons, axis=1))
cantons = pd.DataFrame(cantons, columns=['homeCanton', 'workCanton'])
users = pd.concat([users, cantons], axis=1)

In [64]:
users.head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,#tweets,userID,homeCountry,workCountry,homeCanton,workCanton
0,47.08,8.27,25.0,47.08,8.27,73.0,100,701522231,Suisse,Suisse,Luzern,Luzern
1,46.81,8.22,22.0,46.81,8.22,78.0,100,1525719553,Suisse,Suisse,Obwalden,Obwalden
2,46.8,7.15,72.0,46.8,7.15,17.0,100,565009197,Suisse,Suisse,Fribourg - Freiburg,Fribourg - Freiburg
3,46.0,8.75,76.0,46.0,8.75,14.0,100,407885922,Italia,Italia,-,-
4,45.8,8.88,28.0,45.8,8.88,72.0,101,23280133,Italia,Italia,-,-


In [65]:
users.to_csv('users_locations.csv', index=False)

### 4.2.2 Distance Home-Work

In [66]:
users = pd.read_csv('users_locations.csv')
users.head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,#tweets,userID,homeCountry,workCountry,homeCanton,workCanton
0,47.08,8.27,25.0,47.08,8.27,73.0,100,701522231,Suisse,Suisse,Luzern,Luzern
1,46.81,8.22,22.0,46.81,8.22,78.0,100,1525719553,Suisse,Suisse,Obwalden,Obwalden
2,46.8,7.15,72.0,46.8,7.15,17.0,100,565009197,Suisse,Suisse,Fribourg - Freiburg,Fribourg - Freiburg
3,46.0,8.75,76.0,46.0,8.75,14.0,100,407885922,Italia,Italia,-,-
4,45.8,8.88,28.0,45.8,8.88,72.0,101,23280133,Italia,Italia,-,-


In [67]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

def dist(row):
    if (row.workLat == row.homeLat) and (row.workLong == row.homeLong):
        return 0
    else:
        return haversine(row.workLong, row.workLat, row.homeLong, row.homeLat)

In [68]:
users['distance'] = users.apply(dist, axis=1)

In [69]:
users[users.distance != 0.0].head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,#tweets,userID,homeCountry,workCountry,homeCanton,workCanton,distance
13,47.37,8.54,15.0,47.36,8.52,26.0,102,15519831,Suisse,Suisse,Zürich,Zürich,1.871091
17,47.09,8.28,17.0,47.18,8.55,27.0,102,322220690,Suisse,Suisse,Zug,Luzern,22.729363
19,47.37,8.54,11.0,47.36,8.52,27.0,102,45321360,Suisse,Suisse,Zürich,Zürich,1.871091
23,47.38,8.54,56.0,47.43,8.67,16.0,103,19081386,Suisse,Suisse,Zürich,Zürich,11.245866
24,45.94,6.08,10.0,46.07,6.31,53.0,104,2308044749,France,France,-,-,22.888059


### 4.2.3 Detect closer main cities (work/home)

In [70]:
def get_closer(row, where):
    if where == 'work':
        lat = row.workLat
        long = row.workLong
    else:
        lat = row.homeLat
        long = row.homeLong
    d = 10000
    closer = ""
    for city, coord in main_cities.items():
        new = haversine(long, lat, coord[1], coord[0])
        if new < d:
            d = new
            closer = city
    return (closer, d)


users['closer to home'] = users.apply(lambda r: get_closer(r, "home"), axis=1)
users['closer to work'] = users.apply(lambda r: get_closer(r, "work"), axis=1)

In [71]:
users[users.homeCountry != users.workCountry]

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,#tweets,userID,homeCountry,workCountry,homeCanton,workCanton,distance,closer to home,closer to work
41,47.52,7.69,10.0,47.75,7.33,49.0,106,1013556217,France,Suisse,-,Basel-Landschaft,37.147665,"(Mulhouse, 0.2465655646310726)","(Basel, 8.515260951487871)"
190,46.17,6.2,13.0,46.19,6.2,50.0,154,458592244,Suisse,France,Genève,-,2.222502,"(Geneva, 2.7921473981247624)","(Annemasse, 3.9705850360446107)"
319,47.8,7.33,104.0,47.61,7.61,24.0,234,133924793,Deutschland,France,-,-,29.735875,"(Basel, 4.869738576853841)","(Mulhouse, 5.561718553371034)"
370,46.21,6.12,29.0,46.14,6.01,244.0,284,558706651,France,Suisse,-,Genève,11.495904,"(Geneva, 13.779584185744847)","(Geneva, 3.759564287855678)"
484,46.14,6.04,13.0,46.14,6.02,111.0,646,833235133,France,Suisse,-,Genève,1.539969,"(Geneva, 13.1113959409452)","(Geneva, 11.81209558163891)"
527,47.14,8.58,425.0,47.65,9.54,36.0,1062,2245859163,Deutschland,Suisse,-,Zug,91.797911,"(Zurich, 80.70644229514339)","(Luzern, 24.517871097219224)"
528,47.34,9.58,624.0,47.41,9.74,11.0,1083,1505970014,Österreich,Suisse,-,Sankt Gallen,14.334728,"(Zurich, 89.65764741187124)","(Zurich, 77.59909949920166)"


### 4.2.4 Route Time: home-work

In [72]:
import requests
import json
import codecs
API_KEY = 'hAGkPF87ujmSOXqMIoks4rV4sVYww8jG'

def get_time(row):
    if (row.homeLat == row.workLat) and (row.homeLong == row.workLong):
        return 0
    else:
        src = {'lat':row.homeLat, 'long':row.homeLong}
        dest = {'lat': row.workLat, 'long': row.workLong}
        params = {
            'key': API_KEY,
            'outFormat': "json",
            'inFormat': "json"
        }
        request_body = {
            'locations': [
                {'latLng': {'lat': src['lat'], 'lng': src['long']}},
                {'latLng': {'lat': dest['lat'], 'lng': dest['long']}}
                ]
            }
        r=requests.post('https://www.mapquestapi.com/directions/v2/route',
                        params=params, 
                        data=json.dumps(request_body)
                       )
        if r.status_code != 200:
            # We didn't get a response from Mapquest
            return -1
        return r.json()['route']['time']/60

users['routeTime'] = users.apply(get_time, axis=1)

In [73]:
test = users[users.routeTime != 0]
print(len(test))
test.head()

126


Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,#tweets,userID,homeCountry,workCountry,homeCanton,workCanton,distance,closer to home,closer to work,routeTime
13,47.37,8.54,15.0,47.36,8.52,26.0,102,15519831,Suisse,Suisse,Zürich,Zürich,1.871091,"(Zurich, 2.376643291268331)","(Zurich, 0.8386823848142152)",6.966667
17,47.09,8.28,17.0,47.18,8.55,27.0,102,322220690,Suisse,Suisse,Zug,Luzern,22.729363,"(Zurich, 20.74372493228116)","(Luzern, 1.2518161548668694)",28.35
19,47.37,8.54,11.0,47.36,8.52,27.0,102,45321360,Suisse,Suisse,Zürich,Zürich,1.871091,"(Zurich, 2.376643291268331)","(Zurich, 0.8386823848142152)",6.966667
23,47.38,8.54,56.0,47.43,8.67,16.0,103,19081386,Suisse,Suisse,Zürich,Zürich,11.245866,"(Zurich, 11.4456822698284)","(Zurich, 1.6615021190898316)",16.083333
24,45.94,6.08,10.0,46.07,6.31,53.0,104,2308044749,France,France,-,-,22.888059,"(Annemasse, 14.903088793332524)","(Annecy, 5.273260210609062)",25.1


In [74]:
users.to_csv('users_final.csv', index=False)

# 5. Statistics on Data