# Applied Data Analysis - Fall 2016
## Twitter-Swisscom Project

### Mobility Pattern


1 - [Coordinate rounding](#rounding)

2 - [Home/Work Detection](#home_work)

3 - [Route Computing](#route)

In [1]:
import pandas as pd
import math
import numpy as np
from mobility_helper import *
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import time
from geopy.geocoders import Nominatim,Bing
import datetime as dt
import folium

We first load our cleaned and preprocessed data.

In [2]:
tweets = pd.read_csv('./data/tweets_processed.csv', encoding = "ISO-8859-1")

### 1 - <a id='rounding'>Coordinate rounding</a>

The location coordinate that we have are too precise. It will be impossible to group by coordinate in order to detect key places like home and work. We thus decide to round the latitude and longitude of each tweets. We do this by lowering the coordinate precision (# of decimals).

C.f. https://en.wikipedia.org/wiki/Decimal_degrees

- precision to 3 decimals: equateur: radius of 80m 
- precision to 2 decimals: equateur: raidus of 780m

By keeping 3 decimals, we observed that a lot of places were not grouped with others. Disabling us to infer on key places. We thus applied a 2-decimals precision.

In [3]:
tweets['areaLat'] = tweets.apply(lambda r: float('%.2f' % r.latitude), axis=1)
tweets['areaLong'] = tweets.apply(lambda r: float('%.2f' % r.longitude), axis=1)
tweets.head()

Unnamed: 0,ID,userID,createdAt,longitude,latitude,areaLat,areaLong
0,924163322,17341045,2010-02-23 05:55:51,7.43926,46.9489,46.95,7.44
1,924911820,7198282,2010-02-23 06:22:40,8.53781,47.3678,47.37,8.54
2,926639767,14657884,2010-02-23 07:34:25,6.13396,46.1951,46.2,6.13
3,927264351,14393717,2010-02-23 08:02:57,6.63254,46.5199,46.52,6.63
4,928080163,14393717,2010-02-23 08:40:13,6.63428,46.5191,46.52,6.63


We now infer on tweets sent during work hours: 8:00-18:00

In [4]:
tweets['createdAt'] = pd.to_datetime(tweets['createdAt'])

In [5]:
tweets['@workHour'] = (tweets['createdAt'].dt.hour <= 18) & (tweets['createdAt'].dt.hour >= 8)
tweets.head()

Unnamed: 0,ID,userID,createdAt,longitude,latitude,areaLat,areaLong,@workHour
0,924163322,17341045,2010-02-23 05:55:51,7.43926,46.9489,46.95,7.44,False
1,924911820,7198282,2010-02-23 06:22:40,8.53781,47.3678,47.37,8.54,False
2,926639767,14657884,2010-02-23 07:34:25,6.13396,46.1951,46.2,6.13,False
3,927264351,14393717,2010-02-23 08:02:57,6.63254,46.5199,46.52,6.63,True
4,928080163,14393717,2010-02-23 08:40:13,6.63428,46.5191,46.52,6.63,True


### 2 - <a id='home_work'> Home/Work place Detection </a>

We want to define a key place if at least 10 tweets were sent from there, at least 2 tweets have a 24h offset and it is sent in the predefined correspondinf work or home hours.

We thus defined those two functions that will help us select the given locations per users:

In [6]:
def place_24hdiff(name, most_freq, place):
    if place == "work":
        tw = at_work[(at_work.userID == name) & (at_work.areaLong == most_freq[1]) & (at_work.areaLat == most_freq[0])]
    else:
        tw = nat_work[(nat_work.userID == name) & (nat_work.areaLong == most_freq[1]) & (nat_work.areaLat == most_freq[0])]
    #return true if day distance >=1        
    for i1, row1 in tw.iterrows():
        for i2, row2 in tw.iterrows():
            d = row1.createdAt - row2.createdAt
            if abs(d.days) >= 1:
                return True
    return False

def most_freq_coord(group, threshold, place):
    lat_long = list(zip(group.areaLat, group.areaLong))
    c = Counter(lat_long)
    most_freq = list(c)[0]
    n = c[most_freq]
    while not place_24hdiff(group.name, most_freq, place):
        c.pop(most_freq, 0)
        if len(c) == 0:
            #no places with valid diff: return last one with frequence 0 which will be deleted with threshold
            return pd.Series({'freqLat': most_freq[0], 'freqLong': most_freq[1], 'frequence': 0})
        else:
            most_freq = list(c)[0]
            n = c[most_freq]
    #out of while loop: means we have a valid place (1 day diff)
    return pd.Series({'freqLat': most_freq[0], 'freqLong': most_freq[1], 'frequence': n})

threshold = 10

### Work place detection

In [7]:
at_work = tweets[tweets['@workHour']]
at_work.head()

Unnamed: 0,ID,userID,createdAt,longitude,latitude,areaLat,areaLong,@workHour
3,927264351,14393717,2010-02-23 08:02:57,6.63254,46.5199,46.52,6.63,True
4,928080163,14393717,2010-02-23 08:40:13,6.63428,46.5191,46.52,6.63,True
5,929214686,14260616,2010-02-23 09:32:09,8.29953,47.4829,47.48,8.3,True
6,929803298,14657884,2010-02-23 09:59:41,6.1387,46.175,46.17,6.14,True
7,931855097,9962022,2010-02-23 11:28:27,6.33641,46.4631,46.46,6.34,True


In [8]:
work_freq = at_work.groupby(['userID']).apply(lambda g : most_freq_coord(g,threshold, "work"))
print(len(work_freq))
work_freq.head()

18453


Unnamed: 0_level_0,freqLat,freqLong,frequence
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1497,47.59,8.02,3.0
2397,47.59,8.21,2.0
2623,47.76,8.84,2.0
5267,47.46,8.59,5.0
5757,47.67,9.41,75.0


We apply the threshold. And save the index (userID) that we should remove.

In [9]:
len(work_freq[work_freq.frequence < threshold])

10240

In [10]:
work_to_remove = work_freq[work_freq.frequence < threshold].index.values.tolist()
len(work_to_remove)

10240

### Home place detection

In [11]:
nat_work = tweets[~tweets['@workHour']]
nat_work.head()

Unnamed: 0,ID,userID,createdAt,longitude,latitude,areaLat,areaLong,@workHour
0,924163322,17341045,2010-02-23 05:55:51,7.43926,46.9489,46.95,7.44,False
1,924911820,7198282,2010-02-23 06:22:40,8.53781,47.3678,47.37,8.54,False
2,926639767,14657884,2010-02-23 07:34:25,6.13396,46.1951,46.2,6.13,False
16,954731115,6016562,2010-02-23 21:58:49,6.6268,46.5072,46.51,6.63,False
17,957072936,14657884,2010-02-23 22:56:23,6.14474,46.1958,46.2,6.14,False


In [12]:
home_freq = nat_work.groupby(['userID']).apply(lambda g: most_freq_coord(g, threshold, "home"))
print(len(home_freq))
home_freq.head()

18444


Unnamed: 0_level_0,freqLat,freqLong,frequence
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1497,47.38,8.54,51.0
2397,46.24,6.09,2.0
2623,47.63,7.68,9.0
5267,46.2,6.14,12.0
5757,47.73,9.4,164.0


We apply the threshold and save the index (userID) that we should remove.

In [13]:
len(home_freq[home_freq.frequence < threshold])

9168

In [14]:
home_to_remove = home_freq[home_freq.frequence < threshold].index.values.tolist()
len(home_to_remove)

9168

### Data Merge

We merge the 2 dataframes: work_freq, home_freq.


Merging:

In [15]:
users = pd.merge(work_freq, home_freq,how='inner', left_index=True, right_index=True)
columns = ['workLat', 'workLong', 'workTweets', 'homeLat', 'homeLong', 'homeTweets']
users.columns = columns
users.head()

Unnamed: 0_level_0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1497,47.59,8.02,3.0,47.38,8.54,51.0
2397,47.59,8.21,2.0,46.24,6.09,2.0
2623,47.76,8.84,2.0,47.63,7.68,9.0
5267,47.46,8.59,5.0,46.2,6.14,12.0
5757,47.67,9.41,75.0,47.73,9.4,164.0


In [16]:
users['userID'] = users.index.values
users = users.reset_index(drop=True)
users.head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,userID
0,47.59,8.02,3.0,47.38,8.54,51.0,1497
1,47.59,8.21,2.0,46.24,6.09,2.0,2397
2,47.76,8.84,2.0,47.63,7.68,9.0,2623
3,47.46,8.59,5.0,46.2,6.14,12.0,5267
4,47.67,9.41,75.0,47.73,9.4,164.0,5757


We now remove users that did not validate conditions on work/home tweet frequence threshold before.

In [17]:
to_remove = home_to_remove + work_to_remove
print(len(users))
users = users[~users.userID.isin(to_remove)]
print(len(users))

18425
5270


### 3 - <a id='route'> Route Computing </a>

We want to compute the canton and country of the home and work locations that we detected. Then we want to compute the distance from the work place to the home place while finally computing the time it takes by car ride.

### Administrative location computation

Location query and storing.

In [18]:
geolocator = Nominatim()
countries = []
cache = {}
for i, row in users.iterrows():
    w = 0
    h = 0
    if cache.get((row.homeLat, row.homeLong)):
        home = cache[(row.homeLat, row.homeLong)]
        h = home.address.split(', ')[-1]
    else:
        home = geolocator.reverse(str(row.homeLat)+", "+str(row.homeLong))#.address.split(', ')[-1]
        cache[(row.homeLat, row.homeLong)] = home
        h = home.address.split(', ')[-1]
        time.sleep(0.5)
    if cache.get((row.workLat, row.workLong)):
        work = cache[(row.workLat, row.workLong)]
        w = work.address.split(', ')[-1]
    else:
        work = geolocator.reverse(str(row.workLat)+", "+str(row.workLong))#.address.split(', ')[-1]
        cache[(row.workLat, row.workLong)] = work
        w = work.address.split(', ')[-1]
        time.sleep(0.5)
    if w == "Svizra":
        w = "Suisse"
    if h == "Svizra":
        h = "Suisse"
    countries.append((h,w))

Countries: We add countries to users.

In [19]:
countries[:10]

[('Deutschland', 'Deutschland'),
 ('Suisse', 'Suisse'),
 ('Suisse', 'Suisse'),
 ('Italia', 'Italia'),
 ('Suisse', 'Suisse'),
 ('Suisse', 'Suisse'),
 ('Italia', 'Italia'),
 ('Suisse', 'Suisse'),
 ('Suisse', 'Suisse'),
 ('Suisse', 'Suisse')]

In [20]:
c = pd.DataFrame(countries, columns=['homeCountry', 'workCountry'])
users = pd.concat([users, c], axis=1)
users.head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,userID,homeCountry,workCountry
0,,,,,,,,Deutschland,Deutschland
1,,,,,,,,Suisse,Suisse
2,,,,,,,,Suisse,Suisse
3,,,,,,,,Italia,Italia
4,47.67,9.41,75.0,47.73,9.4,164.0,5757.0,Suisse,Suisse


Cantons: we add the cantons to users

In [22]:
users.head()

Unnamed: 0,workLat,workLong,workTweets,homeLat,homeLong,homeTweets,userID,homeCountry,workCountry
0,,,,,,,,Deutschland,Deutschland
1,,,,,,,,Suisse,Suisse
2,,,,,,,,Suisse,Suisse
3,,,,,,,,Italia,Italia
4,47.67,9.41,75.0,47.73,9.4,164.0,5757.0,Suisse,Suisse


In [21]:
#detect Cantons
def get_cantons(row):
    if row.homeCountry == "Suisse":
        homecant = cache[(row.homeLat, row.homeLong)].address.split(', ')[-6]
    else:
        homecant = "-"
    if row.workCountry == "Suisse":
        workcant = cache[(row.workLat, row.workLong)].address.split(', ')[-6]
    else:
        workcant = "-"
    return (homecant, workcant)

cantons = list(users.apply(get_cantons, axis=1))
cantons = pd.DataFrame(cantons, columns=['homeCanton', 'workCanton'])
users = pd.concat([users, cantons], axis=1)

KeyError: ((nan, nan), 'occurred at index 1')

### Distance Home-Work computing

We start by defining functions helping us computing distance from coordinate parameters.

In [None]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

def dist(row):
    if (row.workLat == row.homeLat) and (row.workLong == row.homeLong):
        return 0
    else:
        return haversine(row.workLong, row.workLat, row.homeLong, row.homeLat)

We now defined a new column giving the home-work distance of each users.

In [None]:
users['distance'] = users.apply(dist, axis=1)

### Detection of closer main cities (work/home)

Definition of main cities and definition of new columns.

In [None]:
main_cities = {
    'Zurich': [47.36667, 8.55000],
    'Geneva':[46.2,6.1667],
    'Basel': [47.5667,7.6],
    'Bern' : [46.9167,7.4667],
    'Lausanne': [46.5333,6.6667],
    'Luzern': [47.0833,8.2667],
    'Sion': [46.2333,7.35],
    'Varese': [45.8176,8.8264],
    'Mulhouse': [47.75, 7.3333],
    'Annecy': [45.9,6.1167],
    'Annemasse': [46.1944, 6.2377],
    'Pontarlier': [46.9035,6.3554],
    'Aoste': [45.5833, 5.6]
}

In [None]:
def get_closer(row, where):
    if where == 'work':
        lat = row.workLat
        long = row.workLong
    else:
        lat = row.homeLat
        long = row.homeLong
    d = 10000
    closer = ""
    for city, coord in main_cities.items():
        new = haversine(long, lat, coord[1], coord[0])
        if new < d:
            d = new
            closer = city
    return (closer, d)


users['closer to home'] = users.apply(lambda r: get_closer(r, "home"), axis=1)
users['closer to work'] = users.apply(lambda r: get_closer(r, "work"), axis=1)

### Route time computing

In [None]:
import requests
import json
import codecs
import sys
sys.path.append('/home/hparmantier/Applied Data Analysis/SwissGeoTweet/')
from key import *
#API_KEY imported from file

def get_time(row):
    if (row.homeLat == row.workLat) and (row.homeLong == row.workLong):
        return 0
    else:
        src = {'lat':row.homeLat, 'long':row.homeLong}
        dest = {'lat': row.workLat, 'long': row.workLong}
        params = {
            'key': API_KEY,
            'outFormat': "json",
            'inFormat': "json"
        }
        request_body = {
            'locations': [
                {'latLng': {'lat': src['lat'], 'lng': src['long']}},
                {'latLng': {'lat': dest['lat'], 'lng': dest['long']}}
                ]
            }
        r=requests.post('https://www.mapquestapi.com/directions/v2/route',
                        params=params, 
                        data=json.dumps(request_body)
                       )
        if r.status_code != 200:
            # We didn't get a response from Mapquest
            return -1
        return r.json()['route']['time']/60

users['routeTime'] = users.apply(get_time, axis=1)

In [None]:
test = users[users.routeTime != 0]
print(len(test))
test.head()

We now save the dataframe that we created. It will be further used to extract some statistics.

In [None]:
users.to_csv('./data/users_final.csv', encoding='utf-8', index=False)