# Applied Data Analysis - Fall 2016
## Twitter-Swisscom Project

### Mobility pattern: Statistics


1 - [Population locations](#locations)

2 - [Population routing](#routing)

3 - [Exploration of cities home/work locations](#exploration)

In [None]:
import pandas as pd
import math
import numpy as np
from mobility_helper import *
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import time
from geopy.geocoders import Nominatim,Bing
import datetime as dt
import folium

In [None]:
# Load the final user table (read as ISO-8859-1) and report how many users it holds
users = pd.read_csv('./data/users_final.csv', encoding="ISO-8859-1")
print('Our final dataset contains {} users.'.format(len(users)))

### 1 - <a id='locations'> Population locations </a>


### Movements between countries

We plot the distribution of home countries for the users who live and work in the same country:

In [None]:
plt.style.use('ggplot')
# Count users per home country, keeping only users whose home and work
# countries are the same.
per_countries = users[users.homeCountry == users.workCountry].groupby('homeCountry').size()
# groupby(...).size() returns a Series, which has no `columns`; assigning to
# `.columns` does not label anything, so name the Series instead.
per_countries = per_countries.rename('Number of users')
per_countries.plot.pie(figsize=(6,6), title="Twitter users distribution per Country")
plt.savefig('./figs/tweetpercountry.png')
plt.show()

From the home countries and work countries we extracted, we can now detect the borderers among our users.

In [None]:
# Over all users: a borderer is a user whose home country differs from the work country
is_borderer = users.homeCountry != users.workCountry
borderers = users[is_borderer]
print("Number of detected borderers: ", len(borderers))
print("Percentage of detected borderers: ", 100*(len(borderers)/len(users)))

We now check the different movements in between countries.

In [None]:
print("Different detected movements: ")
# One row per (home country, work country) pair with the number of borderers in it
pair_sizes = borderers.groupby(['homeCountry', 'workCountry']).size()
flows = pd.DataFrame(pair_sizes)
flows

We first check the distribution of our borderers' home countries.

In [None]:
# Pie chart of the number of borderers per home country
plt.style.use('ggplot')

grouped = borderers.groupby('homeCountry').size()
grouped.plot.pie(title="Distribution of Borderers' home countries", figsize=(6,6))
plt.savefig('./figs/borderersfromcountry.png')
plt.show()

We now check the countries where borderers FROM Switzerland go to.

In [None]:
# Borderers whose home country is "Suisse" and whose work country is not,
# counted per work country
lives_in_ch = borderers.homeCountry == "Suisse"
works_abroad = borderers.workCountry != "Suisse"
swiss = borderers[lives_in_ch & works_abroad]
ch = swiss.groupby('workCountry').size()

plt.style.use('ggplot')
ch.plot.pie(title="Distribution of Swiss Borderers destinations", figsize=(6,6))
plt.savefig('./figs/swissborderers.png')
plt.show()

We check the nationalities of borderers going TO Switzerland.

In [None]:
# Borderers whose home country is not "Suisse" but whose work country is
swiss = borderers[(borderers.homeCountry != "Suisse") & (borderers.workCountry == "Suisse")]
# Group by *home* country: every remaining row has workCountry == "Suisse",
# so grouping by workCountry would collapse the pie into a single slice.
ch = swiss.groupby('homeCountry').size()
plt.style.use('ggplot')
ch.plot.pie(figsize=(6,6), title="Distribution of home countries of Borderers working in Switzerland")
plt.savefig('./figs/otherborderers.png')
plt.show()

### Movements between Cantons

Distribution of users among cantons.

In [None]:
# Users whose home and work countries are both "Suisse"
swiss = users[(users.homeCountry == users.workCountry) & (users.homeCountry == "Suisse")]

# c1: users with identical home and work canton, counted for that canton
swiss_equals = swiss[swiss.homeCanton == swiss.workCanton]
g1 = pd.DataFrame(swiss_equals.groupby('homeCanton').size())
g1.columns = ['c1']

# c2: users with different home and work cantons, counted for their home canton
swiss_differs = swiss[swiss.homeCanton != swiss.workCanton]
g2 = pd.DataFrame(swiss_differs.groupby('homeCanton').size())
g2.columns = ['c2']

# c3: the same users, counted for their work canton
g3 = pd.DataFrame(swiss_differs.groupby('workCanton').size())
g3.columns = ['c3']

# Align the three counts on the canton index; a canton missing from one of
# them gets 0 there, then the three are summed into a single count.
g = pd.concat([g1,g2,g3], axis=1)
g = g.fillna(0)
g['cantons'] = g.index.values
g['count'] = g['c1']+g['c2']+g['c3']
g = g.drop(['c1', 'c2', 'c3'], axis=1)
g = g.sort_values(by=['count'], ascending=False)

ax = sns.barplot(x="count", y="cantons", data=g,
            label="Number of users", palette="viridis")
ax.set(ylabel="Cantons",xlabel="Number of users")
# `sns.plt` is not part of seaborn's public API and is missing from current
# releases, so set the title on the Axes returned by barplot instead.
ax.set_title("Twitter users distribution per Canton")
plt.rcParams['ytick.labelsize'] = 5
plt.rcParams['xtick.labelsize'] = 10
plt.savefig('./figs/tweetspercanton.png')
plt.show()

Distribution of Swiss working and living in different cantons.

In [None]:
# Over all users with home country "Suisse": those whose home and work cantons differ
swiss = users[users.homeCountry == "Suisse"]
diffcant = swiss[swiss.homeCanton != swiss.workCanton]
print("Number of detected users in different canton: ", len(diffcant))
print("Percentage of detected users among swiss: ", 100*(len(diffcant)/len(swiss)))

In [None]:
print("Different detected movements: ")
main_cantons = ['Zürich', "Genève", "Vaud", "Bern - Berne", "Valais - Wallis", "Aargau"]
# Restrict to users living in one of the main cantons, then count each
# (home canton, work canton) pair
from_main_cantons = diffcant[diffcant.homeCanton.isin(main_cantons)]
flows = pd.DataFrame(from_main_cantons.groupby(['homeCanton', 'workCanton']).size())

flows

### Home Location vs Work Location

We are interested in people working and living at the same location.

In [None]:
# Users whose work and home coordinates are exactly equal
# (interpreted as working at home or not working)
same_lat = users.workLat == users.homeLat
same_long = users.workLong == users.homeLong
working_home = users[same_lat & same_long]

In [None]:
# Absolute number and (truncated) percentage of users with identical home and work coordinates
n = len(working_home)
perc = 100*(n/len(users))
print("Number of users working at home or not working: {}".format(n))
print("Percentage of people working at home: {}%".format(int(perc)))

It is a high number. Given our assumptions it just means that tweets during the day and during the night were sent at the same place. 

Distribution of working@home per canton. We discard people who live and work in different cantons.

In [None]:
def work_percentage(group):
    """Return the percentage of rows in *group* flagged as working at home.

    *group* must carry a boolean "working@home" column, which is used as a
    row mask. The result is a Series with "@home" (percentage of flagged
    rows) and "total" (always 100).
    """
    flagged_rows = group.loc[group["working@home"]]
    share = 100*(flagged_rows.shape[0]/group.shape[0])
    return pd.Series({'@home': share, 'total': 100})


# Users with home country "Suisse" who live and work in the same main canton
same_canton_swiss = users[
    (users.homeCanton == users.workCanton)
    & (users.homeCountry == "Suisse")
    & (users.homeCanton.isin(main_cantons))
]
per_cant = same_canton_swiss.groupby(['workCanton']).apply(work_percentage)

# Move the canton name from the index into a regular column
per_cant = per_cant.assign(Canton=per_cant.index.values).reset_index(drop=True)
per_cant.head()

We now plot the given ratios for the major cantons.

In [None]:
sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 6))

# Background bars: the "total" column of per_cant, one bar per canton
sns.set_color_codes("pastel")
sns.barplot(x="total", y="Canton", data=per_cant,
            label="Total users", color="b")

# Foreground bars: the "@home" percentage per canton, drawn on the same axes
# so they overlay the "total" bars
sns.set_color_codes("muted")
sns.barplot(x="@home", y="Canton", data=per_cant,
            label="Working at home", color="r")

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="Cantons",xlabel="Percentages of users working at home")
sns.despine(left=True, bottom=True)

plt.show()

### 2 - <a id='routing'> Population routing: Home-Work </a>

### Average Swiss:
- closeness to center of town
- distance to work
- time to go to work by car

In [None]:
swiss = users[users.homeCountry == "Suisse"]

# Each 'closer to home' entry is a string; the number after ', ' (minus the
# trailing character) is parsed as the distance to the town centre
s = swiss['closer to home'].tolist()
split = [float(entry.split(', ')[1][:-1]) for entry in s]

# Only distances below 20 are kept for the summary
l = [distance for distance in split if distance < 20]
print("Distance home from center of town swiss:")
pd.DataFrame(l).describe()

In [None]:
# Summary statistics of the 'distance' column for users with home country "Suisse"
print("Distance from work swiss:")
swiss[['distance']].describe()

In [None]:
# Summary statistics of the 'routeTime' column for users with home country "Suisse"
print("Route Time Average Swiss:")
swiss[['routeTime']].describe()

### Per canton

In [None]:
def avg_dtime(group):
    """Return the means of *group*'s "distance" and "routeTime" columns.

    The result is a Series labelled "avg distance" and "avg time".
    """
    averages = {}
    averages["avg distance"] = group["distance"].mean()
    averages["avg time"] = group["routeTime"].mean()
    return pd.Series(averages)


# Average distance and route time per home canton, limited to the main cantons
in_main_cantons = swiss[swiss.homeCanton.isin(main_cantons)]
per_home_canton = in_main_cantons.groupby('homeCanton').apply(avg_dtime)
cant_dist_time = pd.DataFrame(per_home_canton)
print("Average distance from work and time travel to work per main Cantons:")
cant_dist_time

### For Borderers

In [None]:
# Summary statistics of distance and route time for the borderers
print("Distance and time from home to work:")
borderers.loc[:, ["distance", "routeTime"]].describe()

## 3 - <a id='exploration'> Exploration of cities home/work locations </a>
- Geneva
- Zürich

We generate a folium map of each of these cities to visualize potential work and residential areas.

In [None]:
def _parse_closest(entries):
    """Split each 'closer to ...' string into a city name and a distance.

    Every entry is split on ', '. The first field loses its first two and its
    last character and becomes the city name; the second field loses its last
    character and is parsed as a float.
    NOTE(review): this matches strings shaped like "('Geneva', 1.23)" --
    confirm against the CSV.

    Returns (cities, distances) as two lists in input order.
    """
    cities, distances = [], []
    for entry in entries:
        fields = entry.split(', ')
        cities.append(fields[0][2:-1])
        distances.append(float(fields[1][:-1]))
    return cities, distances


# Work on an explicit copy so the column assignments below never write into a
# slice of `users` (avoids pandas' SettingWithCopy warning and its ambiguity).
city = users[['workLat', 'workLong', 'closer to work', 'homeLat', 'homeLong', 'closer to home']].copy()

# Assign plain lists, which are matched by position. Assigning a DataFrame
# would be aligned on the index and silently yield NaN whenever `users` does
# not have a default 0..n-1 index.
citywork, work = _parse_closest(city['closer to work'])
city['closer to work'] = work
city['citywork'] = citywork

cityhome, home = _parse_closest(city['closer to home'])
city['closer to home'] = home
city['cityhome'] = cityhome

# Users whose closest home or work city is Geneva, with both distances below 15
geneva = city[(city.cityhome == "Geneva") | (city.citywork == "Geneva")]
geneva = geneva[(geneva['closer to home'] < 15) & (geneva['closer to work'] < 15)]
# Drop users whose home and work coordinates are identical. "Not the same
# point" means latitude OR longitude differs; requiring both to differ would
# also discard users who merely share one coordinate.
geneva = geneva[(geneva.workLat != geneva.homeLat) | (geneva.workLong != geneva.homeLong)]

# Same selection for Zurich, with a distance threshold of 30
zurich = city[(city.cityhome == "Zurich") | (city.citywork == "Zurich")]
zurich = zurich[(zurich['closer to home'] < 30) & (zurich['closer to work'] < 30)]
zurich = zurich[(zurich.workLat != zurich.homeLat) | (zurich.workLong != zurich.homeLong)]

Geneva

In [None]:
print("Saving of Geneva neighborhood...")
map_1 = folium.Map(location=main_cities['Geneva'], zoom_start=13,tiles='Stamen Toner')
for index, row in geneva.iterrows():
    # For each user add the work location (red) first, then the home location (blue)
    points = (
        (row.workLat, row.workLong, '#fe2e2e'),
        (row.homeLat, row.homeLong, '#0000FF'),
    )
    for lat, long_, colour in points:
        marker = folium.CircleMarker([lat, long_], popup=str(lat)+','+str(long_),
                                     color=colour, fill_color=colour, radius=50)
        marker.add_to(map_1)
map_1.save('./figs/geneva.html')
print("Saved.")

Zürich

In [None]:
print("Saving of Zurich neighborhood...")
map_1 = folium.Map(location=main_cities['Zurich'], zoom_start=13,tiles='Stamen Toner')
for index, row in zurich.iterrows():
    # For each user add the work location (red) first, then the home location (blue)
    points = (
        (row.workLat, row.workLong, '#fe2e2e'),
        (row.homeLat, row.homeLong, '#0000FF'),
    )
    for lat, long_, colour in points:
        marker = folium.CircleMarker([lat, long_], popup=str(lat)+','+str(long_),
                                     color=colour, fill_color=colour, radius=50)
        marker.add_to(map_1)
map_1.save('./figs/zurich.html')
print("Saved.")