### Dream Team:  Xingce Bao, Sohyeong Kim, Guilio Masinelli, Silvio Zanoli


# PART 0. Get the lists of relevant stations

In this very first part of the project, we process the station data (`BFKOORD_GEO`) to get a list of stations that are within 10 km of Zürich HB. The list is then saved in both CSV and pickle formats for use in the next parts of this project.

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the raw station file into a two-column DataFrame: splitting each line on
# "%" gives the numeric fields (station number, position, height) in the first
# column and the train station name in the second.
# `on_bad_lines="warn"` replaces the former `error_bad_lines=False` (removed in
# pandas 2.0): malformed lines are still skipped with a warning, not raised.
data = pd.read_csv('./data/BFKOORD_GEO', sep="%", header=None, on_bad_lines="warn")
data.columns = ['data', 'name']

In [3]:
# Tokenise the numeric part of every row on single spaces (station number,
# longtitude, latitude, height) and discard the empty tokens that runs of
# consecutive spaces leave behind.
number_data = [
    [token for token in tokens if token != ""]
    for tokens in data.data.str.split(' ').tolist()
]

In [4]:
# Turn the token lists into a DataFrame with named columns; the height is not
# needed afterwards, so it is dropped right away.
parsed_columns = ['station_number', 'longtitude', 'latitude', 'height']
number_data = pd.DataFrame(number_data, columns=parsed_columns).drop(columns="height")

In [5]:
# Put the parsed numeric fields side by side with the original frame, then
# discard the raw "data" text column so only readable columns remain.
station_data = pd.concat([number_data, data], axis=1).drop(columns="data")

In [6]:
# Preview the first rows of the assembled station table as a sanity check
station_data.head()

Unnamed: 0,station_number,longtitude,latitude,name
0,2,26.074412,44.44677,Bucuresti
1,3,1.811446,50.901549,Calais
2,4,1.075329,51.284212,Canterbury
3,5,-3.543547,50.729172,Exeter
4,7,9.733756,46.922368,"Fideris, Bahnhof"


In [7]:
# Pick out the row of the Zurich station (station number "8503000", compared as
# a string) and renumber it from 0 so it can be addressed by label 0 afterwards.
is_zurich_station = station_data["station_number"] == "8503000"
zurich = station_data.loc[is_zurich_station].reset_index()

In [8]:
# Keep the coordinates of the Zurich station as floats; they serve as the
# reference point for the distance computation.
zurich_longtitude, zurich_latitude = (
    float(zurich.at[0, coordinate]) for coordinate in ('longtitude', 'latitude')
)

In [9]:
# Define a function to compute the distance with the longtitude and the latitude
from math import sin, cos, sqrt, atan2, radians


def compute_distance(longtitude, latitude, ref_longtitude=None, ref_latitude=None):
    """Return the great-circle distance in km between a point and a reference point.

    The distance is computed with the haversine formula on a sphere of
    radius 6373 km.

    Parameters
    ----------
    longtitude, latitude : float
        Coordinates of the point, in decimal degrees.
    ref_longtitude, ref_latitude : float, optional
        Coordinates of the reference point, in decimal degrees. When left as
        ``None`` the module-level ``zurich_longtitude`` / ``zurich_latitude``
        are used, which is the original behaviour of this function.

    Returns
    -------
    float
        Distance in kilometres.
    """
    # approximate radius of earth in km
    R = 6373.0

    # Fall back to the Zurich station when no explicit reference point is given
    if ref_longtitude is None:
        ref_longtitude = zurich_longtitude
    if ref_latitude is None:
        ref_latitude = zurich_latitude

    lat1 = radians(ref_latitude)
    lon1 = radians(ref_longtitude)
    lat2 = radians(latitude)
    lon2 = radians(longtitude)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make sqrt(1 - a) raise ValueError. Only the upper bound is
    # clamped, and via a comparison, so that NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance

In [10]:
# Compute the distance of every station and append it as a new (still unnamed)
# column. The coordinate columns hold strings, hence the float() conversion.
# The Series are iterated directly: `Series.as_matrix()` was removed in
# pandas 1.0 and is not needed just to loop over the values.
distance_list = [
    compute_distance(float(longtitude), float(latitude))
    for longtitude, latitude in zip(station_data.longtitude, station_data.latitude)
]

station_data = pd.concat([station_data, pd.Series(np.array(distance_list))], axis=1)

In [11]:
# Rename the last column to "distance", keeping every other column name as is
new_column = [*station_data.columns[:-1], "distance"]
station_data.columns = new_column

In [12]:
# Preview the first rows of the station table again
station_data.head()

Unnamed: 0,station_number,longtitude,latitude,name,distance
0,2,26.074412,44.44677,Bucuresti,1392.306576
1,3,1.811446,50.901549,Calais,626.736599
2,4,1.075329,51.284212,Canterbury,693.356811
3,5,-3.543547,50.729172,Exeter,955.019886
4,7,9.733756,46.922368,"Fideris, Bahnhof",103.546375


In [13]:
# Keep only the stations whose distance is at most 10 km; everything farther
# away is discarded. Then preview the result.
within_range = station_data["distance"] <= 10
useful_data = station_data.loc[within_range]
useful_data.head()

Unnamed: 0,station_number,longtitude,latitude,name,distance
74,176,8.521961,47.351679,Zimmerberg-Basistunnel,3.25169
2084,8502220,8.434713,47.390882,Urdorf,8.06844
2085,8502221,8.437543,47.357432,Birmensdorf ZH,8.069963
2086,8502222,8.468175,47.325896,Bonstetten-Wettswil,7.954183
2092,8502229,8.43033,47.380971,Urdorf Weihermatt,8.280418


In [14]:
# Save the data to the csv file (without the DataFrame index)
useful_data.to_csv("train_station_id.csv", index=False)

# Save the DataFrame to pickle. The context manager guarantees the file handle
# is flushed and closed, which the bare open(...) call did not.
with open("train_station_id.p", "wb") as pickle_file:
    pickle.dump(useful_data, pickle_file)