# Bootstrap

# SURVIVE THE TITANIC

Given a table of the ship's passengers, named `titanic.csv`, you are about to find out what may have
happened in the early morning hours of 15 April 1912.

## GETTING THE DATAFRAMES

In [183]:
import numpy as np
import pandas as pd

In [184]:
# Load the passenger table from the working directory and preview its first rows.
titanic_path = './titanic.csv'
df = pd.read_csv(titanic_path)
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [185]:
# Count how many passengers have a recorded age versus a missing one.
df['age'].isna().map({True: 'null', False: 'non-null'}).value_counts()

non-null    1046
null         263
Name: age, dtype: int64

## EXTRACTING STATISTICS

In [186]:
# Rows of passengers flagged as survivors (survived == 1).
survived_mask = df['survived'] == 1
df_survived = df[survived_mask]

In [187]:
# Summary of the survivors: head count, mean age and mean fare.
# Series.mean() ignores missing values, so the age mean only covers known ages.
survived_stats = pd.Series(
    data=[
        len(df_survived),
        df_survived['age'].mean(),
        df_survived['fare'].mean(),
    ],
    index=["total", "age", "fare"],
)
survived_stats

total    500.000000
age       28.918228
fare      49.361184
dtype: float64

## GROUPS ALONG CHARACTERISTICS

In [188]:
# Survival rate per passenger class, rendered as a percentage string.
# Classes are listed in order of first appearance in the table.
pclass_rates = {}
for pclass_value in df['pclass'].unique():
    in_class = df[df['pclass'] == pclass_value]
    pclass_rates[pclass_value] = f"{in_class['survived'].mean():.1%}"

pclass_survive_prob = pd.Series(pclass_rates, name="pclass")
pclass_survive_prob

1    61.9%
2    43.0%
3    25.5%
Name: pclass, dtype: object

In [189]:
# Survival rate per sex, rendered as a percentage string.
# Values are listed in order of first appearance in the table.
sex_rates = {}
for sex_value in df['sex'].unique():
    of_sex = df[df['sex'] == sex_value]
    sex_rates[sex_value] = f"{of_sex['survived'].mean():.1%}"

sex_survive_prob = pd.Series(sex_rates, name="sex")
sex_survive_prob

female    72.7%
male      19.1%
Name: sex, dtype: object

In [190]:
# Youngest and oldest recorded ages; used to decide the span of the age bins.
# Series.min()/max() skip missing values by default.
df['age'].min(), df['age'].max()

(0.1667, 80.0)

In [191]:
# Bucket passengers into ten-year age bins and compute each bin's survival rate.
#
# `ranges` holds (low, high) integer pairs such as (0, 9), (10, 19), ... which are
# used for the labels. Membership is tested on the half-open interval
# [low, high + 1) so that fractional ages falling between two labels
# (e.g. 9.5 or 19.5) are counted in the lower bin instead of being dropped.
# Passengers with a missing age match no bin and are excluded.
AGE_BIN_WIDTH = 10

ranges = [
    (int(low), int(low) + AGE_BIN_WIDTH - 1)
    for low in np.arange(df['age'].max() + 1, step=AGE_BIN_WIDTH, dtype=int)
]

age_survive_prob = pd.Series({
    f"{low}-{high}":
    f"{df[(df['age'] >= low) & (df['age'] < high + 1)]['survived'].mean():.1%}"
    for low, high in ranges
})
age_survive_prob.name = "age"
age_survive_prob

0-9       61.0%
10-19     39.2%
20-29     36.9%
30-39     42.2%
40-49     38.5%
50-59     45.7%
60-69     31.2%
70-79     14.3%
80-89    100.0%
Name: age, dtype: object

# WELCOME TO THE JUNGLE OF TRAIN SYSTEM DATA

In real life, data is not usually given to you in a neat, single file; instead, you have to merge together different
sources. You will find the data scattered among several files in the archive `data_sncf.zip`.

## WALKING THROUGH THE DATA

In [192]:
# Load the stop catalogue and the per-trip stop schedule.
stops_df, stop_times_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data-sncf/stops.txt', './data-sncf/stop_times.txt')
)

In [193]:
# Preview the first rows of the stops table.
stops_df.head()

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,StopArea:OCE87381509,Gare de Mantes-la-Jolie,,48.989687,1.703294,,,1,
1,StopArea:OCE87415604,Gare de Vernon-Giverny,,49.091286,1.478363,,,1,
2,StopArea:OCE87415620,Gare de Gaillon-Aubevoye,,49.174632,1.352518,,,1,
3,StopArea:OCE87415877,Gare de Val-de-Reuil,,49.275399,1.224609,,,1,
4,StopArea:OCE87411207,Gare de Oissel,,49.343042,1.101821,,,1,


In [194]:
# Preview the first rows of the stop-times table.
stop_times_df.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,OCESN037071R0100119847,23:05:00,23:05:00,StopPoint:OCECar TER-87381509,0,,0,0,
1,OCESN037071R0100119847,23:35:00,23:35:00,StopPoint:OCECar TER-87415604,1,,0,0,
2,OCESN037071R0100119847,23:55:00,23:55:00,StopPoint:OCECar TER-87415620,2,,0,0,
3,OCESN037071R0100119847,24:25:00,24:25:00,StopPoint:OCECar TER-87415877,3,,0,0,
4,OCESN037071R0100119847,24:45:00,24:45:00,StopPoint:OCECar TER-87411207,4,,0,0,


In [195]:
# (rows, columns) of the stops table and of the stop-times table.
stops_df.shape, stop_times_df.shape

((9176, 9), (209129, 9))

In [196]:
# One row per scheduled stop: trip, departure time, stop id and the stop's name.
# The name is looked up in stops_df by stop_id; rows are then ordered by
# station name and departure time. The original row labels are kept.
stop_name_by_id = stops_df.set_index('stop_id')['stop_name']
train_station_df = (
    stop_times_df[['trip_id', 'departure_time', 'stop_id']]
    .join(stop_name_by_id, on='stop_id')
    .sort_values(['stop_name', 'departure_time'])
)

In [197]:
# Distinct departure times at one station, in the frame's (already sorted) order.
station_name = 'Gare de Oissel'
at_station = train_station_df['stop_name'] == station_name
train_station_df.loc[at_station, 'departure_time'].unique()

array(['05:24:00', '05:46:00', '06:00:00', '06:19:00', '06:23:00',
       '06:25:00', '06:35:00', '06:46:00', '06:49:00', '06:59:00',
       '07:08:00', '07:19:00', '07:23:00', '07:40:00', '07:45:00',
       '07:59:00', '08:00:00', '08:23:00', '08:37:00', '08:40:00',
       '08:53:00', '08:56:00', '09:00:00', '09:23:00', '09:37:00',
       '09:59:00', '10:00:00', '10:23:00', '10:26:00', '10:59:00',
       '11:00:00', '11:23:00', '11:37:00', '11:59:00', '12:00:00',
       '12:23:00', '12:37:00', '12:59:00', '13:00:00', '13:20:00',
       '13:23:00', '13:37:00', '13:58:00', '13:59:00', '14:00:00',
       '14:20:00', '14:23:00', '15:00:00', '15:23:00', '15:37:00',
       '15:53:00', '15:59:00', '16:00:00', '16:23:00', '16:37:00',
       '16:59:00', '17:00:00', '17:23:00', '17:37:00', '17:59:00',
       '18:00:00', '18:20:00', '18:22:00', '18:23:00', '18:24:00',
       '18:37:00', '18:59:00', '19:00:00', '19:14:00', '19:23:00',
       '19:37:00', '19:38:00', '19:59:00', '20:00:00', '20:14:

## MERGING INFORMATION

In [205]:
# Trips leaving a given station strictly before a cut-off time.
# Times are compared as strings, which relies on the zero-padded HH:MM:SS
# format seen in the data.
station_name = 'Gare de Paris-Est'
before = '10:00:00'

at_station = train_station_df['stop_name'] == station_name
leaves_early = train_station_df['departure_time'] < before
# NOTE(review): `filter` shadows the Python builtin; the name is kept here in
# case later cells read it — consider renaming once all uses are checked.
filter = at_station & leaves_early
train_station_df.loc[filter, ['trip_id', 'departure_time']]

Unnamed: 0,trip_id,departure_time
143499,OCESN839561F0800838467,05:42:00
143590,OCESN839551F3403438301,06:34:00
148677,OCESN839131F1201238020,06:36:00
148698,OCESN839131F1301338019,06:36:00
148712,OCESN839131F0900938022,06:36:00
...,...,...
149279,OCESN839162F0900938089,09:55:00
149291,OCESN839162F1401438085,09:55:00
144392,OCESN839550F8108138273,09:57:00
144401,OCESN839550F5305338270,09:59:00
