# Preppin' Data 2022 Week 4

#### Load data

In [81]:
import pandas as pd

In [82]:
# Read the pupil input file into a DataFrame.
pupils = pd.read_csv(filepath_or_buffer='PD 2022 Wk 1 Input - Input.csv')

In [83]:
# Read the travel-preference file into a DataFrame.
pref = pd.read_csv(
    filepath_or_buffer='PD 2021 WK 1 to 4 ideas - Preferences of Travel.csv',
)

#### Join the data sets together based on their common field

In [84]:
# Inner join: keep only rows whose pupils['id'] matches a pref['Student ID'].
df = pd.merge(
    pupils,
    pref,
    how='inner',
    left_on='id',
    right_on='Student ID',
)

In [85]:
df.head()

Unnamed: 0,id,pupil first name,pupil last name,gender,Date of Birth,Parental Contact Name_1,Parental Contact Name_2,Preferred Contact Employer,Parental Contact,Student ID,M,Tu,W,Th,F
0,1,Ronna,Nellies,Female,12/21/2013,Purcell,Ketti,Demizz,1,1,Car,Car,Car,Car,Bycycle
1,2,Rusty,Andriulis,Male,7/21/2012,Vassili,Rivi,Brainbox,1,2,Bicycle,Bicycle,Bicycle,Walk,Walk
2,3,Roberta,Oakeshott,Female,12/4/2011,Lind,Haskell,Centidel,2,3,Car,Bicycle,Carr,Walk,Car
3,4,Lola,Rubinfajn,Male,6/29/2012,Elie,Tresa,Edgeblab,2,4,Scooter,Scooter,Scootr,Scooter,Scoter
4,5,Kamila,Benedtti,Female,7/10/2012,Adela,Clevey,Trudoo,1,5,Bycycle,Carr,Scoter,Walkk,Scoter


#### Remove any fields you don't need for the challenge

In [86]:
# Discard the pupil-detail columns; only 'Student ID' and the weekday
# columns are needed from here on.
df = df.drop(
    columns=[
        'id',
        'pupil first name',
        'pupil last name',
        'gender',
        'Date of Birth',
        'Parental Contact Name_1',
        'Parental Contact Name_2',
        'Preferred Contact Employer',
        'Parental Contact',
    ],
)

In [87]:
df.head()

Unnamed: 0,Student ID,M,Tu,W,Th,F
0,1,Car,Car,Car,Car,Bycycle
1,2,Bicycle,Bicycle,Bicycle,Walk,Walk
2,3,Car,Bicycle,Carr,Walk,Car
3,4,Scooter,Scooter,Scootr,Scooter,Scoter
4,5,Bycycle,Carr,Scoter,Walkk,Scoter


#### Change the weekdays from separate columns to one column of weekdays and one of the pupil's travel choice

In [88]:
# Unpivot the five weekday columns into long form: one row per pupil per
# weekday, with the default 'variable' / 'value' column names.
df = pd.melt(
    df,
    id_vars=['Student ID'],
    value_vars=['M', 'Tu', 'W', 'Th', 'F'],
)

In [89]:
# Give the melt output columns descriptive names.
df = df.rename(
    {'variable': 'Weekday', 'value': 'Method of Travel'},
    axis='columns',
)

In [90]:
df.head()

Unnamed: 0,Student ID,Weekday,Method of Travel
0,1,M,Car
1,2,M,Bicycle
2,3,M,Car
3,4,M,Scooter
4,5,M,Bycycle


#### Group the travel choices together to remove spelling mistakes

In [91]:
# Distinct raw spellings of the travel method, used to spot the typos.
method = df.loc[:, 'Method of Travel'].unique()

In [92]:
method

array(['Car', 'Bicycle', 'Scooter', 'Bycycle', 'Walk', 'Aeroplane',
       'Helicopter', 'Van', "Mum's Shoulders", 'Hopped', 'Carr', 'Walkk',
       "Dad's Shoulders", 'Skipped', 'Scootr', 'Scoter', 'Wallk',
       'Jumped', 'Helicopeter', 'WAlk', 'Waalk'], dtype=object)

In [93]:
# Map every misspelt travel method onto its canonical spelling.
# Entries are grouped by the corrected value; values not listed are kept.
df['Method of Travel'] = df['Method of Travel'].replace(
    to_replace={
        # -> Bicycle
        'Bycycle': 'Bicycle',
        # -> Car
        'Carr': 'Car',
        # -> Helicopter
        'Helicopeter': 'Helicopter',
        # -> Scooter
        'Scootr': 'Scooter',
        'Scoter': 'Scooter',
        # -> Walk
        'Walkk': 'Walk',
        'WAlk': 'Walk',
        'Wallk': 'Walk',
        'Waalk': 'Walk',
    },
)
    

In [94]:
df.head()

Unnamed: 0,Student ID,Weekday,Method of Travel
0,1,M,Car
1,2,M,Bicycle
2,3,M,Car
3,4,M,Scooter
4,5,M,Bicycle


#### Create a Sustainable (non-motorised) vs Non-Sustainable (motorised) data field 

In [95]:
df['Method of Travel'].unique()

array(['Car', 'Bicycle', 'Scooter', 'Walk', 'Aeroplane', 'Helicopter',
       'Van', "Mum's Shoulders", 'Hopped', "Dad's Shoulders", 'Skipped',
       'Jumped'], dtype=object)

In [96]:
# Travel methods treated as sustainable (non-motorised); any method not
# listed here is labelled 'Non-Sustainable' downstream.
sustainable = [
    'Bicycle',
    'Scooter',
    'Walk',
    "Mum's Shoulders",
    'Hopped',
    "Dad's Shoulders",
    'Skipped',
    'Jumped',
]

In [97]:
# Label each row by whether its travel method is in `sustainable`.
# A vectorised membership test replaces the per-row Python lambda: `isin`
# yields a boolean Series, and `map` turns each flag into the same labels.
df['Sustainable?'] = (
    df['Method of Travel']
    .isin(sustainable)
    .map({True: 'Sustainable', False: 'Non-Sustainable'})
)

In [98]:
df.head()

Unnamed: 0,Student ID,Weekday,Method of Travel,Sustainable?
0,1,M,Car,Non-Sustainable
1,2,M,Bicycle,Sustainable
2,3,M,Car,Non-Sustainable
3,4,M,Scooter,Sustainable
4,5,M,Bicycle,Sustainable


#### Total up the number of pupils travelling by each method of travel

In [99]:
# Count the rows in every (method, weekday) group. Because the count is
# taken on the 'Student ID' selection, the resulting count column is still
# named 'Student ID' after reset_index.
df_sum = (
    df.groupby(['Method of Travel', 'Weekday'])['Student ID']
    .size()
    .reset_index()
)

In [100]:
# The count column inherited the name 'Student ID'; relabel it.
df_sum = df_sum.rename({'Student ID': 'Number of Trips'}, axis='columns')

In [101]:
df_sum.head()

Unnamed: 0,Method of Travel,Weekday,Number of Trips
0,Aeroplane,F,9
1,Aeroplane,M,9
2,Aeroplane,Th,9
3,Aeroplane,Tu,9
4,Aeroplane,W,9


In [102]:
# Sum the per-method counts to get the total number of trips per weekday.
# as_index=False keeps 'Weekday' as a regular column.
df_trips = df_sum.groupby('Weekday', as_index=False)['Number of Trips'].sum()

In [103]:
# Rename the daily total so it does not clash with the per-method count
# column when both are merged onto the same frame.
df_trips = df_trips.rename({'Number of Trips': 'Trips per day'}, axis='columns')

In [104]:
df_trips.head()

Unnamed: 0,Weekday,Trips per day
0,F,1000
1,M,1000
2,Th,1000
3,Tu,1000
4,W,1000


In [105]:
# Attach the per-(method, weekday) trip count to every row.
df = pd.merge(
    df,
    df_sum,
    how='left',
    on=['Method of Travel', 'Weekday'],
)

In [106]:
# Attach the weekday's total trip count to every row.
df = pd.merge(df, df_trips, how='left', on='Weekday')

In [107]:
df.head()

Unnamed: 0,Student ID,Weekday,Method of Travel,Sustainable?,Number of Trips,Trips per day
0,1,M,Car,Non-Sustainable,422,1000
1,2,M,Bicycle,Sustainable,210,1000
2,3,M,Car,Non-Sustainable,422,1000
3,4,M,Scooter,Sustainable,84,1000
4,5,M,Bicycle,Sustainable,210,1000


#### Work out the % of trips taken by each method of travel each day

In [111]:
# Share of the day's trips taken by this method, stored as a fraction
# (e.g. 0.42) rounded to two decimal places.
df['% of trips per day'] = (df['Number of Trips'] / df['Trips per day']).round(2)

In [112]:
df.head()

Unnamed: 0,Student ID,Weekday,Method of Travel,Sustainable?,Number of Trips,Trips per day,% of trips per day
0,1,M,Car,Non-Sustainable,422,1000,0.42
1,2,M,Bicycle,Sustainable,210,1000,0.21
2,3,M,Car,Non-Sustainable,422,1000,0.42
3,4,M,Scooter,Sustainable,84,1000,0.08
4,5,M,Bicycle,Sustainable,210,1000,0.21


#### Output the data

In [115]:
# The output is per method and weekday, so the pupil identifier is dropped.
df = df.drop(columns='Student ID')

In [118]:
# With 'Student ID' gone, many rows repeat; keep the first occurrence of
# each distinct row, compared across all columns.
df = df.drop_duplicates(subset=None, keep='first')

In [117]:
# Write the result to CSV without the DataFrame index column.
df.to_csv(path_or_buf='2022W04.csv', index=False)