In [1]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
# Read the raw ride and weather tables from the local data directory.
rides_df = pd.read_csv('data/cab_rides.csv')
weather_df = pd.read_csv('data/weather.csv')

# 'product_id' is not used by any later cell, so it is discarded right away.
rides_df = rides_df.drop(columns=['product_id'])

In [3]:
def _epoch_to_local_datetime(epoch, units_per_second=1):
    """Convert a Series of Unix epoch values to naive local datetimes.

    Parameters
    ----------
    epoch : pandas.Series
        Numeric epoch values.
    units_per_second : int or float, default 1
        How many `epoch` units make up one second (1000 for milliseconds).

    Returns
    -------
    pandas.Series
        Datetime values truncated to whole seconds.

    Notes
    -----
    `datetime.datetime.fromtimestamp` interprets the epoch in the local
    timezone of the machine running the notebook, so the wall-clock values
    depend on where this is executed.
    """
    local = (epoch / units_per_second).map(datetime.datetime.fromtimestamp)
    # Flooring to seconds keeps the whole-second resolution without formatting
    # every value as a locale-dependent '%c' string and parsing it back.
    return pd.to_datetime(local).dt.floor('s')


# Ride timestamps are divided by 1000 (milliseconds); weather timestamps are
# used as-is (seconds).
rides_df['date_time'] = _epoch_to_local_datetime(rides_df['time_stamp'], 1000)
weather_df['date_time'] = _epoch_to_local_datetime(weather_df['time_stamp'])

In [4]:
# Build the join key "<place> - <YYYY-MM-DD> <hour>" on both frames.
# Rides are keyed by their 'source', weather rows by their 'location'.
for frame, place_col in ((rides_df, 'source'), (weather_df, 'location')):
    stamp = frame['date_time']
    frame['merge_date'] = (
        frame[place_col].astype('str')
        + ' - '
        + stamp.dt.date.astype('str')
        + ' '
        + stamp.dt.hour.astype('str')
    )


In [5]:
# Count missing values per column. The DataFrame method is used instead of
# np.sum(df), which forwards axis=None to DataFrame.sum; newer pandas
# deprecates treating that as a per-column reduction.
rides_df.isna().sum()

distance                0
cab_type                0
time_stamp              0
destination             0
source                  0
price               55095
surge_multiplier        0
id                      0
name                    0
date_time               0
merge_date              0
dtype: int64

In [6]:
# Discard every ride row that still has a missing value in any column.
rides_df = rides_df.dropna(axis=0, how='any')

In [7]:
# Print column dtypes and non-null counts to verify the frame after dropna.
rides_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 637976 entries, 0 to 693070
Data columns (total 11 columns):
distance            637976 non-null float64
cab_type            637976 non-null object
time_stamp          637976 non-null int64
destination         637976 non-null object
source              637976 non-null object
price               637976 non-null float64
surge_multiplier    637976 non-null float64
id                  637976 non-null object
name                637976 non-null object
date_time           637976 non-null datetime64[ns]
merge_date          637976 non-null object
dtypes: datetime64[ns](1), float64(3), int64(1), object(6)
memory usage: 58.4+ MB


In [8]:
# Attach weather readings to each ride via the shared 'merge_date' key, then
# strip helper columns, one-hot encode the categorical fields and keep a
# single row per ride id.
# NOTE(review): the de-duplication on 'id' suggests the join can yield several
# weather rows per ride; only the first one is retained - confirm this is the
# intended choice.
full_df = (
    rides_df
    .merge(weather_df, how='left', on='merge_date',
           suffixes=('_rides', '_weather'))
    .drop(columns=['date_time_weather', 'time_stamp_weather',
                   'time_stamp_rides', 'merge_date',
                   'location'])
    .pipe(pd.get_dummies,
          columns=['cab_type', 'destination', 'source', 'name'])
    .drop_duplicates(subset='id', keep='first')
)
full_df.head()

Unnamed: 0,distance,price,surge_multiplier,id,date_time_rides,temp,clouds,pressure,rain,humidity,...,name_Lux,name_Lux Black,name_Lux Black XL,name_Lyft,name_Lyft XL,name_Shared,name_UberPool,name_UberX,name_UberXL,name_WAV
0,0.44,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,2018-12-16 04:30:07,38.46,0.29,1022.25,,0.76,...,0,0,0,0,0,1,0,0,0,0
1,0.44,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,2018-11-26 21:00:23,44.31,1.0,1003.17,0.1123,0.9,...,1,0,0,0,0,0,0,0,0,0
3,0.44,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,2018-11-27 20:00:22,,,,,,...,0,0,0,1,0,0,0,0,0,0
4,0.44,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,2018-11-29 23:53:02,35.08,0.0,1013.71,,0.7,...,0,0,1,0,0,0,0,0,0,0
5,0.44,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,2018-11-28 22:49:20,37.58,0.42,998.64,,0.71,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# Replace missing 'rain' values with 0; all other columns are left untouched.
# NOTE(review): presumably a missing reading means no rain - confirm.
full_df = full_df.fillna({'rain': 0})

In [10]:
# Per-column count of the missing values left in the merged frame.
full_df.isna().sum(axis=0)

distance                                  0
price                                     0
surge_multiplier                          0
id                                        0
date_time_rides                           0
temp                                   2734
clouds                                 2734
pressure                               2734
rain                                      0
humidity                               2734
wind                                   2734
cab_type_Lyft                             0
cab_type_Uber                             0
destination_Back Bay                      0
destination_Beacon Hill                   0
destination_Boston University             0
destination_Fenway                        0
destination_Financial District            0
destination_Haymarket Square              0
destination_North End                     0
destination_North Station                 0
destination_Northeastern University       0
destination_South Station       

In [11]:
# Remove the rows that still contain a missing value in any column.
full_df = full_df.dropna(axis=0, how='any')

In [12]:
# The ride identifier was only needed for de-duplication; drop the column.
full_df = full_df.drop(columns=['id'])

In [13]:
# Persist the merged, encoded frame. to_csv writes the row index by default,
# so the file gets a leading unnamed index column.
full_df.to_csv('data/rides_weather_df.csv')