## Content:
### 01. Import Libraries and Data
### 02. Consistency Checks
### 03. Data Wrangling
### 04. Create New Columns
### 05. Export Dataframe

## 01. Import Libraries and Data


In [1]:
# import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import os

In [2]:
# define path
path = r'C:\Users\Bilel\Desktop\data_analyst_Carrer Foundry\Data Immersion\A6-Advanced Analytics & Dashboard Design\20-01-24 NY Citi Bike project'

In [3]:
 # import data

df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'citibike.csv'), index_col = False)

In [4]:
df.head()

Unnamed: 0,trip_id,bike_id,weekday,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender
0,LnQzQk,16013,Mon,18,2013-09-09 18:18:55,523,W 38 St & 8 Ave,40.754666,-73.991382,2013-09-09 18:35:28,334,W 20 St & 7 Ave,40.742388,-73.997262,993,Subscriber,1968.0,2
1,IL9boN,15230,Thu,18,2013-09-12 18:38:53,257,Lispenard St & Broadway,40.719392,-74.002472,2013-09-12 18:48:34,236,St Marks Pl & 2 Ave,40.728419,-73.98714,581,Subscriber,1983.0,1
2,46clGB,17942,Wed,19,2013-09-18 19:44:04,479,9 Ave & W 45 St,40.760193,-73.991255,2013-09-18 19:50:05,513,W 56 St & 10 Ave,40.768254,-73.988639,361,Subscriber,1989.0,1
3,v7vdFt,19683,Sat,11,2013-09-28 11:54:37,527,E 33 St & 1 Ave,40.743156,-73.974347,2013-09-28 12:03:58,441,E 52 St & 2 Ave,40.756014,-73.967416,561,Subscriber,1988.0,2
4,VGBsb5,18024,Sat,18,2013-09-07 18:08:22,521,8 Ave & W 31 St,40.75045,-73.994811,2013-09-07 18:46:38,476,E 31 St & 3 Ave,40.743943,-73.979661,2296,Non-Subscriber,,0


In [5]:
df.shape

(50000, 18)

In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   trip_id                  50000 non-null  object 
 1   bike_id                  50000 non-null  int64  
 2   weekday                  50000 non-null  object 
 3   start_hour               50000 non-null  int64  
 4   start_time               50000 non-null  object 
 5   start_station_id         50000 non-null  int64  
 6   start_station_name       50000 non-null  object 
 7   start_station_latitude   50000 non-null  float64
 8   start_station_longitude  50000 non-null  float64
 9   end_time                 50000 non-null  object 
 10  end_station_id           50000 non-null  int64  
 11  end_station_name         50000 non-null  object 
 12  end_station_latitude     50000 non-null  float64
 13  end_station_longitude    50000 non-null  float64
 14  trip_duration         

In [7]:
df.describe()


Unnamed: 0,bike_id,start_hour,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,trip_duration,birth_year,gender
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,43021.0,50000.0
mean,17615.26936,14.14524,443.3215,40.73417,-73.991109,442.5397,40.733859,-73.991351,838.9829,1975.627786,1.07354
std,1675.407446,4.860541,356.559925,0.019911,0.012555,355.756022,0.019885,0.012569,573.663997,11.089001,0.589389
min,14556.0,0.0,72.0,40.680342,-74.017134,72.0,40.680342,-74.017134,60.0,1899.0,0.0
25%,16188.0,10.0,304.0,40.720196,-74.000271,304.0,40.720196,-74.001547,417.0,1968.0,1.0
50%,17584.0,15.0,402.0,40.735877,-73.990765,402.0,40.735354,-73.991218,672.0,1978.0,1.0
75%,19014.0,18.0,484.0,40.75002,-73.981923,483.0,40.749013,-73.98205,1112.0,1984.0,1.0
max,20642.0,23.0,3002.0,40.770513,-73.950048,3002.0,40.770513,-73.950048,2697.0,1997.0,2.0


## 02. Consistency Checks


In [8]:

# check for mixed-type columns
for col in df.columns.tolist():
    weird = (df[[col]].map(type) != df[[col]].iloc[0].map(type)).any(axis=1)
    if len(df[weird]) > 0:
        print(col)


In [9]:
# check for missing values
df.isnull().sum()


trip_id                       0
bike_id                       0
weekday                       0
start_hour                    0
start_time                    0
start_station_id              0
start_station_name            0
start_station_latitude        0
start_station_longitude       0
end_time                      0
end_station_id                0
end_station_name              0
end_station_latitude          0
end_station_longitude         0
trip_duration                 0
subscriber                    0
birth_year                 6979
gender                        0
dtype: int64

In [10]:
# create subset of data with missing birth years
df_nan = df[df['birth_year'].isnull() == True]


In [11]:
df_nan

Unnamed: 0,trip_id,bike_id,weekday,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender
4,VGBsb5,18024,Sat,18,2013-09-07 18:08:22,521,8 Ave & W 31 St,40.750450,-73.994811,2013-09-07 18:46:38,476,E 31 St & 3 Ave,40.743943,-73.979661,2296,Non-Subscriber,,0
10,aZsCdK,15732,Sat,20,2013-09-07 20:01:58,457,Broadway & W 58 St,40.766953,-73.981693,2013-09-07 20:23:51,512,W 29 St & 9 Ave,40.750073,-73.998393,1313,Non-Subscriber,,0
17,VqhF3S,15651,Mon,18,2013-09-30 18:32:48,363,West Thames St,40.708347,-74.017134,2013-09-30 19:01:59,514,12 Ave & W 40 St,40.760875,-74.002777,1751,Non-Subscriber,,0
26,9yZrq6,17506,Sun,16,2013-09-01 16:18:58,389,Broadway & Berry St,40.710446,-73.965251,2013-09-01 16:26:56,389,Broadway & Berry St,40.710446,-73.965251,478,Non-Subscriber,,0
46,t1KBiF,19206,Sun,22,2013-09-01 22:49:52,426,West St & Chambers St,40.717548,-74.013221,2013-09-01 23:06:48,247,Perry St & Bleecker St,40.735354,-74.004831,1016,Non-Subscriber,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49986,8wDcq8,19293,Sun,22,2013-09-29 22:00:43,469,Broadway & W 53 St,40.763441,-73.982681,2013-09-29 22:17:52,444,Broadway & W 24 St,40.742354,-73.989151,1029,Non-Subscriber,,0
49988,C0ivmQ,18118,Sun,12,2013-09-22 12:19:44,352,W 56 St & 6 Ave,40.763406,-73.977225,2013-09-22 12:29:55,464,E 56 St & 3 Ave,40.759345,-73.967597,611,Non-Subscriber,,0
49993,cvgHF4,18458,Wed,19,2013-09-25 19:01:07,284,Greenwich Ave & 8 Ave,40.739017,-74.002638,2013-09-25 19:19:11,329,Greenwich St & N Moore St,40.720434,-74.010206,1084,Non-Subscriber,,0
49995,Fb2bmC,17988,Sat,18,2013-09-21 18:39:20,479,9 Ave & W 45 St,40.760193,-73.991255,2013-09-21 20:08:59,478,11 Ave & W 41 St,40.760301,-73.998842,2697,Non-Subscriber,,0


In [12]:
# impute median value into missing birth years 
df['birth_year'] = df['birth_year'].fillna(df['birth_year'].median())


I chose to impute the median instead of mean to try to account for any outliers, and to reduce the risk of skewing the data. I did not want to delete 13.94% of the data and risk further analysis being drastically altered.

In [13]:
# second check for missing values
df.isnull().sum()


trip_id                    0
bike_id                    0
weekday                    0
start_hour                 0
start_time                 0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_time                   0
end_station_id             0
end_station_name           0
end_station_latitude       0
end_station_longitude      0
trip_duration              0
subscriber                 0
birth_year                 0
gender                     0
dtype: int64

In [14]:
# check for duplicates
df_dups = df[df.duplicated()]
df_dups

Unnamed: 0,trip_id,bike_id,weekday,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender


In [15]:
df.describe()

Unnamed: 0,bike_id,start_hour,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,trip_duration,birth_year,gender
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,17615.26936,14.14524,443.3215,40.73417,-73.991109,442.5397,40.733859,-73.991351,838.9829,1975.9589,1.07354
std,1675.407446,4.860541,356.559925,0.019911,0.012555,355.756022,0.019885,0.012569,573.663997,10.318811,0.589389
min,14556.0,0.0,72.0,40.680342,-74.017134,72.0,40.680342,-74.017134,60.0,1899.0,0.0
25%,16188.0,10.0,304.0,40.720196,-74.000271,304.0,40.720196,-74.001547,417.0,1970.0,1.0
50%,17584.0,15.0,402.0,40.735877,-73.990765,402.0,40.735354,-73.991218,672.0,1978.0,1.0
75%,19014.0,18.0,484.0,40.75002,-73.981923,483.0,40.749013,-73.98205,1112.0,1983.0,1.0
max,20642.0,23.0,3002.0,40.770513,-73.950048,3002.0,40.770513,-73.950048,2697.0,1997.0,2.0


In the descriptive stats above, it says that the minimum birth year is 1899, I highly doubt that someone born in 1899 would be renting a bike in 2013. I'm going to check the birth year colun for extreme values.

In [16]:
# check for birth years prior to 1913

df[df['birth_year'] <1913].shape

(23, 18)

Since there are less than 30 records with birth years prior to 1913, I will delete them from the dataset. Choosing 1913 was generous, however there are some spry centenarians out there!

In [17]:
df_new = df[df['birth_year'] > 1913]


In [18]:
df_new.shape


(49977, 18)

In [19]:
# making sure there are 7 days of the week
df_new['weekday'].unique()

array(['Mon', 'Thu', 'Wed', 'Sat', 'Fri', 'Sun', 'Tue'], dtype=object)

## 03. Data Wrangling

In [20]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49977 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   trip_id                  49977 non-null  object 
 1   bike_id                  49977 non-null  int64  
 2   weekday                  49977 non-null  object 
 3   start_hour               49977 non-null  int64  
 4   start_time               49977 non-null  object 
 5   start_station_id         49977 non-null  int64  
 6   start_station_name       49977 non-null  object 
 7   start_station_latitude   49977 non-null  float64
 8   start_station_longitude  49977 non-null  float64
 9   end_time                 49977 non-null  object 
 10  end_station_id           49977 non-null  int64  
 11  end_station_name         49977 non-null  object 
 12  end_station_latitude     49977 non-null  float64
 13  end_station_longitude    49977 non-null  float64
 14  trip_duration            49

In [21]:
# convert start_time data type to datetime
df_new['start_time'] = pd.to_datetime(df_new['start_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['start_time'] = pd.to_datetime(df_new['start_time'])


In [22]:
# convert end_time data type to datetime

df_new['end_time'] = pd.to_datetime(df_new['end_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['end_time'] = pd.to_datetime(df_new['end_time'])


In [23]:
df_new.dtypes

trip_id                            object
bike_id                             int64
weekday                            object
start_hour                          int64
start_time                 datetime64[ns]
start_station_id                    int64
start_station_name                 object
start_station_latitude            float64
start_station_longitude           float64
end_time                   datetime64[ns]
end_station_id                      int64
end_station_name                   object
end_station_latitude              float64
end_station_longitude             float64
trip_duration                       int64
subscriber                         object
birth_year                        float64
gender                              int64
dtype: object

In [24]:
# convert id_columns to object

cols = ['bike_id', 'start_station_id', 'end_station_id', 'gender']
for col in cols:
    df_new[col] = df_new[col].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new[col] = df_new[col].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new[col] = df_new[col].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new[col] = df_new[col].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [25]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49977 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   trip_id                  49977 non-null  object        
 1   bike_id                  49977 non-null  object        
 2   weekday                  49977 non-null  object        
 3   start_hour               49977 non-null  int64         
 4   start_time               49977 non-null  datetime64[ns]
 5   start_station_id         49977 non-null  object        
 6   start_station_name       49977 non-null  object        
 7   start_station_latitude   49977 non-null  float64       
 8   start_station_longitude  49977 non-null  float64       
 9   end_time                 49977 non-null  datetime64[ns]
 10  end_station_id           49977 non-null  object        
 11  end_station_name         49977 non-null  object        
 12  end_station_latitude     49977 non-nu

In [26]:
#changing column name from weekday to day of week
df_new.rename(columns = {'weekday':'day_of_week'}, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.rename(columns = {'weekday':'day_of_week'}, inplace = True)


In [27]:
df_new.head()

Unnamed: 0,trip_id,bike_id,day_of_week,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender
0,LnQzQk,16013,Mon,18,2013-09-09 18:18:55,523,W 38 St & 8 Ave,40.754666,-73.991382,2013-09-09 18:35:28,334,W 20 St & 7 Ave,40.742388,-73.997262,993,Subscriber,1968.0,2
1,IL9boN,15230,Thu,18,2013-09-12 18:38:53,257,Lispenard St & Broadway,40.719392,-74.002472,2013-09-12 18:48:34,236,St Marks Pl & 2 Ave,40.728419,-73.98714,581,Subscriber,1983.0,1
2,46clGB,17942,Wed,19,2013-09-18 19:44:04,479,9 Ave & W 45 St,40.760193,-73.991255,2013-09-18 19:50:05,513,W 56 St & 10 Ave,40.768254,-73.988639,361,Subscriber,1989.0,1
3,v7vdFt,19683,Sat,11,2013-09-28 11:54:37,527,E 33 St & 1 Ave,40.743156,-73.974347,2013-09-28 12:03:58,441,E 52 St & 2 Ave,40.756014,-73.967416,561,Subscriber,1988.0,2
4,VGBsb5,18024,Sat,18,2013-09-07 18:08:22,521,8 Ave & W 31 St,40.75045,-73.994811,2013-09-07 18:46:38,476,E 31 St & 3 Ave,40.743943,-73.979661,2296,Non-Subscriber,1978.0,0


In [28]:
# making sure there are 7 days of the week

df_new['day_of_week'].unique()

array(['Mon', 'Thu', 'Wed', 'Sat', 'Fri', 'Sun', 'Tue'], dtype=object)

## 04. Create New Columns

In [29]:
# create trip_duration_minutes column to convert trip time from seconds to minutes
df_new['trip_duration_minutes'] = df_new['trip_duration'] // 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['trip_duration_minutes'] = df_new['trip_duration'] // 60


In [30]:
df_new.head()

Unnamed: 0,trip_id,bike_id,day_of_week,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender,trip_duration_minutes
0,LnQzQk,16013,Mon,18,2013-09-09 18:18:55,523,W 38 St & 8 Ave,40.754666,-73.991382,2013-09-09 18:35:28,334,W 20 St & 7 Ave,40.742388,-73.997262,993,Subscriber,1968.0,2,16
1,IL9boN,15230,Thu,18,2013-09-12 18:38:53,257,Lispenard St & Broadway,40.719392,-74.002472,2013-09-12 18:48:34,236,St Marks Pl & 2 Ave,40.728419,-73.98714,581,Subscriber,1983.0,1,9
2,46clGB,17942,Wed,19,2013-09-18 19:44:04,479,9 Ave & W 45 St,40.760193,-73.991255,2013-09-18 19:50:05,513,W 56 St & 10 Ave,40.768254,-73.988639,361,Subscriber,1989.0,1,6
3,v7vdFt,19683,Sat,11,2013-09-28 11:54:37,527,E 33 St & 1 Ave,40.743156,-73.974347,2013-09-28 12:03:58,441,E 52 St & 2 Ave,40.756014,-73.967416,561,Subscriber,1988.0,2,9
4,VGBsb5,18024,Sat,18,2013-09-07 18:08:22,521,8 Ave & W 31 St,40.75045,-73.994811,2013-09-07 18:46:38,476,E 31 St & 3 Ave,40.743943,-73.979661,2296,Non-Subscriber,1978.0,0,38


In [31]:
# create new column for age of customer
df_new['birth_year'] = df_new['birth_year'].astype(int)
df_new['age'] = (2013 - df_new['birth_year'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['birth_year'] = df_new['birth_year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['age'] = (2013 - df_new['birth_year'])


In [32]:
df_new.head()

Unnamed: 0,trip_id,bike_id,day_of_week,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender,trip_duration_minutes,age
0,LnQzQk,16013,Mon,18,2013-09-09 18:18:55,523,W 38 St & 8 Ave,40.754666,-73.991382,2013-09-09 18:35:28,334,W 20 St & 7 Ave,40.742388,-73.997262,993,Subscriber,1968,2,16,45
1,IL9boN,15230,Thu,18,2013-09-12 18:38:53,257,Lispenard St & Broadway,40.719392,-74.002472,2013-09-12 18:48:34,236,St Marks Pl & 2 Ave,40.728419,-73.98714,581,Subscriber,1983,1,9,30
2,46clGB,17942,Wed,19,2013-09-18 19:44:04,479,9 Ave & W 45 St,40.760193,-73.991255,2013-09-18 19:50:05,513,W 56 St & 10 Ave,40.768254,-73.988639,361,Subscriber,1989,1,6,24
3,v7vdFt,19683,Sat,11,2013-09-28 11:54:37,527,E 33 St & 1 Ave,40.743156,-73.974347,2013-09-28 12:03:58,441,E 52 St & 2 Ave,40.756014,-73.967416,561,Subscriber,1988,2,9,25
4,VGBsb5,18024,Sat,18,2013-09-07 18:08:22,521,8 Ave & W 31 St,40.75045,-73.994811,2013-09-07 18:46:38,476,E 31 St & 3 Ave,40.743943,-73.979661,2296,Non-Subscriber,1978,0,38,35


In [33]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49977 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   trip_id                  49977 non-null  object        
 1   bike_id                  49977 non-null  object        
 2   day_of_week              49977 non-null  object        
 3   start_hour               49977 non-null  int64         
 4   start_time               49977 non-null  datetime64[ns]
 5   start_station_id         49977 non-null  object        
 6   start_station_name       49977 non-null  object        
 7   start_station_latitude   49977 non-null  float64       
 8   start_station_longitude  49977 non-null  float64       
 9   end_time                 49977 non-null  datetime64[ns]
 10  end_station_id           49977 non-null  object        
 11  end_station_name         49977 non-null  object        
 12  end_station_latitude     49977 non-nu

In [34]:
df_new.describe()

Unnamed: 0,start_hour,start_time,start_station_latitude,start_station_longitude,end_time,end_station_latitude,end_station_longitude,trip_duration,birth_year,trip_duration_minutes,age
count,49977.0,49977,49977.0,49977.0,49977,49977.0,49977.0,49977.0,49977.0,49977.0,49977.0
mean,14.146027,2013-09-16 14:32:00.605778688,40.734171,-73.991106,2013-09-16 14:47:12.717830400,40.733861,-73.991348,838.877624,1975.993717,13.480821,37.006283
min,0.0,2013-09-01 00:00:35,40.680342,-74.017134,2013-09-01 00:07:15,40.680342,-74.017134,60.0,1917.0,1.0,16.0
25%,10.0,2013-09-09 10:25:48,40.720196,-74.000271,2013-09-09 10:41:15,40.720196,-74.001547,417.0,1970.0,6.0,30.0
50%,15.0,2013-09-16 18:54:24,40.735877,-73.990765,2013-09-16 19:12:12,40.735354,-73.991218,672.0,1978.0,11.0,35.0
75%,18.0,2013-09-23 21:56:14,40.75002,-73.981923,2013-09-23 22:07:20,40.749013,-73.98205,1112.0,1983.0,18.0,43.0
max,23.0,2013-09-30 23:58:17,40.770513,-73.950048,2013-10-01 00:15:29,40.770513,-73.950048,2697.0,1997.0,44.0,96.0
std,4.860599,,0.019911,0.012552,,0.019885,0.012569,573.675762,10.19261,9.53439,10.19261


## 05. Export Dataframe

In [35]:
df_new.to_csv(os.path.join(path,'02 Data','Prepared Data', 'citi_clean.csv'))