# Workshop de Python

Cargamos un dataset con los viajes en taxi de Chicago y otro con los datos meteorológicos por hora de la ciudad. El objetivo es, una vez unidos y limpiados los datos, estudiarlos para extraer el máximo número de insights.

## 1 Load the libraries

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
pd.options.display.max_columns = None

## 2 Read the data

### 2.1 Taxi trips

In [2]:
# Compact dtypes for the taxi-trips CSV, intended to be passed to
# `pd.read_csv(..., dtype=dtypes_taxi)` to reduce memory usage.
#
# Widths are chosen so the value ranges reported by `describe()` on this data
# survive the downcast:
#   * float16 tops out at 65504 and keeps only ~3 significant digits, so it
#     would overflow the longest trips (83520 s), round fares to the nearest
#     few dollars and collapse the coordinates; float32 is used instead.
#   * trip_seconds is an integer count, so it is stored as int32.
#   * trip_miles holds fractional values (e.g. 0.1), so it must be a float;
#     an integer dtype makes read_csv raise.
#   * community areas range from 1 to 77, which fits in int8.
# NOTE(review): trip_id looks like a per-row identifier; if it is unique,
# 'category' saves no memory for it - confirm before enabling this mapping.
dtypes_taxi = {'trip_id':'category',
              'taxi_id':'category',
              'trip_start_timestamp':'category',
              'trip_end_timestamp':'category',
              'trip_seconds':'int32',
              'trip_miles':'float32',
              'pickup_community_area':'int8',
              'dropoff_community_area':'int8',
              'fare':'float32',
              'tips':'float32',
              'tolls':'float32',
              'extras':'float32',
              'trip_total':'float32',
              'payment_type':'category',
              'company':'category',
              'pickup_centroid_latitude':'float32',
              'pickup_centroid_longitude':'float32',
              'dropoff_centroid_latitude':'float32',
              'dropoff_centroid_longitude':'float32'
              }

In [3]:
# Load the cleaned Chicago taxi trips from the CSV, letting pandas infer the dtypes.
# Optional read_csv arguments previously tried here: nrows=2000000, dtype=dtypes_taxi.
taxi_trips = pd.read_csv('../Data/chicago_data_clean.csv.gz')

#### 2.1.1 Check the correct reading of the data

In [4]:
# Preview five randomly chosen rows to sanity-check the parsed CSV.
# No random_state is passed, so a different sample is shown on every run.
taxi_trips.sample(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude
20457661,aefe4c0cfc9db474d65533d8d4b06010338b1dde,96e2479c0b0a37ca6e98a08153c2d5062d96e5d831737d...,2014-09-05 17:45:00,2014-09-05 18:00:00,660,0.1,6,5,7.45,2.0,0.0,0.0,9.45,Credit Card,Taxi Affiliation Services,41.944227,-87.655998,41.947792,-87.683835
9229541,6096af16834e71c8e70fabf2d3d64f3c2bb98e6d,f454eed0504cea35dec37d008e940841cd60610ddacc5e...,2015-05-10 18:30:00,2015-05-10 18:30:00,120,0.5,7,7,4.25,2.0,0.0,0.0,6.25,Credit Card,Taxi Affiliation Services,41.914616,-87.631717,41.914616,-87.631717
13188981,7c38029229197f4850413ad77d9f925f8dbfce04,c61c244a477893ef6ae3e9f3b98c74df49945c59048d92...,2016-07-08 09:30:00,2016-07-08 09:45:00,300,1.0,28,28,6.0,1.0,0.0,0.0,7.0,Credit Card,Dispatch Taxi Affiliation,41.879255,-87.642649,41.879067,-87.657005
28150117,e4baea3b1a774a80148272f646453b944868b3b4,dc0668f5377f0c62a08db559e8853b68550b5373c7d74f...,2013-07-11 20:00:00,2013-07-11 21:00:00,3540,1.0,76,7,37.05,9.75,0.0,2.0,48.8,Credit Card,Blue Ribbon Taxi Association Inc.,41.979071,-87.90304,41.914616,-87.631717
19064088,a54473b785d08cad4cdbc6ca6eaa402c4ce82bcc,42bfc19863617733aaa5ef96de670e3307690921a42d3d...,2014-08-25 15:30:00,2014-08-25 15:45:00,1140,6.2,12,1,16.25,0.0,0.0,1.0,17.25,Cash,Choice Taxi Association,41.99393,-87.758354,42.009623,-87.670167


In [5]:
# List the dtype pandas inferred for every column of the trips table.
taxi_trips.dtypes

trip_id                        object
taxi_id                        object
trip_start_timestamp           object
trip_end_timestamp             object
trip_seconds                    int64
trip_miles                    float64
pickup_community_area           int64
dropoff_community_area          int64
fare                          float64
tips                          float64
tolls                         float64
extras                        float64
trip_total                    float64
payment_type                   object
company                        object
pickup_centroid_latitude      float64
pickup_centroid_longitude     float64
dropoff_centroid_latitude     float64
dropoff_centroid_longitude    float64
dtype: object

#### 2.1.2 Check the shape

In [6]:
# (rows, columns) of the loaded trips table.
taxi_trips.shape

(37658685, 19)

### 2.2 Chicago Weather

In [7]:
# Load the Chicago weather observations from the CSV, letting pandas infer the dtypes.
chicago_weather = pd.read_csv('../Data/Chicago_weather.csv.gz')

#### 2.2.1 Check the correct reading of the data

In [8]:
# Preview five randomly chosen weather rows (unseeded, so the sample changes per run).
chicago_weather.sample(5)

Unnamed: 0,datetime,humidity,pressure,temperature,weather_description,wind_direction,wind_speed
4647,2013-04-13 04:00:00,100.0,1002.0,279.14,mist,250.0,7.0
13733,2014-04-26 18:00:00,66.5,1017.0,282.7,few clouds,40.0,7.0
3538,2013-02-25 23:00:00,51.0,1019.0,275.42,few clouds,230.0,2.0
6360,2013-06-23 13:00:00,89.0,1017.0,293.95,light rain,153.0,0.0
12007,2014-02-13 20:00:00,63.0,1003.0,270.74,broken clouds,180.0,5.0


In [9]:
# List the dtype pandas inferred for every column of the weather table.
chicago_weather.dtypes

datetime                object
humidity               float64
pressure               float64
temperature            float64
weather_description     object
wind_direction         float64
wind_speed             float64
dtype: object

#### 2.2.2 Check the shape

In [10]:
# (rows, columns) of the loaded weather table.
chicago_weather.shape

(45252, 7)

## 3 Convert the timestamp columns to datetime format

### 3.1 Chicago trips

In [11]:
# Parse both trip timestamp columns from strings into datetime64 values.
# errors='coerce' turns any unparseable entry into NaT instead of raising.
for timestamp_col in ('trip_start_timestamp', 'trip_end_timestamp'):
    taxi_trips[timestamp_col] = pd.to_datetime(taxi_trips[timestamp_col], errors='coerce')
taxi_trips.dtypes

trip_id                               object
taxi_id                               object
trip_start_timestamp          datetime64[ns]
trip_end_timestamp            datetime64[ns]
trip_seconds                           int64
trip_miles                           float64
pickup_community_area                  int64
dropoff_community_area                 int64
fare                                 float64
tips                                 float64
tolls                                float64
extras                               float64
trip_total                           float64
payment_type                          object
company                               object
pickup_centroid_latitude             float64
pickup_centroid_longitude            float64
dropoff_centroid_latitude            float64
dropoff_centroid_longitude           float64
dtype: object

### 3.2 Chicago weather

In [12]:
# Convert the weather 'datetime' column from strings to datetime64 values.
# errors='coerce' turns any unparseable entry into NaT instead of raising.
parsed_weather_datetime = pd.to_datetime(chicago_weather['datetime'], errors='coerce')
chicago_weather['datetime'] = parsed_weather_datetime
chicago_weather.dtypes

datetime               datetime64[ns]
humidity                      float64
pressure                      float64
temperature                   float64
weather_description            object
wind_direction                float64
wind_speed                    float64
dtype: object

## 4 Join both datasets

### 4.1 Define the keys to join both datasets

In [13]:
# Taxi key: year, month, day and hour of each trip's start time,
# as four Series used together as the left-hand join key.
taxi_key = [getattr(taxi_trips['trip_start_timestamp'].dt, part)
            for part in ('year', 'month', 'day', 'hour')]

In [14]:
# Weather key: year, month, day and hour of each weather observation,
# as four Series used together as the right-hand join key.
weather_key = [getattr(chicago_weather['datetime'].dt, part)
               for part in ('year', 'month', 'day', 'hour')]

### 4.2 Join both datasets

In [15]:
# Left-join the weather observations onto the trips by (year, month, day, hour).
# Every trip is kept; trips without a matching weather hour get NaN weather values.
# Because the keys are passed as Series rather than column names, pandas adds
# them to the result as key_0..key_3 columns.
taxi_trips = pd.merge(
    taxi_trips,
    chicago_weather,
    how='left',
    left_on=taxi_key,
    right_on=weather_key,
)

#### 4.2.1 Check the correct join of the data

In [16]:
# (rows, columns) after the merge. The row count equals the pre-merge count
# only if every trip matched at most one weather row.
taxi_trips.shape

(37658685, 30)

In [17]:
# Preview five random merged rows to confirm the weather columns line up with the trip hour.
taxi_trips.sample(5)

Unnamed: 0,key_0,key_1,key_2,key_3,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime,humidity,pressure,temperature,weather_description,wind_direction,wind_speed
14994676,2015,7,5,3,88d3b4349b1d3a298c2b32e0d02ed0849c467482,b4df572ad07112d99548875cbd778982a6a2918acec411...,2015-07-05 03:45:00,2015-07-05 03:45:00,540,2.7,3,2,8.65,0.0,0.0,0.0,8.65,Cash,Northwest Management LLC,41.965812,-87.655879,42.001571,-87.695013,2015-07-05 03:00:00,60.0,1017.0,293.19,haze,120.0,2.0
22844059,2014,4,16,1,bfaa57377d285044a74094a6eaf4ec41c5ab50da,4d383b5d473f9402a59c74dd17d0d1c4d7b2f2c640bd06...,2014-04-16 01:30:00,2014-04-16 01:45:00,720,0.3,32,6,13.65,2.7,0.0,0.0,16.35,Credit Card,Taxi Affiliation Services,41.878866,-87.625192,41.944227,-87.655998,2014-04-16 01:00:00,51.0,1022.0,274.01,broken clouds,240.0,3.0
3056174,2015,3,14,8,3574f5a64aacbfb0f963ce235035a9009de6ee4c,46aa5f56690093619bf1d247a8160b0a86071e585e6d60...,2015-03-14 08:30:00,2015-03-14 08:45:00,660,0.3,6,2,14.05,2.8,0.0,0.0,16.85,Credit Card,Taxi Affiliation Services,41.944227,-87.655998,42.001571,-87.695013,2015-03-14 08:00:00,78.0,1030.0,277.983667,overcast clouds,284.0,4.0
23215442,2014,7,6,1,c24091b1d5a96bd0b187edd1fdd845c60a98f836,33f6337bd40150c7d943571092a8e2c8b97b985ffe331a...,2014-07-06 01:30:00,2014-07-06 01:30:00,420,1.2,8,32,6.65,0.0,0.0,1.0,7.65,Cash,Taxi Affiliation Services,41.892508,-87.626215,41.877406,-87.621972,2014-07-06 01:00:00,60.0,1019.0,296.16,light rain,210.0,5.0
32952469,2017,7,19,20,db5add868bb111ec75ef33b7a83964b61083101c,680ab36f04ff11ffbc868022c1c1b0090bf9608afd1f1c...,2017-07-19 20:00:00,2017-07-19 20:00:00,420,1.4,28,8,7.25,0.0,0.0,0.0,7.25,Cash,Northwest Management LLC,41.885281,-87.657233,41.904935,-87.649907,2017-07-19 20:00:00,69.0,1019.0,303.15,few clouds,20.0,3.0


#### 4.2.2 Remove useless columns

In [18]:
# List the merged column names to identify the helper columns added by the join.
taxi_trips.columns

Index(['key_0', 'key_1', 'key_2', 'key_3', 'trip_id', 'taxi_id',
       'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds',
       'trip_miles', 'pickup_community_area', 'dropoff_community_area', 'fare',
       'tips', 'tolls', 'extras', 'trip_total', 'payment_type', 'company',
       'pickup_centroid_latitude', 'pickup_centroid_longitude',
       'dropoff_centroid_latitude', 'dropoff_centroid_longitude', 'datetime',
       'humidity', 'pressure', 'temperature', 'weather_description',
       'wind_direction', 'wind_speed'],
      dtype='object')

In [19]:
# Remove the four join-key columns produced by the merge and the weather
# 'datetime' column, which are not needed after the join.
taxi_trips = taxi_trips.drop(
    labels=['key_0', 'key_1', 'key_2', 'key_3', 'datetime'],
    axis='columns',
)

#### 4.2.3 Check the dataset again

In [20]:
# (rows, columns) after dropping the five helper columns.
taxi_trips.shape

(37658685, 25)

In [21]:
# Preview five random rows of the final trips-plus-weather table.
taxi_trips.sample(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,humidity,pressure,temperature,weather_description,wind_direction,wind_speed
1707333,2c049b986fbbf3e4936041d45ca8789a382faa25,07f3ce687dce8fa4518b83cd63bfbfb0383339fd188b5b...,2015-05-18 20:45:00,2015-05-18 21:00:00,840,0.2,33,8,10.85,0.0,0.0,1.0,11.85,Cash,Taxi Affiliation Services,41.85935,-87.617358,41.892042,-87.631864,54.0,1029.0,290.969333,sky is clear,244.0,5.0
26566004,d9ac6e4099405291b56aab54fcac377cbb1f5d39,db6ae6e5b925e374cce11bd2cfdb000785d713eb72c0cb...,2015-12-03 22:30:00,2015-12-03 22:45:00,1380,0.8,56,7,28.05,6.0,0.0,2.0,36.05,Credit Card,Taxi Affiliation Services,41.785999,-87.750934,41.914747,-87.654007,92.0,1035.0,276.578842,scattered clouds,251.0,5.0
12643784,786af523812b7c32f3f8abe1e687f8613bf3ce18,e1919b836766d9b29f1ff4e478b2e36538101feff55a86...,2013-08-06 15:30:00,2013-08-06 16:00:00,1800,0.7,32,46,28.85,0.0,0.0,1.0,29.85,Cash,Taxi Affiliation Services,41.878866,-87.625192,41.741243,-87.551428,75.0,1017.0,293.14,scattered clouds,165.0,3.0
15391697,8b982bae6621403a35b3e7898fa3a89ebcb97b51,8d62e14d64ef3e311261023ef505a0fae787394b442da3...,2014-05-08 19:15:00,2014-05-08 19:45:00,2160,17.8,8,76,36.65,7.9,0.0,3.0,47.55,Credit Card,Northwest Management LLC,41.895033,-87.619711,41.979071,-87.90304,56.0,1007.0,303.7,sky is clear,188.0,4.0
16895054,961c41b1fa762ce2994070a6b4f0e9e0d54e15c5,c839de167830c1d8f7e1a489a1b9be2940314aba5857c6...,2016-02-22 11:15:00,2016-02-22 11:45:00,1560,0.5,2,8,25.75,0.0,0.0,0.0,25.75,Cash,Blue Ribbon Taxi Association Inc.,42.001571,-87.695013,41.899602,-87.633308,86.0,1022.0,270.75,mist,320.0,1.0


#### 4.2.4 Check the descriptive statistics 

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns describe() selects by default.
taxi_trips.describe()

Unnamed: 0,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,humidity,pressure,temperature,wind_direction,wind_speed
count,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0,37658680.0
mean,841.5293,3.416954,22.27116,21.47082,12.78409,1.333418,0.0007875028,0.8555467,14.97384,41.90123,-87.65636,41.90228,-87.65534,74.75701,1020.417,283.7216,190.1931,3.916641
std,712.4767,8.079623,19.00468,17.99092,10.36345,2.347944,0.5554161,26.0394,28.86223,0.03745047,0.06615392,0.03892095,0.05872626,19.32165,10.89138,11.35368,98.35538,2.311868
min,60.0,0.1,1.0,1.0,0.01,0.0,0.0,0.0,0.01,41.66014,-87.91362,41.66014,-87.91362,9.0,941.0,248.89,0.0,0.0
25%,420.0,0.7,8.0,8.0,6.45,0.0,0.0,0.0,7.5,41.88099,-87.656,41.88099,-87.65701,61.0,1013.0,275.36,110.0,2.0
50%,660.0,1.4,14.0,16.0,8.85,0.0,0.0,0.0,10.25,41.89251,-87.63275,41.89322,-87.63509,78.0,1019.0,284.7391,201.0,4.0
75%,1020.0,3.2,32.0,32.0,13.85,2.0,0.0,1.0,15.65,41.91462,-87.62621,41.92269,-87.62621,90.0,1027.0,292.9813,270.0,5.0
max,83520.0,1998.1,77.0,77.0,3963.92,500.0,1999.98,9989.05,9999.82,42.02122,-87.5349,42.02122,-87.5349,100.0,1077.0,308.48,360.0,25.0


### Estudiar los outliers

#### Almacenamos en un diccionario las variables numéricas con su threshold, q1 y q3

In [28]:
# Build, for every numeric column, the IQR-based outlier parameters:
#   thresholds[column] = [1.5 * IQR, q1, q3]
#
# The quartiles are computed directly, once per column. Calling
# taxi_trips.describe() inside the loop recomputes every statistic of the
# whole table on each iteration, which is too slow on a table of this size.
#
# head(0) gives an empty frame with the same dtypes, so the numeric column
# names are selected without copying the data.
numeric_columns = taxi_trips.head(0).select_dtypes(include='number').columns

thresholds = {}
for columna in numeric_columns:
    # Same 25% / 75% percentiles that describe() reports (linear interpolation).
    q1, q3 = taxi_trips[columna].quantile([0.25, 0.75])
    iqr = q3 - q1
    threshold = 1.5 * iqr
    thresholds[columna] = [threshold, q1, q3]
thresholds

KeyboardInterrupt: 

#### Obtenemos el número de outliers y el porcentaje de outliers por variable

In [None]:
# Count, for every column in `thresholds`, the rows lying outside
# [q1 - 1.5*IQR, q3 + 1.5*IQR], and express that count as a percentage of
# all rows:
#   outliers[column] = [number_of_outliers, percentage_of_outliers]
#
# The percentage is taken over taxi_trips itself; the previous denominator
# referenced a DataFrame that is not defined in this notebook.
total_rows = taxi_trips.shape[0]

outliers = {}
for columna, (threshold, q1, q3) in thresholds.items():
    valores = taxi_trips[columna]
    # A single boolean mask avoids materialising two filtered copies of the table.
    fuera_de_rango = (valores < (q1 - threshold)) | (valores > (q3 + threshold))
    outlier = int(fuera_de_rango.sum())
    outliers[columna] = [outlier, outlier / total_rows * 100]
outliers

#### Graficamos

##### Outliers

In [None]:
# Turn the {column: [count, percentage]} dictionary into a table with one row
# per column. Building it row-wise with from_dict keeps each output column's
# own dtype, so the outlier counts stay integers instead of being upcast to
# float by a transpose.
df_outliers = pd.DataFrame.from_dict(
    outliers,
    orient='index',
    columns=['numero_outliers', '%outliers'],
)
df_outliers

In [None]:
# Bar chart of the outlier percentage for each column.
pct_outliers = df_outliers['%outliers']
ax = pct_outliers.plot.bar(title="Outliers", figsize=(15, 10), fontsize=12, rot=45)
ax.set_xlabel("Campos", fontsize=12)
ax.set_ylabel("%outliers", fontsize=12)
plt.show()