## Importação de bibliotecas

In [8]:
import pandas as pd
import os
from pathlib import Path
from datetime import date
import json
import seaborn as sns
import matplotlib.pyplot as plt

## Pré-processando

In [10]:
# Load the raw trips export. The file uses "," as separator and "." as decimal
# mark, which are the pandas defaults, so no parser options are passed.
caminho_entrada = '../data/raw/trips_data-0-2.csv'
df = pd.read_csv(caminho_entrada)
df.head()

Unnamed: 0,city,product_type,status,request_time,begin_trip_time,begintrip_lat,begintrip_lng,begintrip_address,dropoff_time,dropoff_lat,dropoff_lng,dropoff_address,distance,fare_currency,fare_amount
0,Manaus,UberX,completed,2025-06-18T00:00:42.000Z,2025-06-18T00:14:15.000Z,-3.09217,-60.01713,"Av. Darcy Vargas, 1.200 - Parque Dez de Novemb...",2025-06-18T00:32:08.000Z,-3.072692,-60.06748,"Rua Ubaira, 65 - Manaus - AM, 69038-288",6.32,Brazilian Real,17.96
1,Manaus,UberX,completed,2025-06-17T20:25:51.000Z,2025-06-17T20:34:14.000Z,-3.07292,-60.06749,"Rua Ubaira, 65 - Manaus - AM, 69038-288",2025-06-17T20:56:33.000Z,-3.124575,-60.043434,"R. Sagrado Coração de Jesus, 261 - Santa Etelv...",6.54,Brazilian Real,24.14
2,Manaus,UberX,completed,2025-06-17T12:47:43.000Z,2025-06-17T13:04:47.000Z,-3.072717,-60.067455,"Rua Ubaira, 65 - Manaus - AM, 69038-288",2025-06-17T13:26:49.000Z,-3.092227,-60.017178,"Av. Darcy Vargas, 1.200 - Parque Dez de Novemb...",5.19,Brazilian Real,17.05
3,Manaus,UberX,rider_canceled,2025-06-17T12:45:09.000Z,,-3.072766,-60.067474,,,-3.072766,-60.067474,"Av. Darcy Vargas, 1.200 - Parque Dez de Novemb...",0.0,Brazilian Real,0.0
4,Manaus,UberX,rider_canceled,2025-06-17T01:13:04.000Z,,-3.092251,-60.017185,,,-3.092251,-60.017185,"Rua Ubaira, 65 - Manaus - AM, 69038-288",0.0,Brazilian Real,0.0


### Selecionando apenas as colunas desejadas

In [12]:
# Keep only the fields used by the remaining steps of this notebook.
colunas_desejadas = [
    'request_time',
    'city',
    'product_type',
    'status',
    'distance',
    'fare_amount',
]
df_filtrado = df.loc[:, colunas_desejadas]

In [13]:
# Preview the first five rows of the column-reduced frame.
df_filtrado.head(5)

Unnamed: 0,request_time,city,product_type,status,distance,fare_amount
0,2025-06-18T00:00:42.000Z,Manaus,UberX,completed,6.32,17.96
1,2025-06-17T20:25:51.000Z,Manaus,UberX,completed,6.54,24.14
2,2025-06-17T12:47:43.000Z,Manaus,UberX,completed,5.19,17.05
3,2025-06-17T12:45:09.000Z,Manaus,UberX,rider_canceled,0.0,0.0
4,2025-06-17T01:13:04.000Z,Manaus,UberX,rider_canceled,0.0,0.0


### Visualizando quantas cidades existem no dataset

In [14]:
# Distinct cities; the index shown is the row where each value first appears.
cidades = df_filtrado["city"]
cidades.drop_duplicates()

0              Manaus
299    Rio de Janeiro
Name: city, dtype: object

### Visualizando quais são os status das corridas

In [15]:
# Distinct trip statuses; the index shown is the row where each value first appears.
status_corridas = df_filtrado["status"]
status_corridas.drop_duplicates()

0            completed
3       rider_canceled
6          unfulfilled
143             failed
214    driver_canceled
Name: status, dtype: object

### Visualizando quais são os tipos de corrida disponíveis

In [16]:
# Distinct product types; the index shown is the row where each value first appears.
tipos_de_produto = df_filtrado["product_type"]
tipos_de_produto.drop_duplicates()

0             UberX
6         uberX VIP
24          Comfort
63             Moto
112      Flash Moto
131       Wait&Save
159      Prioridade
176     Flash Bikes
299           uberX
495           Flash
1652    UberX Promo
Name: product_type, dtype: object

### Filtrando somente a cidade de Manaus

In [17]:
# Keep only the rows whose city is exactly "Manaus".
em_manaus = df_filtrado["city"].eq("Manaus")
df_processed_city = df_filtrado.loc[em_manaus]

### Filtrando corridas completas e corridas canceladas pelo motorista

In [18]:
# Keep trips that were completed or that the driver canceled.
status_mantidos = ["completed", "driver_canceled"]
df_processed_completed = df_processed_city[df_processed_city["status"].isin(status_mantidos)]

### Filtrando corridas de passageiros (carro e moto) -- que não são entregas de comida ou encomendas

In [19]:
# Keep only the listed product types. Matching is exact and case-sensitive,
# which is why both "UberX" and "uberX" appear in the list.
eh_tipo_desejado = df_processed_completed["product_type"].isin([
    "UberX",
    "uberX VIP",
    "Moto",
    "Comfort",
    "Prioridade",
    "UberX Promo",
    "uberX",
])
df_processed_type = df_processed_completed.loc[eh_tipo_desejado]

### Comparando quantos registros tinham inicialmente e quantos restaram pós limpeza

In [20]:
# Row counts before (column-reduced frame) and after all three filters.
print("Inicialmente: ", df_filtrado.shape[0])
print("Pós limpeza: ", df_processed_type.shape[0])

Inicialmente:  1676
Pós limpeza:  996


### Contando o número de corridas por tipo de produto

In [21]:
# Number of remaining trips per product type, most frequent first.
contagem_por_produto = df_processed_type["product_type"].value_counts()
contagem_por_produto


product_type
UberX          580
uberX VIP      337
Moto            46
Prioridade      25
Comfort          5
UberX Promo      3
Name: count, dtype: int64

### Transformando o campo "request_time" de String para Datetime

In [22]:
# Rebind the name to a fresh frame (not a slice of the earlier filtered data)
# whose request_time column holds parsed datetimes instead of strings.
df_processed_type = df_processed_type.assign(
    request_time=pd.to_datetime(df_processed_type['request_time'])
)

### Criando colunas de dia, mês, ano, dia da semana, hora do dia e custo por km

In [23]:
# Calendar features derived from the request timestamp.
# NOTE(review): the raw request_time strings end in "Z" (UTC), so day, hour and
# weekday below are UTC values, not Manaus local time -- confirm whether a
# tz_convert('America/Manaus') is wanted before extracting them.
df_processed_type['day'] = df_processed_type['request_time'].dt.day
df_processed_type['month'] = df_processed_type['request_time'].dt.month
df_processed_type['year'] = df_processed_type['request_time'].dt.year
# dayofweek is 0 for Monday through 6 for Sunday.
df_processed_type['day_of_week'] = df_processed_type['request_time'].dt.dayofweek
df_processed_type['hour_of_day'] = df_processed_type['request_time'].dt.hour

# A zero distance (seen on canceled trips in the raw data) would make the
# division yield inf whenever a fare was still charged. Mask non-positive
# distances so cost_per_km is NaN for those rows instead.
distancia_valida = df_processed_type['distance'].where(df_processed_type['distance'] > 0)
df_processed_type['cost_per_km'] = df_processed_type['fare_amount'] / distancia_valida

### Criando função para converter o inteiro do dia da semana para string

In [24]:
def converter_dia_da_semana(dia_da_semana):
    """Translate a weekday number into its Portuguese name.

    Args:
        dia_da_semana: Weekday number, where 0 maps to 'segunda' (Monday)
            and 6 maps to 'domingo' (Sunday).

    Returns:
        The lowercase Portuguese weekday name, or ``None`` when the value is
        not one of the keys 0 through 6.
    """
    nomes = ('segunda', 'terça', 'quarta', 'quinta', 'sexta', 'sábado', 'domingo')
    # Position in the tuple is the weekday number; .get keeps the None fallback.
    return dict(enumerate(nomes)).get(dia_da_semana)

In [25]:
# Add the Portuguese weekday name next to the numeric day_of_week column.
df_processed_type["day_week"] = df_processed_type["day_of_week"].map(converter_dia_da_semana)

### Excluindo colunas que não são mais necessárias

In [26]:
# Drop the columns that are no longer needed once the derived fields exist.
colunas_descartadas = ['request_time', 'city', 'day_of_week']
df_processed_type = df_processed_type.drop(columns=colunas_descartadas)

### Transformando o campo "cost_per_km" em um decimal de 2 casas

In [27]:
# Round the cost per km to two decimal places.
df_processed_type["cost_per_km"] = df_processed_type["cost_per_km"].round(2)

In [28]:
# Preview the first five rows of the fully processed frame.
df_processed_type.head(5)

Unnamed: 0,product_type,status,distance,fare_amount,day,month,year,hour_of_day,cost_per_km,day_week
0,UberX,completed,6.32,17.96,18,6,2025,0,2.84,quarta
1,UberX,completed,6.54,24.14,17,6,2025,20,3.69,terça
2,UberX,completed,5.19,17.05,17,6,2025,12,3.29,terça
5,UberX,completed,5.13,15.97,16,6,2025,13,3.11,segunda
7,UberX,completed,5.19,16.95,13,6,2025,13,3.27,sexta


### Caminho onde o arquivo processado será salvo

In [29]:
# Output directory for processed data; created (with parents) if missing.
caminho_processed = Path('../data/processed')
caminho_processed.mkdir(parents=True, exist_ok=True)

# Build the file path with pathlib's "/" join rather than string formatting,
# so the separator is handled by Path on every platform.
arquivo_processado = caminho_processed / 'dados-preprocessados.csv'
# index=False: the row index is not written to the CSV.
df_processed_type.to_csv(arquivo_processado, index=False)