<img src="https://www.pucsp.br/sites/default/files/download/brasao-PUCSP-assinatura-principal-RGB.png" alt="Logo PUC / ODS" style="width: 10%;">

# Análise de dados e aplicação de técnicas de regressão para a previsão de preços
### Desenvolver aplicação que prevê preços de corridas de Uber no Peru  utilizando regressão.

## Componentes do Grupo

| Nome | 
| --- | 
| Ana Clara Rodrigues | 
| Bernardo Rosa |
| Felipe Martins |
| Lívia Rainho |



In [None]:
import pandas as pd

In [3]:
df = pd.read_csv("uber_peru_2010.csv", sep =';')
df.head()

Unnamed: 0,journey_id,user_id,driver_id,taxi_id,icon,start_type,start_at,start_lat,start_lon,end_at,...,price_distance,price_duration,distance,duration,cost,cost_distance,cost_duration,source,driver_score,rider_score
0,23a1406fc6a11d866e3c82f22eed4d4c,0e9af5bbf1edfe591b54ecdfd7e91e26,583949a89a9ee17d19e3ca4f137b6b4c,b12f4f09c783e29fe0d0ea624530db56,executive,asap,16/11/2010 16:44,-1213983536,-7702355957,16/11/2010 17:29,...,3626.0,195.0,11331.0,234.0,0.0,0.0,0.0,iPhone,5.0,5.0
1,dd2af4715d0dc16eded53afc0e243577,a553c46e3a22fb9c326aeb3d72b3334e,,,executive,asap,01/06/2010 00:34,-1213874817,-7699536133,01/06/2010 00:37,...,,,0.0,0.0,0.0,,,iPhone,,
2,dd91e131888064bf7df3ce08f3d4b4ad,a553c46e3a22fb9c326aeb3d72b3334e,,,executive,asap,31/05/2010 05:01,-1212453079,-7702780151,31/05/2010 05:04,...,,,0.0,0.0,0.0,,,iPhone,,
3,dd2af4715d0dc16eded53afc0e2466d0,a553c46e3a22fb9c326aeb3d72b3334e,,,executive,asap,01/06/2010 00:29,-1213885117,-7699530029,01/06/2010 00:32,...,,,0.0,0.0,0.0,,,iPhone,,
4,85b7eabcf5d84e42dc7629b7d27781af,56772d544fdfa589a020a1ff894a86f7,d665fb9f75ef5d9cd0fd89479380ba78,0accdd3aa5a322f4129fa20b53278c69,executive,reserved,11/09/2010 23:55,-1208995438,-7692626953,12/09/2010 01:07,...,7665.0,562.0,30270.0,715.0,6173.0,5756.0,417.0,iPhone,4.0,5.0


In [4]:
# Transformação com datetime
df['start_at'] = pd.to_datetime(df['start_at'], format='%d/%m/%Y %H:%M')
df['end_at'] = pd.to_datetime(df['end_at'], format='%d/%m/%Y %H:%M')

#Tranformação dos dados de hora e dia da semana 
df['hour_of_day'] = df['start_at'].dt.hour
df['day_of_week'] = df['start_at'].dt.day_name()

# Vairáveis Categoricas
categorical_means = df.groupby(['icon', 'start_type', 'source'])['price'].mean().reset_index()

# Horas e Dia
temporal_means = df.groupby(['hour_of_day', 'day_of_week'])['price'].mean().reset_index()

categorical_means, temporal_means


(         icon start_type   source         price
 0        easy       asap  Android    917.647059
 1        easy       asap      Wap      0.000000
 2        easy       asap   iPhone      0.000000
 3        easy       asap      web    715.375000
 4        easy   reserved  Android   2076.777778
 5        easy   reserved      web   1950.271605
 6   executive       asap  Android   1863.273790
 7   executive       asap      Wap   2323.540230
 8   executive       asap     iPad   2459.507740
 9   executive       asap   iPhone   2078.605893
 10  executive       asap      web   2607.372627
 11  executive    delayed  Android   1227.882353
 12  executive   reserved  Android   3479.386497
 13  executive   reserved      Wap   2503.300000
 14  executive   reserved     iPad   3348.534247
 15  executive   reserved   iPhone   3437.569012
 16  executive   reserved      web   4044.043928
 17      group       asap  Android    631.578947
 18      group       asap      Wap      0.000000
 19      group      

In [5]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: C:\Users\Ana Clara\anaconda3atualizado\Lib\site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: imbalanced-learn


In [6]:
from sklearn.preprocessing import OneHotEncoder

# Instanciando o OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Variáveis categóricas para codificar
categorical_features = ['icon', 'start_type', 'source', 'day_of_week']

# Codificando as variáveis categóricas
encoded_features = encoder.fit_transform(df[categorical_features])

# Criando um DataFrame com as características codificadas
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

# Adicionando a característica 'hour_of_day' diretamente, pois é numérica
encoded_df['hour_of_day'] = df['hour_of_day']

# Concatenando as características codificadas de volta ao DataFrame original
model_df = pd.concat([df[['price', 'price_distance', 'price_duration', 'distance', 'duration', 'cost_distance', 'cost_duration']], encoded_df], axis=1)

# Verificando por valores faltantes
missing_data = model_df.isnull().sum()

missing_data, model_df.head()




(price                     398
 price_distance           3170
 price_duration           3170
 distance                  263
 duration                  263
 cost_distance            5073
 cost_duration            5073
 icon_easy                   0
 icon_executive              0
 icon_group                  0
 start_type_asap             0
 start_type_delayed          0
 start_type_reserved         0
 source_Android              0
 source_Wap                  0
 source_iPad                 0
 source_iPhone               0
 source_web                  0
 source_nan                  0
 day_of_week_Friday          0
 day_of_week_Monday          0
 day_of_week_Saturday        0
 day_of_week_Sunday          0
 day_of_week_Thursday        0
 day_of_week_Tuesday         0
 day_of_week_Wednesday       0
 hour_of_day                 0
 dtype: int64,
     price  price_distance  price_duration  distance  duration  cost_distance  \
 0  3821.0          3626.0           195.0   11331.0     234.0     

In [7]:
# Removendo as linhas onde o preço está faltando
model_df_clean = model_df.dropna(subset=['price'])

# Verificando novamente por valores faltantes e exibindo as primeiras linhas do DataFrame limpo
missing_data_clean = model_df_clean.isnull().sum()
missing_data_clean, model_df_clean.head()


(price                       0
 price_distance           2782
 price_duration           2782
 distance                    0
 duration                    0
 cost_distance            4682
 cost_duration            4682
 icon_easy                   0
 icon_executive              0
 icon_group                  0
 start_type_asap             0
 start_type_delayed          0
 start_type_reserved         0
 source_Android              0
 source_Wap                  0
 source_iPad                 0
 source_iPhone               0
 source_web                  0
 source_nan                  0
 day_of_week_Friday          0
 day_of_week_Monday          0
 day_of_week_Saturday        0
 day_of_week_Sunday          0
 day_of_week_Thursday        0
 day_of_week_Tuesday         0
 day_of_week_Wednesday       0
 hour_of_day                 0
 dtype: int64,
     price  price_distance  price_duration  distance  duration  cost_distance  \
 0  3821.0          3626.0           195.0   11331.0     234.0     

In [8]:
from sklearn.impute import SimpleImputer

# Criando um imputer que substitui cada valor NaN pela mediana da coluna
imputer = SimpleImputer(strategy='median')

# Colunas para imputar
columns_to_impute = ['price_distance', 'price_duration', 'cost_distance', 'cost_duration']

# Aplicando o imputer ao DataFrame
model_df_clean[columns_to_impute] = imputer.fit_transform(model_df_clean[columns_to_impute])

# Verificando novamente por valores faltantes
missing_data_after_imputation = model_df_clean.isnull().sum()
print(missing_data_after_imputation)


price                    0
price_distance           0
price_duration           0
distance                 0
duration                 0
cost_distance            0
cost_duration            0
icon_easy                0
icon_executive           0
icon_group               0
start_type_asap          0
start_type_delayed       0
start_type_reserved      0
source_Android           0
source_Wap               0
source_iPad              0
source_iPhone            0
source_web               0
source_nan               0
day_of_week_Friday       0
day_of_week_Monday       0
day_of_week_Saturday     0
day_of_week_Sunday       0
day_of_week_Thursday     0
day_of_week_Tuesday      0
day_of_week_Wednesday    0
hour_of_day              0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df_clean[columns_to_impute] = imputer.fit_transform(model_df_clean[columns_to_impute])


In [12]:
model_df_clean.head(15)

Unnamed: 0,price,price_distance,price_duration,distance,duration,cost_distance,cost_duration,icon_easy,icon_executive,icon_group,...,source_web,source_nan,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,hour_of_day
0,3821.0,3626.0,195.0,11331.0,234.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,16
1,0.0,1754.0,247.0,0.0,0.0,1048.0,127.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,0.0,1754.0,247.0,0.0,0.0,1048.0,127.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5
3,0.0,1754.0,247.0,0.0,0.0,1048.0,127.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,8227.0,7665.0,562.0,30270.0,715.0,5756.0,417.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,23
5,8394.0,7393.0,1001.0,29080.0,1201.0,5175.0,700.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,13
6,7112.0,7112.0,0.0,27230.0,0.0,5340.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12
7,5712.0,5519.0,193.0,17247.0,232.0,4139.0,143.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
8,4680.0,4602.0,78.0,14380.0,94.0,3451.0,58.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5
9,3145.0,2580.0,565.0,8064.0,676.0,1935.0,418.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,23


In [10]:
import pandas as pd

# Suponha que model_df_clean é o DataFrame já carregado e tratado

# Selecionando as colunas específicas, incluindo 'price'
columns_of_interest = [
    'price', 'distance', 'duration', 'icon_easy', 'icon_executive', 'icon_group',
    'source_web', 'source_nan', 'day_of_week_Friday', 'day_of_week_Monday', 
    'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday', 
    'day_of_week_Tuesday', 'day_of_week_Wednesday', 'hour_of_day'
]

# Calculando a matriz de correlação para as colunas selecionadas
correlation_matrix_selected = model_df_clean[columns_of_interest].corr()

# Filtrando a correlação com a variável 'price'
price_correlations = correlation_matrix_selected['price'].sort_values(ascending=False)

# Exibindo as correlações com 'price'
print(price_correlations)


price                    1.000000
duration                 0.605069
icon_group               0.139862
source_web               0.139039
distance                 0.028382
source_nan               0.015833
day_of_week_Thursday     0.013358
day_of_week_Monday       0.011858
day_of_week_Wednesday   -0.000131
day_of_week_Sunday      -0.002504
day_of_week_Saturday    -0.004081
day_of_week_Tuesday     -0.004521
day_of_week_Friday      -0.014603
hour_of_day             -0.015294
icon_easy               -0.031385
icon_executive          -0.092936
Name: price, dtype: float64


In [106]:
# Calculando a matriz de correlação para o DataFrame limpo
correlation_matrix_clean = model_df_clean.corr()

# Filtrando a correlação com a variável 'price'
price_correlations = correlation_matrix_clean['price'].sort_values(ascending=False)

price_correlations


price                    1.000000
duration                 0.605069
price_duration           0.595260
cost_duration            0.554347
start_type_reserved      0.258964
icon_group               0.139862
source_web               0.139039
cost_distance            0.040854
price_distance           0.038771
distance                 0.028382
source_nan               0.015833
day_of_week_Thursday     0.013358
day_of_week_Monday       0.011858
source_iPad              0.008393
day_of_week_Wednesday   -0.000131
day_of_week_Sunday      -0.002504
day_of_week_Saturday    -0.004081
day_of_week_Tuesday     -0.004521
source_Wap              -0.011694
day_of_week_Friday      -0.014603
start_type_delayed      -0.014830
hour_of_day             -0.015294
icon_easy               -0.031385
source_iPhone           -0.060915
source_Android          -0.090886
icon_executive          -0.092936
start_type_asap         -0.257968
Name: price, dtype: float64

### Regressão Linear

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Selecionando as variáveis independentes e a dependente
X = model_df_clean[['duration', 'cost_duration', 'start_type_reserved',
                    'icon_group', 'source_web',  'distance']]
y = model_df_clean['price']
 #'start_at', 'end_at', 'arrived_at', 'distance', 'duration', 'source'

# Dividindo os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Criando o modelo de regressão linear
model = LinearRegression()

# Treinando o modelo com os dados de treino
model.fit(X_train, y_train)

# Previsões do modelo
y_pred = model.predict(X_test)

# Avaliação do modelo
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2


(6191186.717831991, 0.42277264243969337)

#### Árvore de decisão

In [112]:
from sklearn.tree import DecisionTreeRegressor

# Criando e treinando o modelo de árvore de decisão
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

# Previsões do modelo de árvore de decisão
y_pred_tree = tree_model.predict(X_test)

# Avaliação do modelo de árvore de decisão
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

mse_tree, r2_tree


(637746.1657962078, 0.940540553717031)

#### Floresta Aleatória

In [113]:
from sklearn.ensemble import RandomForestRegressor

# Criando e treinando o modelo de Florestas Aleatórias
forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
forest_model.fit(X_train, y_train)

# Previsões do modelo de Florestas Aleatórias
y_pred_forest = forest_model.predict(X_test)

# Avaliação do modelo de Florestas Aleatórias
mse_forest = mean_squared_error(y_test, y_pred_forest)
r2_forest = r2_score(y_test, y_pred_forest)

mse_forest, r2_forest


(658598.9157131884, 0.9385963743082957)

###### Melhor modelo Floresta Aleatória e Árvore Decisão

In [117]:
# Criando um DataFrame para visualizar os preços reais e os preços previstos
comparison_df = pd.DataFrame({'Real Prices': y_test, 'Predicted Prices': y_pred_forest})

# Exibindo os primeiros 10 resultados
comparison_df.head(10)


Unnamed: 0,Real Prices,Predicted Prices
22135,3372.0,3393.11
4162,1700.0,1123.0
7623,2331.0,2333.55
12863,5047.0,4916.9
517,1700.0,1700.0
6243,0.0,21.604245
14170,1833.0,1825.11
1609,2219.0,2223.45
1118,4245.0,4261.4
2140,0.0,151.108382


In [118]:
# Previsões do modelo de árvore de decisão
y_pred_tree

# Para visualização, vamos mostrar os valores previstos ao lado dos valores reais dos primeiros 20 testes
predictions_df = pd.DataFrame({
    "Preço Real": y_test,
    "Preço Previsto": y_pred_tree
}).reset_index(drop=True)

# Mostrar as primeiras 20 previsões
predictions_df.head(10)


Unnamed: 0,Preço Real,Preço Previsto
0,3372.0,3374.0
1,1700.0,1700.0
2,2331.0,2320.0
3,5047.0,5030.0
4,1700.0,1700.0
5,0.0,22.222222
6,1833.0,1872.0
7,2219.0,2218.0
8,4245.0,4407.0
9,0.0,149.758454
