In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/cleaned_data.csv')

In [3]:
print(df.head())

                                     id     timestamp day_of_week     month  \
0  009e9c53-074d-43cf-aef2-0fbc7a47ed3d  1.543616e+09      Friday  November   
1  23f145da-f0c1-4d1f-a184-496bc003a7db  1.544698e+09    Thursday  December   
2  357559cb-8c58-4278-a41a-e33b2e0997a3  1.544729e+09    Thursday  December   
3  50ef1165-9d23-416c-a65c-18906207b295  1.545005e+09      Sunday  December   
4  91c4861c-1780-42b0-bca1-bbd64a422cc3  1.544748e+09      Friday  December   

   day  year   time             datetime  hour          timezone  ...  \
0   30  2018  22:13  2018-11-30 22:13:01    22  America/New_York  ...   
1   13  2018  10:50  2018-12-13 10:50:11    10  America/New_York  ...   
2   13  2018  19:15  2018-12-13 19:15:03    19  America/New_York  ...   
3   16  2018  23:55  2018-12-16 23:55:11    23  America/New_York  ...   
4   14  2018  00:40  2018-12-14 00:40:07     0  America/New_York  ...   

  precipIntensityMax uvIndexTime temperatureMin temperatureMinTime  \
0             0.

In [4]:
feature_columns = ['distance', 'temperature' , 'sunriseTime', 'sunsetTime']
X = df[feature_columns]

In [5]:
y = df['price'] 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
model = DecisionTreeRegressor(max_depth=10, random_state=42)
model.fit(X_train, y_train)

In [9]:
from sklearn.tree import DecisionTreeRegressor, export_text


In [10]:
model_text = export_text(model, feature_names=feature_columns)
print(model_text)

|--- distance <= 2.29
|   |--- distance <= 1.64
|   |   |--- distance <= 1.03
|   |   |   |--- distance <= 0.66
|   |   |   |   |--- distance <= 0.55
|   |   |   |   |   |--- temperature <= 47.06
|   |   |   |   |   |   |--- temperature <= 46.33
|   |   |   |   |   |   |   |--- temperature <= 44.22
|   |   |   |   |   |   |   |   |--- temperature <= 43.10
|   |   |   |   |   |   |   |   |   |--- temperature <= 43.05
|   |   |   |   |   |   |   |   |   |   |--- value: [12.08]
|   |   |   |   |   |   |   |   |   |--- temperature >  43.05
|   |   |   |   |   |   |   |   |   |   |--- value: [14.50]
|   |   |   |   |   |   |   |   |--- temperature >  43.10
|   |   |   |   |   |   |   |   |   |--- distance <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- value: [11.73]
|   |   |   |   |   |   |   |   |   |--- distance >  0.50
|   |   |   |   |   |   |   |   |   |   |--- value: [10.60]
|   |   |   |   |   |   |   |--- temperature >  44.22
|   |   |   |   |   |   |   |   |--- distance <= 0

In [11]:
y_pred = model.predict(X_test)

In [12]:
print("Actual vs. Predicted values:\n")
for actual, predicted in zip(y_test[:10], y_pred[:10]):
    print(f"Actual: {actual:.2f} - Predicted: {predicted:.2f}")

Actual vs. Predicted values:

Actual: 5.50 - Predicted: 12.49
Actual: 8.00 - Predicted: 13.31
Actual: 18.50 - Predicted: 18.23
Actual: 20.50 - Predicted: 14.60
Actual: 23.50 - Predicted: 16.74
Actual: 9.50 - Predicted: 15.58
Actual: 31.50 - Predicted: 16.28
Actual: 9.50 - Predicted: 12.60
Actual: 8.50 - Predicted: 12.90
Actual: 9.50 - Predicted: 16.13


In [13]:
print("\nModel Mean Squared Error:", mean_squared_error(y_test, y_pred))



Model Mean Squared Error: 65.28636114363019


In [14]:
# This is the first one which only predicts using distance temp and other parameters. 

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [66]:
features = ['distance', 'surge_multiplier', 'cab_type', 'hour', 'day_of_week', 'month', 
            'temperature', 'short_summary', 'windGust','windSpeed','precipIntensityMax','cloudCover','ozone','precipProbability']
X = df[features]
y = df['price']

In [67]:
categorical_features = ['cab_type', 'day_of_week', 'month', 'short_summary']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [68]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough'
)

In [75]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', DecisionTreeRegressor(max_depth=10, random_state=50))])

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [77]:
model.fit(X_train, y_train)

In [78]:
y_pred = model.predict(X_test)

In [79]:
print("Actual vs. Predicted values:\n")
for actual, predicted in zip(y_test[:10], y_pred[:10]):
    print(f"Actual: {actual:.2f} - Predicted: {predicted:.2f}")

Actual vs. Predicted values:

Actual: 10.50 - Predicted: 17.41
Actual: 10.50 - Predicted: 18.10
Actual: 8.00 - Predicted: 15.00
Actual: 17.00 - Predicted: 17.79
Actual: 17.00 - Predicted: 19.50
Actual: 7.50 - Predicted: 12.59
Actual: 14.50 - Predicted: 13.97
Actual: 14.50 - Predicted: 16.25
Actual: 11.50 - Predicted: 16.80
Actual: 7.50 - Predicted: 12.45


In [80]:
print("\nModel Mean Squared Error:", mean_squared_error(y_test, y_pred))



Model Mean Squared Error: 65.8771562572212


In [81]:
# Using the pipelines and other elements shown as important 

In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [63]:
feature_columns = ['temperature', 'distance', 'sunriseTime', 'sunsetTime']
X = df[feature_columns]
y = df['price']

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [46]:
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

In [47]:
model.fit(X_train, y_train)

In [48]:
y_pred = model.predict(X_test)

In [49]:
print("Actual vs. Predicted values:\n")
for actual, predicted in zip(y_test[:10], y_pred[:10]):
    print(f"Actual: {actual:.2f} - Predicted: {predicted:.2f}")

Actual vs. Predicted values:

Actual: 10.50 - Predicted: 17.45
Actual: 10.50 - Predicted: 17.21
Actual: 8.00 - Predicted: 14.77
Actual: 17.00 - Predicted: 18.06
Actual: 17.00 - Predicted: 13.60
Actual: 7.50 - Predicted: 12.35
Actual: 14.50 - Predicted: 13.82
Actual: 14.50 - Predicted: 16.54
Actual: 11.50 - Predicted: 16.69
Actual: 7.50 - Predicted: 12.55


In [50]:
print("\nModel Mean Squared Error:", mean_squared_error(y_test, y_pred))


Model Mean Squared Error: 65.17963975636503


In [51]:
# Pipeline work 

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [53]:
features = ['distance', 'surge_multiplier', 'cab_type', 'hour', 'day_of_week', 'month', 
            'temperature', 'short_summary']
X = df[features]
y = df['price']

In [54]:
categorical_features = ['cab_type', 'day_of_week', 'month', 'short_summary']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [55]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough'
)


In [56]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [58]:
model.fit(X_train, y_train)

In [59]:
y_pred = model.predict(X_test)

In [60]:
print("Actual vs. Predicted values:\n")
for actual, predicted in zip(y_test[:10], y_pred[:10]):
    print(f"Actual: {actual:.2f} - Predicted: {predicted:.2f}")

Actual vs. Predicted values:

Actual: 10.50 - Predicted: 17.82
Actual: 10.50 - Predicted: 17.19
Actual: 8.00 - Predicted: 14.71
Actual: 17.00 - Predicted: 18.17
Actual: 17.00 - Predicted: 13.76
Actual: 7.50 - Predicted: 12.35
Actual: 14.50 - Predicted: 13.96
Actual: 14.50 - Predicted: 16.57
Actual: 11.50 - Predicted: 16.60
Actual: 7.50 - Predicted: 12.53


In [61]:
print("\nModel Mean Squared Error:", mean_squared_error(y_test, y_pred))


Model Mean Squared Error: 65.26784865448764
