# Import required libraries

In [None]:
import pandas as pd
from tqdm import tqdm
import datetime


#https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing

In [None]:
# Register tqdm's pandas integration so DataFrames gain `progress_apply`
# (an `apply` that shows a progress bar).
tqdm.pandas()

# Read dataframe

In [None]:
# Load the trips dataset from a CSV file located in the working directory.
df=pd.read_csv('thegurus-opendata-renfe-trips.csv')

In [None]:
# (rows, columns) of the dataset as loaded.
df.shape

Filter out rows that have no price.

In [None]:
# Keep only the rows whose target column `price` is present.
df = df.dropna(subset=['price'])

In [None]:
# (rows, columns) remaining after dropping rows with a missing price.
df.shape

Sample the data in order to reduce the amount of data to process.

In [None]:
# Down-sample to at most 500000 rows to keep the rest of the notebook tractable.
# Sampling is without replacement, so the requested size is capped at the
# number of rows available; otherwise pandas raises ValueError whenever fewer
# than 500000 rows remain. The fixed random_state keeps the sample reproducible.
df=df.sample(n=min(500000, len(df)), replace=False, random_state=1)

In [None]:
# (rows, columns) after down-sampling.
df.shape

In [None]:
# Preview the first rows of the sampled data.
df.head()

# Feature engineering

Extract the date, day of the month, month, hour, and day of the week from the departure time.

In [None]:
def processDate(x):
    """Add departure-derived fields to a single record.

    Parameters
    ----------
    x : mapping-like record (e.g. a DataFrame row) with a ``"departure"``
        entry formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns
    -------
    The same record, mutated in place, with these entries added:
    ``departureDate`` (parsed datetime), ``departureDay`` (day of month),
    ``departureMonth``, ``departureHour`` and ``departureWeekday``
    (Monday = 0 ... Sunday = 6).

    Raises
    ------
    ValueError
        If ``x["departure"]`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(x["departure"], '%Y-%m-%d %H:%M:%S')
    x['departureDate'] = parsed
    # Fields are added in a fixed order so the resulting column order is stable.
    x['departureDay'] = parsed.day
    x['departureMonth'] = parsed.month
    x['departureHour'] = parsed.hour
    x['departureWeekday'] = parsed.weekday()
    return x

# Derive the departure columns with vectorised pandas operations.
# This yields the same columns as applying `processDate` to every row, but
# parses the whole `departure` column in one call instead of running Python
# code once per row, which is very slow on hundreds of thousands of rows.
departure_ts = pd.to_datetime(df["departure"], format='%Y-%m-%d %H:%M:%S')
df = df.assign(
    departureDate=departure_ts,
    departureDay=departure_ts.dt.day,
    departureMonth=departure_ts.dt.month,
    departureHour=departure_ts.dt.hour,
    # Monday = 0 ... Sunday = 6, matching datetime.weekday().
    departureWeekday=departure_ts.dt.weekday,
)

In [None]:
# (rows, columns) after adding the departure-derived columns.
df.shape

Recode the departure hour into named time-of-day bands.

In [None]:
# Bucket the departure hour into named parts of the day. Each band is the
# half-open interval [start, end); rows whose hour falls in no band (for
# example a missing hour) keep the empty-string default.
# NOTE(review): the 18-21 band is labelled "lateNight" although it precedes
# "night" - confirm whether "evening" was intended before renaming.
departure_hour = df['departureHour']
day_parts = (
    (float('-inf'), 6, "earlyMorning"),
    (6, 9, "morning"),
    (9, 12, "midmorning"),
    (12, 16, "midday"),
    (16, 18, "afternoon"),
    (18, 21, "lateNight"),
    (21, float('inf'), "night"),
)
df['departureTime'] = ""
for start, end, label in day_parts:
    in_band = (departure_hour >= start) & (departure_hour < end)
    df.loc[in_band, 'departureTime'] = label


# Variables

In [None]:
# Column holding the value the model predicts.
target = 'price'

# Numeric columns that go into the feature matrix unchanged.
continueVar = [
    'duration',
    'departureDay',
    'departureMonth',
]

# Columns that are expanded into dummy (one-hot) columns.
categoricalVar = [
    'origin',
    'destination',
    'vehicle_type',
    'vehicle_class',
    'fare',
    'departureWeekday',
    'departureTime',
]


# Fill missing categorical values

In [None]:
# Replace missing categories with an explicit "unknow" level so that those
# rows get their own dummy column instead of being dropped from the encoding.
# `fillna(..., inplace=True)` returns None, so assigning its result back
# would overwrite the whole column with None; use the returned copy instead.
df['vehicle_class']=df['vehicle_class'].fillna("unknow")
df['fare']=df['fare'].fillna("unknow")

# Create dummy variables

In [None]:
# Build the feature matrix: the numeric columns as they are, plus one group of
# dummy (one-hot) columns per categorical variable, and the target vector.
X=df[continueVar]
y=df[target]
print(X.shape)
# `prefix` names every dummy column "<variable>_<level>". Without it the
# columns are named after the bare level, so a level shared by two variables
# (e.g. the "unknow" placeholder in both `vehicle_class` and `fare`) yields
# duplicate column names, and the integer weekday levels yield non-string
# column names mixed in with string ones.
# `drop_first=True` drops one level per variable to avoid redundant columns.
dummies=[pd.get_dummies(df[i], prefix=i, drop_first=True) for i in categoricalVar]
# Concatenate once rather than re-copying X on every loop iteration.
X=pd.concat([X]+dummies, axis=1)
print(X.shape)
print(y.shape)


# Save clean data

In [None]:
# Persist the model-ready table: every feature column plus the target stored
# as column "y", written as a semicolon-separated file without the index.
data = X.assign(y=y)
data.to_csv('data.csv', sep=";", index=False)

# Model

## Split data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

## Train

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a random-forest regressor on the training split. Tree depth is capped at
# 200 and the seed is fixed so repeated runs build the same forest.
# NOTE(review): the variable is named `clf` although the model is a regressor.
clf = RandomForestRegressor(max_depth=200, random_state=11)
clf.fit(X_train, y_train)

## Predict

In [None]:
# Predict on both splits so training and validation errors can be compared.
yhat_train=clf.predict(X_train)
yhat_valid=clf.predict(X_valid)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import explained_variance_score

## Evaluate

In [None]:
# Compute each metric once per split, then report train and validation values
# side by side; a large gap between the two indicates over-fitting.
mae_train = mean_absolute_error(y_train, yhat_train)
mae_valid = mean_absolute_error(y_valid, yhat_valid)
mse_train = mean_squared_error(y_train, yhat_train)
mse_valid = mean_squared_error(y_valid, yhat_valid)
# RMSE is the square root of the MSE, expressed in the same units as the target.
rmse_train = sqrt(mse_train)
rmse_valid = sqrt(mse_valid)
ev_train = explained_variance_score(y_train, yhat_train, multioutput='uniform_average')
ev_valid = explained_variance_score(y_valid, yhat_valid, multioutput='uniform_average')

print(f"MAE train: {mae_train}")
print(f"MAE valid: {mae_valid}")
print(f"MSE train: {mse_train}")
print(f"MSE valid: {mse_valid}")
print(f"RMSE train: {rmse_train}")
print(f"RMSE valid: {rmse_valid}")
print(f"explained_variance train: {ev_train}")
print(f"explained_variance valid: {ev_valid}")


