In [162]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
import pickle
from sklearn.metrics import mean_absolute_error

In [163]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from rfpimp import permutation_importances
from sklearn.metrics import mean_squared_error

In [164]:
def percentage_error(actual, predicted):
    res = np.empty(actual.shape)
    for j in range(actual.shape[0]):
        if actual[j] != 0:
            res[j] = (actual[j] - predicted[j]) / actual[j]
        else:
            res[j] = predicted[j] / np.mean(actual)
    return res

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs(percentage_error(np.asarray(y_true), np.asarray(y_pred)))) * 100

In [165]:
def metric(rf):
    print("Traning Score")
    print(rf.score(x_train,y_train))
    print("Test Score")
    print("MAE Train")
    print(mean_absolute_error(np.rint(rf.predict(x_train), y_train)))
    print("MAE Test")
    print(mean_absolute_error(np.rint(rf.predict(x_train), y_test)))

    print("MSE Train")
    print(mean_squared_error(np.rint(rf.predict(x_train)), y_train))
    print("MSE Test")
    print(mean_squared_error(np.rint(rf.predict(x_test)), y_test))
    print("MAPE Train")
    print(mean_absolute_percentage_error(y_test,np.rint(rf6.predict(x_test))))
    print("MAPE Test")
    print(mean_absolute_percentage_error(y_test,np.rint(rf6.predict(x_test))))

In [166]:
# In order to get reproducible results

# Seed value (can actually be different for each attribution step)
seed_value= 0

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

In [167]:
def load_data():
    
    def dummie_and_drop(df, name):
        # Creates a dummy variable, concatenates it and finally drops the original categorical variable.
        # In order not to have redundant variables, one of the dummy variables is dropped too
        dummies = pd.get_dummies(df[name]).rename(columns = lambda x: name + '_' + str(x))
        dummies = dummies.drop(dummies.columns[-1], axis = 1)
        df = pd.concat([df, dummies], axis = 1)
        df.drop(columns = [name], inplace=True, axis=1)

        return df
    
    def convert_to_categorical(df, categorical_variables, categories, need_pickup = True):
        """ 
        The dataframe's selected variables are converted to categorical, and each variable's categories are also specified.
        It is also specified if the "pickup community area" has to be converted into categorical or no. If it is not 
        converted into categorical it is because it's not going to be used in the model.            
        """
        
        if need_pickup:
            begin = 0
        else:
            df.drop(columns = ['pickup_community_area'], inplace = True, axis = 1)
            begin = 1
        
        for i in range(begin, len(categorical_variables)):
            df[categorical_variables[i]] = df[categorical_variables[i]].astype('category').cat.set_categories(categories[i])
        return df
    
    
    def load(name, need_pickup = False, drop_correlated = False):
    
        # This parameter has to be set to True if the "pickup_community_area" variable is needed in the model
        

        # Load needed dataset and choose the useful columns
        df = pd.read_csv(name) #'dataset_train.csv')

        x = df[['pickup_community_area' ,'temperature', 'relative_humidity', 'wind_direction', 'wind_speed', 'precipitation_cat', 
                'sky_level', 'daytype', 'Day Name', 'Month', 'Hour', 'Fare Last Month', 'Trips Last Hour',
                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)', 'Year']]
#        float32=['temperature','relative_humidity','wind_direction','wind_speed','Fare Last Month', 'Trips Last Hour',
#                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)']
#        x= x[float32]=x[float32].astype('float32')
        # Convert the categorical variables
        categorical_variables = ['pickup_community_area', 'daytype', 'sky_level', 'Day Name', 'Month','Hour', 'Year']
        categories = [[*(range(1,78))], ['U', 'W', 'A'], ['OVC', 'BKN', 'SCT', 'FEW', 'CLR', 'VV '], 
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], 
                      [*(range(1,13))], [*(range(0, 24))], ['2017', '2018', '2019']]

        
        
        x = convert_to_categorical(x, categorical_variables, categories, need_pickup = need_pickup)

        float32=['temperature','relative_humidity','wind_direction','wind_speed','Fare Last Month', 'Trips Last Hour',
                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)']
        
        x[float32]=x[float32].astype('float32')
        # Make dummy variables with the categorical ones
        if need_pickup:
            begin = 0
        else:
            begin = 1
        for i in range(begin, len(categorical_variables)):
            x = dummie_and_drop(x, name = categorical_variables[i])

        y = df['Trips'].to_numpy()

        if need_pickup == False:
            # If we don't need the pickup, it means this is Neural Network case. Therefore we have to modify Y, in order
            # to have "n_areas" outputs per input (because there are "n_areas" regressions per input)
            n_areas = 77
            y = np.reshape(y, [-1, n_areas]) # If 
        
        if drop_correlated:
            x.drop(columns = ['Trips Last Week (Same Hour)'], inplace = True, axis = 1)
            x.drop(columns = ['Trips 2 Weeks Ago (Same Hour)'], inplace = True, axis = 1)

#        x = x.to_numpy()
        
        return (x,y)   
    
# ------------------------------------- MAIN PROGRAM ------------------------

    need_pickup = True 
    drop_correlated = False
    
    
    name_train = 'dataset_train.csv'
#    name_test = 'dataset_test.csv'
    x, y = load(name_train, need_pickup, drop_correlated)
#    x_test, y_test = load(name_test, need_pickup, drop_correlated)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15,shuffle=True)
    
    return (x_train, x_test, y_train, y_test)

### Load the dataset

In [168]:
x_train, x_test, y_train, y_test=load_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Using Gradient Boosting Regressor for prediction

In [169]:
params = {'n_estimators': 128, 'max_depth': 5, 'min_samples_split': 2,
          'learning_rate': 0.05, 'loss': 'ls','max_features':0.5,'verbose':4}
clf1 = ensemble.GradientBoostingRegressor(**params)

In [None]:
clf1=clf1.fit(x_train, y_train)

      Iter       Train Loss   Remaining Time 
         1         242.0837           17.02m
         2         220.2569           17.51m
         3         200.5437           21.11m
         4         182.7471           30.56m
         5         166.7827           34.94m
         6         152.1278           36.95m
         7         138.9348           38.92m
         8         127.0488           40.41m
         9         116.3313           42.59m
        10         106.8469           42.50m
        11          97.9551           42.73m
        12          89.9491           43.37m
        13          82.8097           43.83m
        14          76.1952           43.69m
        15          70.3704           44.08m
        16          65.0100           44.26m
        17          60.2881           44.61m
        18          55.8451           44.85m
        19          51.8463           44.62m


In [None]:
metric(clf1)

### Plot the learning curve

In [None]:
def plot(clf):
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_predict(x_test)):
        test_score[i] = clf.loss_(y_test, y_pred)

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
             label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')


In [None]:
plot(clf1)

In [None]:
params = {'n_estimators': 256, 'max_depth': 5, 'min_samples_split': 2,
          'learning_rate': 0.05, 'loss': 'ls','max_features':0.5,'verbose':4}
clf2 = ensemble.GradientBoostingRegressor(**params)

In [None]:
clf2.fit(x_train, y_train)

### Error increased after removing correlated columns

In [None]:
metric(clf2)

In [None]:
plot(clf2)

In [None]:
params = {'n_estimators': 256, 'max_depth': 16, 'min_samples_split': 64,'min_samples_leaf':32 ,
          'learning_rate': 0.05, 'loss': 'ls','max_features':'sqrt','verbose':4}
clf3 = ensemble.GradientBoostingRegressor(**params)

In [None]:
clf3.fit(x_train, y_train)

In [None]:
metric(clf3)

In [None]:
plot(clf3)

In [None]:
params = {'n_estimators': 128, 'max_depth': 16, 'min_samples_split': 64,'min_samples_leaf':64 ,
          'learning_rate': 0.08, 'loss': 'ls','max_features':0.6,'verbose':4}
clf4 = ensemble.GradientBoostingRegressor(**params)

In [None]:
clf4=clf4.fit(x_train, y_train)

In [None]:
metric(clf4)

In [None]:
plot(clf4)

In [None]:

with open('clf3.pickle', 'wb') as f:
    pickle.dump(clf3, f)

In [None]:

with open('clf3.pickle', 'rb') as f:
        
        clf3 = pickle.load(f)