In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pickle
from sklearn.metrics import r2_score
from rfpimp import permutation_importances
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



In [19]:
def load_data():
    
    def dummie_and_drop(df, name):
        # Creates a dummy variable, concatenates it and finally drops the original categorical variable.
        # In order not to have redundant variables, one of the dummy variables is dropped too
        dummies = pd.get_dummies(df[name]).rename(columns = lambda x: name + '_' + str(x))
        dummies = dummies.drop(dummies.columns[-1], axis = 1)
        df = pd.concat([df, dummies], axis = 1)
        df.drop(columns = [name], inplace=True, axis=1)

        return df
    
    def convert_to_categorical(df, categorical_variables, categories, need_pickup = True):
        """ 
        The dataframe's selected variables are converted to categorical, and each variable's categories are also specified.
        It is also specified if the "pickup community area" has to be converted into categorical or no. If it is not 
        converted into categorical it is because it's not going to be used in the model.            
        """
        
        if need_pickup:
            begin = 0
        else:
            df.drop(columns = ['pickup_community_area'], inplace = True, axis = 1)
            begin = 1
        
        for i in range(begin, len(categorical_variables)):
            df[categorical_variables[i]] = df[categorical_variables[i]].astype('category').cat.set_categories(categories[i])
        return df
    
    
    def load(name, need_pickup = False, drop_correlated = False):
    
        # This parameter has to be set to True if the "pickup_community_area" variable is needed in the model
        

        # Load needed dataset and choose the useful columns
        df = pd.read_csv(name) #'dataset_train.csv')

        x = df[['pickup_community_area' ,'temperature', 'relative_humidity', 'wind_direction', 'wind_speed', 'precipitation_cat', 
                'sky_level', 'daytype', 'Day Name', 'Month', 'Hour', 'Fare Last Month', 'Trips Last Hour',
                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)', 'Year']]
#        float32=['temperature','relative_humidity','wind_direction','wind_speed','Fare Last Month', 'Trips Last Hour',
#                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)']
#        x= x[float32]=x[float32].astype('float32')
        # Convert the categorical variables
        categorical_variables = ['pickup_community_area', 'daytype', 'sky_level', 'Day Name', 'Month','Hour', 'Year']
        categories = [[*(range(1,78))], ['U', 'W', 'A'], ['OVC', 'BKN', 'SCT', 'FEW', 'CLR', 'VV '], 
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], 
                      [*(range(1,13))], [*(range(0, 24))], [2017, 2018,2019]]

        
        
        x = convert_to_categorical(x, categorical_variables, categories, need_pickup = need_pickup)

        float32=['temperature','relative_humidity','wind_direction','wind_speed','Fare Last Month', 'Trips Last Hour',
                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)']
        
        x[float32]=x[float32].astype('float32')
        # Make dummy variables with the categorical ones
        if need_pickup:
            begin = 0
        else:
            begin = 1
        for i in range(begin, len(categorical_variables)):
            x = dummie_and_drop(x, name = categorical_variables[i])

        y = df['Trips'].to_numpy()

        if need_pickup == False:
            # If we don't need the pickup, it means this is Neural Network case. Therefore we have to modify Y, in order
            # to have "n_areas" outputs per input (because there are "n_areas" regressions per input)
            n_areas = 77
            y = np.reshape(y, [-1, n_areas]) # If 
        
        if drop_correlated:
            x.drop(columns = ['Trips Last Week (Same Hour)'], inplace = True, axis = 1)
            x.drop(columns = ['Trips 2 Weeks Ago (Same Hour)'], inplace = True, axis = 1)

#        x = x.to_numpy()
        
        return (x,y)   
    
# ------------------------------------- MAIN PROGRAM ------------------------

    need_pickup = True 
    drop_correlated = False
    
    
#    name_train = 'dataset_train.csv'
    name_test = 'dataset_test.csv'
#    x, y = load(name_train, need_pickup, drop_correlated)
    x_test, y_test = load(name_test, need_pickup, drop_correlated)
#    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20,shuffle =True )
    
    return ( x_test, y_test)


In [20]:
x_test, y_test=load_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [21]:
x_test

Unnamed: 0,temperature,relative_humidity,wind_direction,wind_speed,precipitation_cat,Fare Last Month,Trips Last Hour,Trips Last Week (Same Hour),Trips 2 Weeks Ago (Same Hour),pickup_community_area_1,...,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Year_2017,Year_2018
0,26.100000,80.750000,240.0,8.0,0.0,31592.800781,2.0,0.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0
1,26.100000,80.750000,240.0,8.0,0.0,27917.800781,0.0,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
2,26.100000,80.750000,240.0,8.0,0.0,46936.000000,8.0,6.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,26.100000,80.750000,240.0,8.0,0.0,27275.750000,1.0,0.0,3.0,0,...,0,0,0,0,0,0,0,0,0,0
4,26.100000,80.750000,240.0,8.0,0.0,14535.750000,0.0,2.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55435,39.900002,96.550003,0.0,0.0,2.0,0.000000,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
55436,39.900002,96.550003,0.0,0.0,2.0,0.000000,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
55437,39.900002,96.550003,0.0,0.0,2.0,0.000000,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
55438,39.900002,96.550003,0.0,0.0,2.0,0.000000,37.0,5.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
