In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from ASHRAE_tools import meter_dict
from ASHRAE_tools import submit
import ASHRAE_data_prep

# NOTE: this rebinds the name of the function imported from ASHRAE_tools to the
# value it returns, so after this line `meter_dict` is no longer callable unless
# the import is executed again.
meter_dict = meter_dict()

## Modelo de Regresión 1

In [2]:
# Load the feature frame and the target used to fit the first linear regression.
X_train, y_train = ASHRAE_data_prep.train_reg()

In [4]:
X_train.dtypes

building_id             int64
meter                   int64
site_id                 int64
square_feet             int64
air_temperature       float64
dew_temperature       float64
sea_level_pressure    float64
wind_direction        float64
wind_speed            float64
hour                    int64
wday                    int64
week                    int64
dtype: object

In [5]:
X_train.describe()

Unnamed: 0,building_id,meter,site_id,square_feet,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed,hour,wday,week
count,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0
mean,799.278,0.6624412,7.992232,107783.0,15.95523,7.729669,1016.08,171.3399,3.377227,11.50232,3.006958,26.92979
std,426.9133,0.9309921,5.09906,117142.4,10.9663,10.18906,6.925514,112.8653,2.263302,6.922017,1.997191,15.03481
min,0.0,0.0,0.0,283.0,-28.9,-35.0,968.2,0.0,0.0,0.0,0.0,1.0
25%,393.0,0.0,3.0,32527.0,8.3,0.0,1011.8,72.85714,2.1,6.0,1.0,14.0
50%,895.0,0.0,9.0,72709.0,16.7,8.9,1016.1,170.0,3.1,12.0,3.0,27.0
75%,1179.0,1.0,13.0,139113.0,23.9,16.1,1020.3,270.0,4.6,18.0,5.0,40.0
max,1448.0,3.0,15.0,875000.0,47.2,26.1,1045.5,360.0,19.0,23.0,6.0,53.0


In [6]:
X_train.head()

Unnamed: 0,building_id,meter,site_id,square_feet,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed,hour,wday,week
0,0,0,0,7432,25.0,20.0,1019.7,0.0,0.0,0,4,53
1,0,0,0,7432,24.4,21.1,1020.2,70.0,1.5,1,4,53
2,0,0,0,7432,22.8,21.1,1020.2,0.0,0.0,2,4,53
3,0,0,0,7432,21.1,20.6,1020.1,0.0,0.0,3,4,53
4,0,0,0,7432,20.0,20.0,1020.0,250.0,2.6,4,4,53


In [7]:
mod_reg = LinearRegression()

In [8]:
mod_reg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# Drop the reference to the training frame before the test features are loaded
# (presumably to free memory: the frame has about 20 million rows).
del X_train

In [10]:
mod_reg.coef_

array([[-2.65885923e+01,  2.08145493e+03,  2.34789920e+03,
         2.92220651e-02, -1.17494723e+02,  1.39814835e+02,
         2.28839207e+00, -3.57551059e+00,  5.70947189e+02,
         1.05215779e+01, -5.52471680e+01, -7.39483025e+01]])

In [11]:
# Load the test-set inputs: `row` is later handed to submit(), `X_test` to predict().
row, X_test = ASHRAE_data_prep.test_reg()

In [12]:
X_test.shape

(41697600, 12)

In [13]:
# Predict on the full test set with the model fitted above.
# NOTE(review): these predictions are not clipped at zero, unlike the
# cross-validation and submission cells later in this notebook; confirm intent.
y = mod_reg.predict(X_test)

In [54]:
# NOTE(review): only `row` is passed here; the predictions `y` computed in the
# previous cell are not an argument. Confirm how submit() obtains them.
submit(row)

## Modelo de Regresión 1 CV

In [18]:
# Reload the training data (X_train was deleted earlier) and start the
# cross-validation from a fresh, unfitted model.
X_train, y_train = ASHRAE_data_prep.train_reg()
mod_reg = LinearRegression()

In [8]:
# Three grouped folds; the grouping column (building_id) is supplied when split() is called.
kfold = GroupKFold(n_splits = 3)

In [9]:
fold_metrics = []

In [28]:
# Grouped cross-validation of the first regression model.
# building_id is the group key, so all rows of a building fall on the same side
# of every train/validation split.
# The score list is rebuilt here so that re-running this cell does not append to
# the scores left over from a previous run.
fold_metrics = []
building_groups = X_train['building_id']
for train_index, test_index in kfold.split(X_train, y_train, building_groups):
    xv_train, xv_test = X_train.iloc[train_index], X_train.iloc[test_index]
    yv_train, yv_test = y_train.iloc[train_index], y_train.iloc[test_index]
    mod_reg.fit(xv_train, yv_train)
    # Negative predictions are clipped to zero before the log-based error is computed.
    y_pred = np.maximum(mod_reg.predict(xv_test), 0)
    # Square root of the mean squared log error, i.e. the RMSLE of this fold.
    metric = np.sqrt(mean_squared_log_error(yv_test, y_pred))
    fold_metrics.append(metric)

In [29]:
fold_metrics

[4.358489391356827, 2.8934359319885976, 4.40673941672632]

In [30]:
xv_train.shape

(13477585, 12)

## Modelo de Regresión 2

In [2]:
# Load the training data for the second model. The frame shown below carries
# meter_* / site_* indicator columns instead of the integer meter / site_id
# columns used by the first model.
X_train, y_train = ASHRAE_data_prep.train_reg_cat()

In [3]:
X_train.shape

(20216100, 29)

In [3]:
X_train.head()

Unnamed: 0,building_id,square_feet,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,wday,week,meter_1,meter_2,meter_3
0,0,7432,25.0,20.0,0.0,1019.7,0.0,0.0,0,4,53,0.0,0.0,0.0
1,0,7432,24.4,21.1,-1.0,1020.2,70.0,1.5,1,4,53,0.0,0.0,0.0
2,0,7432,22.8,21.1,0.0,1020.2,0.0,0.0,2,4,53,0.0,0.0,0.0
3,0,7432,21.1,20.6,0.0,1020.1,0.0,0.0,3,4,53,0.0,0.0,0.0
4,0,7432,20.0,20.0,-1.0,1020.0,250.0,2.6,4,4,53,0.0,0.0,0.0


In [16]:
X_train.isnull().sum()

building_id           0
square_feet           0
air_temperature       0
dew_temperature       0
precip_depth_1_hr     0
sea_level_pressure    0
wind_direction        0
wind_speed            0
hour                  0
wday                  0
week                  0
meter_1               0
meter_2               0
meter_3               0
site_1                0
site_2                0
site_3                0
site_4                0
site_5                0
site_6                0
site_7                0
site_8                0
site_9                0
site_10               0
site_11               0
site_12               0
site_13               0
site_14               0
site_15               0
Entretainment         0
Food                  0
Healthcare            0
Lodging               0
Manufacturing         0
Office                0
Other                 0
Parking               0
Public                0
Religius              0
Retail                0
Services              0
Technology      

In [4]:
mod_reg = LinearRegression()

In [5]:
# Two grouped folds this time (the first model used three); groups are passed at split().
kfold = GroupKFold(n_splits = 2)

In [6]:
fold_metrics = []

In [7]:
# Grouped cross-validation of the second regression model.
# building_id is the group key, so all rows of a building fall on the same side
# of every train/validation split.
# The score list is rebuilt here so that re-running this cell does not append to
# the scores left over from a previous run.
fold_metrics = []
building_groups = X_train['building_id']
for train_index, test_index in kfold.split(X_train, y_train, building_groups):
    xv_train, xv_test = X_train.iloc[train_index], X_train.iloc[test_index]
    yv_train, yv_test = y_train.iloc[train_index], y_train.iloc[test_index]
    mod_reg.fit(xv_train, yv_train)
    # Negative predictions are clipped to zero before the log-based error is computed.
    y_pred = np.maximum(mod_reg.predict(xv_test), 0)
    # Square root of the mean squared log error, i.e. the RMSLE of this fold.
    metric = np.sqrt(mean_squared_log_error(yv_test, y_pred))
    fold_metrics.append(metric)

In [8]:
fold_metrics

[3.0229925026832896, 4.318974829424748]

## Ahora construimos el modelo con el conjunto completo

In [9]:
# Refit on every training row (no held-out fold); this is the model used for the submission.
mod_reg_cat = LinearRegression()
mod_reg_cat.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Load the test features for the second model. Unlike test_reg(), a single frame
# is returned; the row_id values come back as its index.
X_test = ASHRAE_data_prep.test_reg_cat()

In [13]:
X_test.head()

Unnamed: 0_level_0,building_id,square_feet,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,wday,week,meter_1,meter_2,meter_3
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,7432,17.8,11.7,0.0,1021.4,100.0,3.6,0,6,52,0.0,0.0,0.0
129,0,7432,17.8,12.8,0.0,1022.0,130.0,3.1,1,6,52,0.0,0.0,0.0
258,0,7432,16.1,12.8,0.0,1021.9,140.0,3.1,2,6,52,0.0,0.0,0.0
387,0,7432,17.2,13.3,0.0,1022.2,140.0,3.1,3,6,52,0.0,0.0,0.0
516,0,7432,16.7,13.3,0.0,1022.3,130.0,2.6,4,6,52,0.0,0.0,0.0


In [14]:
y_pred = mod_reg_cat.predict(X_test)

In [16]:
y_pred[0:10]

array([[-3575.4675263 ],
       [-4010.07587839],
       [-4003.58486418],
       [-4056.24932079],
       [-4287.88459267],
       [-4523.87489611],
       [-4892.7639361 ],
       [-5117.72399037],
       [-5065.74159873],
       [-5042.18033945]])

In [47]:
# Build the submission frame: predictions floored at zero, keyed by the
# row_id index of the test features.
sub = pd.DataFrame(
    data=np.clip(y_pred, 0, None),
    columns=['meter_reading'],
    index=X_test.index,
)

In [55]:
# Order the submission by row_id. row_id is the index of `sub` (the frame was
# built on X_test.index), so the index is sorted directly instead of resolving
# the index name through sort_values; rebinding also avoids inplace mutation.
sub = sub.sort_index()

In [57]:
sub.shape

(41697600, 1)

In [58]:
# Write the submission file to the working directory. The index is written too
# (to_csv default), so row_id ends up as the first column next to meter_reading.
sub.to_csv('./submission.csv')

##  Lasso Model

In [2]:
X_train, y_train = ASHRAE_data_prep.train_lasso()

MemoryError: 

In [2]:
# Same loader as the failed cell above, unpacked under different names: `train`
# holds the raw columns (including meter_reading) and `encode` the indicator columns.
train, encode = ASHRAE_data_prep.train_lasso()

In [3]:
train.shape

(1927688, 6)

In [4]:
train.head(50)

Unnamed: 0,building_id,meter_reading,air_temperature,hour,wday,week
580,0,0.0,6.1,4,0,4
585,0,0.0,2.8,9,0,4
590,0,0.0,10.0,14,0,4
594,0,0.0,18.3,18,0,4
598,0,0.0,20.6,22,0,4
604,0,0.0,13.3,4,1,4
609,0,0.0,10.6,9,1,4
614,0,0.0,15.6,14,1,4
618,0,0.0,23.3,18,1,4
622,0,0.0,22.8,22,1,4


In [5]:
encode.shape

(1927688, 1451)

In [6]:
encode.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1442,1443,1444,1445,1446,1447,1448,meter_1,meter_2,meter_3
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# any column later added to X_train is added to `encode` as well.
X_train = encode

In [50]:
# Copy the week feature over by position, not by index label: `train` keeps its
# original row labels (580, 585, ...) while the encoded frame is labelled 0..n-1,
# so a label-aligned Series assignment would mismatch rows unless `train` had
# been re-indexed first. A length mismatch now raises instead of filling NaN.
# NOTE(review): assumes `train` and `X_train` list the same rows in the same order; confirm.
X_train['week'] = train['week'].values

In [56]:
# NOTE(review): the recorded output of this cell is a MemoryError, and no cell
# shown here adds 'meter_reading' to X_train; it only appears in a later
# X_train.head() output, so the notebook relies on state from cells that are
# not shown.
X_train.drop(['meter_reading'], inplace=True, axis=1)

MemoryError: 

In [34]:
# Replace train's original row labels with 0..n-1.
# NOTE(review): this cell sits after the `week` assignment in the file but has a
# lower execution count (34 vs 50), i.e. it was actually run first. Consider
# moving it up so a top-to-bottom run behaves the same way.
train.reset_index(drop=True, inplace=True)

In [55]:
X_train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1448,meter_1,meter_2,meter_3,building_id,meter_reading,air_temperature,hour,wday,week
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,,6.1,4,0,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,,2.8,9,0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,,10.0,14,0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,,18.3,18,0,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,,20.6,22,0,4


In [39]:
X_train = encode