## Regresión 2

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the meter readings, the building metadata and the weather
# observations, in that order, from the local ./data directory.
train, building_metadata, weather_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/building_metadata.csv',
        './data/weather_train.csv',
    )
)

In [3]:
# Attach building attributes to every reading (left join keeps all readings).
train = pd.merge(train, building_metadata, how='left', on='building_id')
# Attach the weather observed at the building's site for the same timestamp.
train = pd.merge(train, weather_train, how='left', on=['site_id', 'timestamp'])

In [4]:
# Parse the timestamp strings, then derive calendar features from them.
train['timestamp'] = pd.to_datetime(train['timestamp'])
train['hour'] = train['timestamp'].dt.hour
train['wday'] = train['timestamp'].dt.dayofweek  # Monday=0 ... Sunday=6
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` yields the same ISO week number (1-53); it comes back
# as UInt32, so cast to int64 to keep the dtype the old accessor produced.
# NOTE(review): the cast assumes no timestamp fails to parse (no NaT) - confirm.
train['week'] = train['timestamp'].dt.isocalendar().week.astype('int64')

In [5]:
# Row and column counts of `train` after the merges and the added time features.
train.shape

(20216100, 19)

In [6]:
# Remove the raw timestamp (hour/wday/week were derived from it above) together
# with the year_built, floor_count and cloud_coverage columns, in one call.
train = train.drop(
    columns=['timestamp', 'year_built', 'floor_count', 'cloud_coverage']
)

In [7]:
# Inspect the dtype of every remaining column.
train.dtypes

building_id             int64
meter                   int64
meter_reading         float64
site_id                 int64
primary_use            object
square_feet             int64
air_temperature       float64
dew_temperature       float64
precip_depth_1_hr     float64
sea_level_pressure    float64
wind_direction        float64
wind_speed            float64
hour                    int64
wday                    int64
week                    int64
dtype: object

In [8]:
# Fill gaps by linear interpolation, restricted to numeric columns.
# `train` still contains the object column `primary_use`, and calling
# `DataFrame.interpolate` on object dtype is deprecated since pandas 2.1.
# NOTE(review): default interpolation only fills forward, so values missing at
# the very start of a column stay NaN (the null count run next still reports
# some for precip_depth_1_hr). Consider `limit_direction='both'` or `fillna`.
# NOTE(review): interpolation follows row order, so neighbouring rows may
# belong to different sites - confirm this is intended.
numeric_cols = train.select_dtypes(include='number').columns
train[numeric_cols] = train[numeric_cols].interpolate()

In [9]:
# Count the missing values left in each column.
# Replace them with 0 if the regression fails to run.
train.isna().sum()

building_id              0
meter                    0
meter_reading            0
site_id                  0
primary_use              0
square_feet              0
air_temperature          0
dew_temperature          0
precip_depth_1_hr     2301
sea_level_pressure       0
wind_direction           0
wind_speed               0
hour                     0
wday                     0
week                     0
dtype: int64

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

Unnamed: 0,building_id,meter,meter_reading,site_id,square_feet,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,wday,week
count,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20213800.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0,20216100.0
mean,799.278,0.6624412,2117.121,7.992232,107783.0,15.991,7.761942,0.815593,1016.144,173.4657,3.376491,11.50232,3.006958,26.92979
std,426.9133,0.9309921,153235.6,5.09906,117142.4,10.94178,10.17788,7.437876,6.961648,112.4925,2.263189,6.922017,1.997191,15.03481
min,0.0,0.0,0.0,0.0,283.0,-28.9,-35.0,-1.0,968.2,0.0,0.0,0.0,0.0,1.0
25%,393.0,0.0,18.3,3.0,32527.0,8.6,0.0,0.0,1011.8,80.0,2.1,6.0,1.0,14.0
50%,895.0,0.0,78.775,9.0,72709.0,16.7,8.9,0.0,1016.1,180.0,3.1,12.0,3.0,27.0
75%,1179.0,1.0,267.984,13.0,139113.0,24.1,16.1,0.0,1020.469,270.0,4.6,18.0,5.0,40.0
max,1448.0,3.0,21904700.0,15.0,875000.0,47.2,26.1,343.0,1045.5,360.0,19.0,23.0,6.0,53.0


## One Hot Encoding

In [11]:
# One-hot encoder that drops the first category of each feature, so a feature
# with k categories produces k-1 indicator columns.
encode = OneHotEncoder(drop = 'first')

In [12]:
# Pull the three categorical features out of `train` as a NumPy matrix
# (one row per reading, one column per feature) for the encoder.
categorical_columns = ['meter', 'site_id', 'primary_use']
catego_var = train[categorical_columns].to_numpy()

In [32]:
# Fit the encoder on the categorical matrix and convert the encoded result to
# a dense array with `.toarray()`.
# NOTE(review): the dense result has one row per reading and 33 float columns
# (see the shape printed below), so it takes several GB of memory.
encode_var = encode.fit_transform(catego_var).toarray()

In [33]:
# Confirm the encoded matrix has one row per reading and one column per indicator.
encode_var.shape

(20216100, 33)

In [20]:
# Show the encoded indicators of the last row, to cross-check them against
# `train.tail(1)`. A negative index replaces the hard-coded row number, so the
# check keeps working if the number of rows changes.
encode_var[-1, :]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
# Show the last row of `train`, for comparison with the last encoded row.
train.tail(1)

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,wday,week
20216099,1448,0,2.85,15,Office,92271,1.7,-5.6,-1.0,1008.5,180.0,8.8,23,5,52


In [22]:
# Labels for the 33 one-hot indicator columns, grouped as meter, site and
# primary-use indicators. The spellings of the use labels are kept exactly as
# they are because they become DataFrame column names.
meter_labels = [f'meter_{k}' for k in range(1, 4)]
site_labels = [f'site_{k}' for k in range(1, 16)]
use_labels = [
    'Entretainment', 'Food', 'Healthcare', 'Lodging', 'Manufacturing',
    'Office', 'Other', 'Parking', 'Public', 'Religius',
    'Retail', 'Services', 'Technology', 'Utility', 'Warehouse',
]
encode_names = meter_labels + site_labels + use_labels

In [34]:
# Wrap the encoded matrix in a DataFrame with readable column names.
# The index is taken from `train` explicitly so that the later index-based
# `train.join(encode_var)` is guaranteed to align row-for-row.
encode_var = pd.DataFrame(encode_var, columns=encode_names, index=train.index)

In [35]:
# Preview the first rows of the encoded indicator columns.
encode_var.head()

Unnamed: 0,meter_1,meter_2,meter_3,site_1,site_2,site_3,site_4,site_5,site_6,site_7,...,Office,Other,Parking,Public,Religius,Retail,Services,Technology,Utility,Warehouse
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Remove the original categorical columns; the one-hot indicators built above
# take their place.
train = train.drop(columns=['meter', 'site_id', 'primary_use'])

In [38]:
# Append the one-hot indicator columns to `train`, matching rows by index
# (left join, so every row of `train` is kept).
train = train.join(encode_var, how='left')

Unnamed: 0,building_id,meter_reading,square_feet,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,...,Office,Other,Parking,Public,Religius,Retail,Services,Technology,Utility,Warehouse
0,0,0.0000,7432,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0000,2720,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0000,5376,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0000,23685,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0000,116607,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,0.0000,8000,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,0.0000,27926,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,0.0000,121074,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,0.0000,60809,25.0,20.0,,1019.7,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,0.0000,27000,25.0,20.0,,1019.7,0.0,0.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# NOTE(review): scratch cell - this list literal is only displayed and is not
# assigned to any name; candidate for removal.
[1,2
,3,4]

[1, 2, 3, 4]