In [1]:
import numpy as np
import pandas as pd

import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, HuberRegressor

In [2]:
# Raw beer-consumption CSV, read straight from its GitHub URL.
DATA_URL = 'https://raw.githubusercontent.com/benvictoria21/MachineLearning/master/dataset/Consumo_cerveja.csv'
data = pd.read_csv(DATA_URL)

In [3]:
# Show the raw frame exactly as loaded (before any cleaning).
data

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.900
...,...,...,...,...,...,...,...
936,,,,,,,
937,,,,,,,
938,,,,,,,
939,,,,,,,


In [4]:
# Print per-column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Data                         365 non-null    object 
 1   Temperatura Media (C)        365 non-null    object 
 2   Temperatura Minima (C)       365 non-null    object 
 3   Temperatura Maxima (C)       365 non-null    object 
 4   Precipitacao (mm)            365 non-null    object 
 5   Final de Semana              365 non-null    float64
 6   Consumo de cerveja (litros)  365 non-null    float64
dtypes: float64(2), object(5)
memory usage: 51.6+ KB


In [5]:
# Fraction of missing values in each column (mean of the boolean NaN mask).
data.isna().mean()

Data                           0.612115
Temperatura Media (C)          0.612115
Temperatura Minima (C)         0.612115
Temperatura Maxima (C)         0.612115
Precipitacao (mm)              0.612115
Final de Semana                0.612115
Consumo de cerveja (litros)    0.612115
dtype: float64

In [6]:
def preprocess_inputs(df):
    """Clean the raw beer-consumption frame and return scaled features and target.

    Steps, in order:
      1. Drop every row containing a missing value and reset the index.
      2. Convert the decimal-comma text columns (temperatures, precipitation)
         to float.
      3. Replace the 'Data' date column with numeric 'Month' and 'Day' columns.
      4. Split off 'Consumo de cerveja (litros)' as the target.
      5. Standardise the remaining columns with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding the columns named in this function. It is not
        modified; all work happens on a new frame.

    Returns
    -------
    X : pandas.DataFrame
        Standardised feature columns, in the order: the four weather columns,
        'Final de Semana', 'Month', 'Day' (given the raw column order).
    y : pandas.Series
        The 'Consumo de cerveja (litros)' column, row-aligned with ``X``.

    Notes
    -----
    The scaler is fitted on every row passed in. If the caller splits into
    train/test afterwards, the test rows have already contributed to the
    scaling statistics.
    """
    decimal_comma_columns = [
        'Temperatura Media (C)',
        'Temperatura Minima (C)',
        'Temperatura Maxima (C)',
        'Precipitacao (mm)',
    ]
    target_column = 'Consumo de cerveja (litros)'

    # dropna/reset_index both return new frames, so the caller's df is untouched.
    clean = df.dropna(axis=0).reset_index(drop=True)

    # Values such as "27,3" use a decimal comma; swap it for a dot and cast.
    # The builtin float is used because the np.float alias was removed from NumPy.
    # astype(str) also lets an already-numeric column pass through unchanged.
    for column in decimal_comma_columns:
        clean[column] = (
            clean[column]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .astype(float)
        )

    # Derive calendar features, then discard the original date column.
    dates = pd.to_datetime(clean['Data'])
    clean['Month'] = dates.dt.month
    clean['Day'] = dates.dt.day
    clean = clean.drop(columns='Data')

    # Separate target from features.
    y = clean[target_column].copy()
    X = clean.drop(columns=target_column)

    # Standardise features; keep column labels and the row index aligned with y.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    return X, y

In [7]:
# Build the scaled feature frame X and target series y from the raw frame.
X, y = preprocess_inputs(data)

In [8]:
# Inspect the standardised feature frame.
X

Unnamed: 0,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Month,Day
0,1.912508,2.281333,1.365781,-0.419062,-0.631243,-1.602745,-1.673503
1,1.824340,2.493924,1.597722,-0.419062,-0.631243,-1.602745,-1.559818
2,1.131590,1.749853,0.762735,-0.419062,1.584177,-1.602745,-1.446134
3,0.867085,1.430966,0.461212,-0.322294,1.584177,-1.602745,-1.332449
4,0.816703,1.253806,0.391630,-0.419062,-0.631243,-1.602745,-1.218764
...,...,...,...,...,...,...,...
360,0.873383,1.289238,0.368436,0.677640,1.584177,1.587648,1.282303
361,0.445137,1.289238,0.020525,-0.419062,-0.631243,1.587648,1.395988
362,0.142846,1.005782,-0.582521,0.411528,-0.631243,1.587648,1.509672
363,0.048380,0.651463,-0.976820,0.088969,-0.631243,1.587648,1.623357


In [9]:
# Confirm the dtype of every feature column after preprocessing.
X.dtypes

Temperatura Media (C)     float64
Temperatura Minima (C)    float64
Temperatura Maxima (C)    float64
Precipitacao (mm)         float64
Final de Semana           float64
Month                     float64
Day                       float64
dtype: object

In [10]:
# Inspect the target series.
y

0      25.461
1      28.972
2      30.814
3      29.799
4      28.900
        ...  
360    32.307
361    26.095
362    22.309
363    20.467
364    22.446
Name: Consumo de cerveja (litros), Length: 365, dtype: float64

In [11]:
# 70% of the rows go to training, the rest to evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
)

In [12]:
# Candidate linear regressors, all with default hyper-parameters.
# The keys are left-padded with spaces so the names right-align when printed.
models = {
    '    Linear Regression': LinearRegression(),
    '     Ridge Regression': Ridge(),
    '     Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    # SGD shuffles the training rows between epochs; without a seed its fit
    # (and therefore its reported score) changes from run to run.
    '       SGD Regression': SGDRegressor(random_state=123),
    '     Huber Regression': HuberRegressor()
}

# Fit every candidate on the same training split.
for model in models.values():
    model.fit(X_train, y_train)

In [13]:
# Report each fitted model's score on the held-out split, one line per model.
for label, fitted in models.items():
    r2 = fitted.score(X_test, y_test)
    print(f"{label} R^2 Score: {r2:.4f}")

    Linear Regression R^2 Score: 0.6896
     Ridge Regression R^2 Score: 0.6909
     Lasso Regression R^2 Score: 0.5763
ElasticNet Regression R^2 Score: 0.5646
       SGD Regression R^2 Score: 0.6926
     Huber Regression R^2 Score: 0.6699
