In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists the files in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

<h1>Goal</h1>
The objective is to estimate a Machine Learning model using the Linear Regression technique to demonstrate the impacts of the variables available in this dataset on beer consumption (Y). At the end of the project we will have a forecast model for the average consumption of beer according to the inputs of a set of variables (X's).

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the dataset; decimal=',' tells pandas to treat the comma as the decimal separator
df = pd.read_csv('../input/Consumo_cerveja.csv', decimal=',')

In [None]:
# Replace the column headers positionally with short names (one entry per column)
df.columns = [
    "date",
    "temp_medio",
    "temp_min",
    "temp_max",
    "rain",
    "weekend",
    "consumo",
]

In [None]:
# Cast the consumption column to float, then drop every row containing a missing value
df = df.astype({"consumo": float}).dropna()

In [None]:
# Display the first 5 rows of the cleaned DataFrame
df.head()

In [None]:
# (number of rows, number of columns) after cleaning
df.shape

In [None]:
# Descriptive statistics from describe() (default settings), rounded to two decimals
df.describe().round(2)

In [None]:
# Correlation matrix of the numeric columns, rounded to four decimals.
# The text 'date' column is excluded explicitly: DataFrame.corr() in pandas >= 2.0
# no longer drops non-numeric columns silently and raises an error instead.
df.select_dtypes(include='number').corr().round(4)

<h1>Behaviour of our dependent variable</h1>
In the next steps we will observe the behavior of our dependent variable (Consumption) related to Maximum Temperature, Rain and Weekend.

In [None]:
# Line plot of beer consumption over the row index (one row per day)
fig, ax = plt.subplots(figsize=(20, 6))

ax.set_title('Beer consumption', fontsize=20)
ax.set_ylabel('Liters', fontsize=16)
ax.set_xlabel('Days', fontsize=16)
# Draw on the axes created above explicitly instead of relying on pyplot's
# "current axes" state; the trailing ';' suppresses the Axes repr in the output.
df['consumo'].plot(ax=ax, fontsize=16);

In [None]:
# Line plot of the average temperature over the row index (one row per day)
fig, ax = plt.subplots(figsize=(20, 6))

ax.set_title('Average Temperature', fontsize=20)
ax.set_xlabel('Days', fontsize=16)
ax.set_ylabel('Graus Celsius', fontsize=16)
# Draw on the axes created above explicitly instead of relying on pyplot's
# "current axes" state; the trailing ';' suppresses the Axes repr in the output.
df['temp_medio'].plot(ax=ax, fontsize=16);

Investigating the dependent variable (y) according to a certain characteristic

In [None]:
# Box plot of consumption grouped by the weekend flag
ax = sns.boxplot(
    data=df,
    x='weekend',
    y='consumo',
    orient='v',
    width=0.5,
)
ax.figure.set_size_inches(12, 6)

# Title and axis labels
ax.set_title('Beer Consumption', fontsize=20)
ax.set_xlabel('Weekend', fontsize=16)
ax.set_ylabel('Liters', fontsize=16)
ax

The boxplot shows that beer consumption (in liters) is higher on weekends than on weekdays.

<h1>Dependent Variable X Explanatory Variables</h1>

In [None]:
# Scatter plots of consumption against each explanatory variable
ax = sns.pairplot(
    df,
    y_vars='consumo',
    x_vars=['temp_min', 'temp_medio', 'temp_max', 'rain', 'weekend'],
)
ax.fig.suptitle('Dispersion between the variables', y=1.10, fontsize=20)
ax

In [None]:
# Same pair plot as above, with a fitted regression line on each panel (kind='reg')
ax = sns.pairplot(
    df,
    kind='reg',
    y_vars='consumo',
    x_vars=['temp_min', 'temp_medio', 'temp_max', 'rain', 'weekend'],
)
ax.fig.suptitle("Dispersion between the variables", y=1.10, fontsize=20)
ax

In [None]:
# Regression of consumption on maximum temperature, one line per weekend value.
# The automatic legend is disabled so that a titled legend can be added afterwards.
ax = sns.lmplot(
    data=df,
    x='temp_max',
    y='consumo',
    hue='weekend',
    markers=['o', '*'],
    legend=False,
)
ax.fig.suptitle("Regression Line - Consumption X Temperature", y=1.10, fontsize=20)
ax.set_xlabels("Max Temperature (Celsius)", fontsize=16)
ax.set_ylabels("Beer Consumption (Liters)", fontsize=16)
ax.add_legend(title="Weekend")
ax

<h1>Linear Regression</h1>

In [None]:
from sklearn.model_selection import train_test_split

Creating one Series to store beer consumption

In [None]:
# Target (dependent variable): the beer consumption column
y = df['consumo']

Creating one DataFrame to store the explanatory variables

In [None]:
# Explanatory variables used by the model: maximum temperature, rain and weekend flag
X = df[['temp_max', 'rain', 'weekend']]

Creating the train and test datasets

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2811)

Checking the shape

In [None]:
# (rows, columns) of the training features
X_train.shape

In [None]:
# (rows, columns) of the test features
X_test.shape

In [None]:
# 30% of the total row count, for comparison with the test set size shown above
X.shape[0] * 0.3

In [None]:
# 70% of the total row count, for comparison with the training set size shown above
X.shape[0] * 0.7

<h1> Regression function with three explanatory variables</h1>

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Instantiate the LinearRegression class
model = LinearRegression()

In [None]:
# Fit the model on the training features and target
model.fit(X_train, y_train)

<h2> Getting the coefficient of determination (R²) </h2>

In [None]:
# R² of the fitted model on the training data, rounded to two decimals.
# The built-in round() is used instead of the .round() method: score() may return a
# plain Python float (recent scikit-learn), which has no .round() attribute.
print("R2 = {}".format(round(float(model.score(X_train, y_train)), 2)))

In [None]:
# Predictions for the held-out test features
y_predict = model.predict(X_test)

In [None]:
# R² of the predictions on the test data, rounded to two decimals.
# The built-in round() is used instead of the .round() method: r2_score() may return a
# plain Python float (recent scikit-learn), which has no .round() attribute.
print("R2 = %s" % round(float(metrics.r2_score(y_test, y_predict)), 2))

<h1>Creating a simple simulator</h1>

In [None]:
# Scenario to simulate: maximum temperature, rain, and weekend flag (1 = weekend)
temp_max = 40
rain = 0
weekend = 1
# Wrap the scenario in a one-row DataFrame that reuses the training column names, so
# the input carries the same feature names and order the model was fitted with
# (a bare list triggers scikit-learn's "X does not have valid feature names" warning).
entrance = pd.DataFrame([[temp_max, rain, weekend]], columns=X_train.columns)

print('Consumption: {0:.2f} liters'.format(model.predict(entrance)[0]))

In [None]:
# Take the first row of the test set (kept as a one-row DataFrame) and display it
entrada = X_test.iloc[:1]
entrada

In [None]:
# Predicted consumption (liters) for the single test row held in `entrada`.
# NOTE(review): the original note describes that row as max temperature 30.5,
# 12.2 mm of rain, not a weekend; confirm against the row displayed above.
model.predict(entrada)[0]

<h1>Conclusion</h1>

Beer consumption is higher when the maximum temperature is high, and it is also higher on weekends. Both relationships can be seen in detail in the correlation matrix.

Using the simulator, we can predict the average quantity of beer (in liters) that will be consumed once the variables maximum temperature, rain and weekend are provided.

This analysis is part of the [Alura](https://cursos.alura.com.br) Data Science Training course.