# Renta de Bicicletas

## Prediccion de la demanda en el alquiler de bicicletas

**librerias**

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm

from sklearn.linear_model import LinearRegression as LinReg
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.ensemble import GradientBoostingRegressor as GBR


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

## Datos

UCI's Bike Sharing Dataset Data Set
[link](https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset)

<B>Resumen:</B> Este dataset contiene las rentas de bicicletas por hora y dia entre los años 2011 y 2012 en el sistema Capital Bikeshare con la correspondiente informacion climatologica y estacional.

**Atributos:**


Ambos csv tienen los siguientes campos, excepto hr que no esta en day.csv

- **instant**: indice
- **dteday** : fecha
- **season** : estacion (1:primavera, 2:verano, 3:otoño, 4:invierno)
- **yr** : año (0: 2011, 1:2012)
- **mnth** : mes ( 1 to 12)
- **hr** : hora (0 to 23)
- **holiday** : dia festivo o no
- **weekday** : dia de la semana
- **workingday** : dia laborable o no
- **weathersit** : situacion meteorologica
    - 1: Despejado, Pocas nubes, Parcialmente nublado
    - 2: Niebla, Niebla+Nublado
    - 3: Nieve ligera, LLuvia ligera+Tormenta+Nube dispersa
    - 4: LLuvia fuerte+Hielo+Tormenta+Niebla,Nieve+Niebla
- **temp** : temperatura normalizada en grados centigrados. MinMax Scaler (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (solo en escala horaria)
- **atemp**: sensacion termica normalizada en grados centigrados. MinMax Scaler (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (solo en escala horaria)
- **hum**: humedad normalizada, dividido entre 100 (max)
- **windspeed**: velocidad del viento normalizada, dividido entre 67 (max)
- **casual**: cuenta de usuarios casuales
- **registered**: cuenta de usuarios registrados
- **cnt**: cuenta total del alquiler de bicicletas, incluyendo tanto los casuales como los registrados



In [None]:
# Load both granularities of the dataset; the two reads are independent.
day_df = pd.read_csv('data/day.csv')    # daily records
hour_df = pd.read_csv('data/hour.csv')  # hourly records

# Preview of the hourly frame
hour_df.head()

In [None]:
day_df.head()

**nos centraremos en los datos por hora**

In [None]:
# Drop three columns that are not used as predictors: 'casual' and
# 'registered' (the attribute list above describes 'cnt' as including both)
# and 'workingday'.
hour_df = hour_df.drop(
    columns=[
        'casual',
        'registered',
        'workingday',
    ]
)

In [None]:
# info del dataframe, no hay nulos

hour_df.info(memory_usage='deep')

In [None]:
# Downcast every numeric column to the smallest dtype that holds its values:
# integer columns first, then float columns (same order as before).
for kind in ('integer', 'float'):
    for col in hour_df.select_dtypes(kind).columns:
        hour_df[col] = pd.to_numeric(hour_df[col], downcast=kind)


# Report memory usage again so it can be compared with the previous cell
hour_df.info(memory_usage='deep')

In [None]:
# descripcion

hour_df.describe()

In [None]:
# objetivo - cnt (cuenta)

hour_df.cnt.describe()

In [None]:
# Line plot of the target in the frame's current row order
ax = hour_df['cnt'].plot()
ax.set_ylabel('# Alquileres', fontsize=12)
ax.set_title('Registro Alquiler Bicicletas', fontsize=16)
plt.show()

In [None]:
# Same target values, plotted in ascending order
cnt_sorted = hour_df['cnt'].sort_values().to_numpy()
plt.plot(cnt_sorted)
plt.title('Registro Alquiler Bicicletas (ordenado)', fontsize=16)
plt.ylabel('# ordenado de alquileres', fontsize=12)
plt.show()

**comparacion del objetivo con las variables numericas**

In [None]:
# Scatter the target against each float-typed column (temperature, feels-like
# temperature, humidity and wind speed in this dataset).
#
# select_dtypes('float') replaces the manual counter and the
# `dtype == 'float32'` test, so the cell no longer depends on the downcasting
# cell having been run first. A scatter plot does not need ordered input, so
# hour_df is no longer re-sorted here: the previous version reassigned the
# global frame on every iteration, leaving it ordered by the last float column
# for every later cell.
for feat in hour_df.select_dtypes('float').columns:
    plt.scatter(hour_df[feat], hour_df.cnt)
    plt.title('Cnt  vs  ' + feat)
    plt.xlabel(feat)
    plt.ylabel('# Alquileres')
    plt.show()

**comparacion del objetivo con las variables categoricas**

In [None]:
# Total rentals per season

df1 = hour_df[['season', 'cnt']].groupby(['season']).sum().reset_index()

# After reset_index() the frame has two numeric columns ('season' and 'cnt').
# Plotting it whole with stacked=True stacked the season code on top of the
# rental total; x/y are now explicit so only 'cnt' is drawn.
df1.plot(kind='bar', x='season', y='cnt', legend=False,
         title='Alquiler de bicicletas por estacion',
         fontsize=12)

plt.xlabel('Estacion', fontsize=12)
plt.ylabel('# Alquileres', fontsize=12)
# Bars sit at positions 0..3; relabel them with the season names
plt.xticks(range(4),
           ['primavera', 'verano', 'otoño', 'invierno'],
           rotation=45)
plt.show()

In [None]:
# Total rentals per weather situation

df2 = hour_df[['weathersit', 'cnt']].groupby(['weathersit']).sum().reset_index()

# Draw only 'cnt': plotting the whole frame with stacked=True also stacked the
# 'weathersit' code column on top of each bar.
df2.plot(kind='bar', x='weathersit', y='cnt', legend=False,
         title='Alquiler de bicicletas segun clima',
         fontsize=12)

plt.xlabel('Clima', fontsize=12)
plt.ylabel('# Alquileres', fontsize=12)
# Bars sit at positions 0..3; relabel them with a short weather description
plt.xticks(range(4),
           ['despejado', 'nublado', 'lluvia ligera', 'lluvia fuerte'],
           rotation=45)
plt.show()

In [None]:
# Total rentals per hour of the day

df3 = hour_df[['hr', 'cnt']].groupby(['hr']).sum().reset_index()

# Draw only 'cnt' against 'hr': plotting the whole frame with stacked=True
# added the hour value itself on top of each bar.
df3.plot(kind='bar', x='hr', y='cnt', legend=False,
         title='Alquiler de bicicletas por hora del dia',
         fontsize=12)

plt.xlabel('Hora', fontsize=12)
plt.ylabel('# Alquileres', fontsize=12)
plt.show()

In [None]:
# Total rentals per month

df3 = hour_df[['mnth', 'cnt']].groupby(['mnth']).sum().reset_index()

# Draw only 'cnt' against 'mnth'. Before, the month number was stacked on top
# of each bar and the ticks showed the positional index (0..11) instead of the
# month value.
df3.plot(kind='bar', x='mnth', y='cnt', legend=False,
         title='Alquiler de bicicletas por mes',
         fontsize=12)

plt.xlabel('Mes', fontsize=12)
plt.ylabel('# Alquileres', fontsize=12)
plt.show()

## Modelo

**primero statsmodels para ver significancia**

In [None]:
y = hour_df['cnt']  # target
# predictors: everything except the target, the row index and the date
X = hour_df.drop(columns=['cnt', 'instant', 'dteday'])

In [None]:
# statsmodels' OLS does not add an intercept on its own, so the constant
# column is added explicitly. Without it the regression is forced through the
# origin, which distorts the coefficients and their p-values.
X_const = sm.add_constant(X)

modelo = sm.OLS(y, X_const).fit()
pred = modelo.predict(X_const)  # in-sample fitted values


modelo.summary()

A traves de los p-values, podemos ver que el mes lo podriamos quitar

**split de los datos**

In [None]:
# Same predictors as before, minus 'mnth'
y = hour_df['cnt']
X = hour_df.drop(columns=['cnt', 'instant', 'dteday', 'mnth'])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=16,
)

**Regresion Lineal**

In [None]:
# Baseline: plain linear regression on the raw features
linreg = LinReg()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

# Root mean squared error on the held-out set
rmse = mse(y_test, y_pred) ** .5
print('RMSE = {:.2f}'.format(rmse))

**Polinomios de grado 2**

In [None]:
# Degree-2 polynomial expansion followed by linear regression.
poly = PF(2)
X_train_pf2 = poly.fit_transform(X_train)
# The transformer is fitted on the training data only; the test set is
# transformed with it and never re-fitted.
X_test_pf2 = poly.transform(X_test)

linreg.fit(X_train_pf2, y_train)

y_pred = linreg.predict(X_test_pf2)

print('RMSE = {:.2f}'.format(mse(y_test, y_pred)**.5))

**Polinomios de grado 3**

In [None]:
# Degree-3 polynomial expansion followed by linear regression.
poly = PF(3)
X_train_pf3 = poly.fit_transform(X_train)
# The transformer is fitted on the training data only; the test set is
# transformed with it and never re-fitted.
X_test_pf3 = poly.transform(X_test)

linreg.fit(X_train_pf3, y_train)

y_pred = linreg.predict(X_test_pf3)

print('RMSE = {:.2f}'.format(mse(y_test, y_pred)**.5))

**one hot encoding**

In [None]:
# One-hot encode the categorical columns of the training set, keeping every
# level (no reference category is dropped).
X_train_dummy = X_train.pipe(
    pd.get_dummies,
    columns=['season', 'weekday', 'weathersit'],
    drop_first=False,
)
X_train_dummy.head()

In [None]:
# One-hot encode the test set with the same columns as the training set.
X_test_dummy = pd.get_dummies(X_test,
                              columns=['season', 'weekday', 'weathersit'],
                              drop_first=False)
# Encoding each split independently yields different columns whenever a
# category level is absent from one of them. Reindex to the training layout:
# levels missing from the test split become all-zero columns, and levels the
# model never saw during training are dropped.
X_test_dummy = X_test_dummy.reindex(columns=X_train_dummy.columns,
                                    fill_value=0)
X_test_dummy.head()

In [None]:
X_train.shape

In [None]:
# Refit the linear model on the one-hot encoded features
y_pred = linreg.fit(X_train_dummy, y_train).predict(X_test_dummy)

rmse = mse(y_test, y_pred) ** .5
print('RMSE = {:.2f}'.format(rmse))

In [None]:
print('Coeficientes: \n', linreg.coef_)

In [None]:
print('Ordenada en el origen \n', linreg.intercept_)

**probando gradient boosting regressor**

In [None]:
# Gradient boosting with default hyper-parameters on the encoded features
gbr = GBR()
target_1d = y_train.to_numpy().ravel()  # flat 1-D array of targets
gbr.fit(X_train_dummy, target_1d)

y_pred = gbr.predict(X_test_dummy)
rmse = mse(y_test, y_pred) ** .5
print('RMSE = {:.2f}'.format(rmse))

## Modelado temporal

Se usan los alquileres de las dos horas anteriores (`sum_hr_1` y `sum_hr_2`) para predecir los de la hora siguiente

In [None]:
# Rentals aggregated per (date, hour)
df5 = hour_df[['dteday', 'hr', 'cnt']].groupby(['dteday', 'hr']).sum().reset_index()
# sort_values returns a new frame; the result was previously discarded, which
# made the statement a no-op. Keep it so the lags below are taken over rows in
# (date, hour) order.
df5 = df5.sort_values(['dteday', 'hr'])

# Lag features: rentals one and two rows back.
# NOTE(review): shift() is positional, so if some hours are missing from the
# data the previous row is not the previous clock hour -- confirm.
df5['sum_hr_1'] = df5.cnt.shift(+1)
df5['sum_hr_2'] = df5.cnt.shift(+2)


# Attach the lags to the full frame; dropna() removes the rows left without a
# complete pair of lags.
merged = pd.merge(hour_df, df5[['dteday', 'hr',
                              'sum_hr_1',
                              'sum_hr_2']],
                  how='inner',
                  on=['dteday', 'hr']).dropna()
merged.head()

In [None]:
# Predictors and target from the lag-augmented frame
X = merged.drop(columns=['cnt', 'instant', 'dteday', 'mnth'])
y = merged.cnt


# Seeded like the earlier split so the reported RMSE is reproducible between
# runs (the split was previously unseeded).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=16)

In [None]:
%time gbr.fit(X_train, np.ravel(y_train))

y_pred=gbr.predict(X_test)

print ('RMSE = {:.2f}'.format(mse(y_test, y_pred)**.5))

**Pasos que se van a seguir con los datos:**

- **modelado temporal**
- **one-hot**
- **seleccion de caracteristicas**
- **train_test_split** 

In [None]:
def mod_temp(df):
    """Add lagged rental counts as features.

    Rentals are summed per ('dteday', 'hr'), ordered by those keys, and the
    totals of the previous one and two rows are attached to ``df`` as
    'sum_hr_1' and 'sum_hr_2'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'dteday', 'hr' and 'cnt'.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-joined with the two lag columns. ``dropna()`` is applied
        to the joined frame, so any row with a missing value in any column is
        removed, including the first rows, which have no complete lags.
    """
    m_df = df[['dteday', 'hr', 'cnt']].groupby(['dteday', 'hr']).sum().reset_index()
    # sort_values returns a new frame; the result used to be discarded, which
    # made the call a no-op. Keep it so the lags follow (date, hour) order.
    m_df = m_df.sort_values(['dteday', 'hr'])

    # shift() is positional: each lag is the value of the preceding row(s).
    m_df['sum_hr_1'] = m_df.cnt.shift(+1)
    m_df['sum_hr_2'] = m_df.cnt.shift(+2)

    lags = m_df[['dteday', 'hr', 'sum_hr_1', 'sum_hr_2']]
    merged = pd.merge(df, lags, how='inner', on=['dteday', 'hr']).dropna()
    return merged

In [None]:
def one_hot(df, columns=None):
    """One-hot encode categorical columns, keeping every level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : iterable of str, optional
        Columns to encode. Defaults to 'season', 'weekday' and 'weathersit',
        which is what the function always encoded before this parameter was
        added.

    Returns
    -------
    pandas.DataFrame
        ``df`` with each encoded column replaced by one indicator column per
        level (``drop_first=False``).
    """
    if columns is None:
        columns = ['season', 'weekday', 'weathersit']
    # get_dummies indexes the frame with `columns`, so pass a real list
    return pd.get_dummies(df, columns=list(columns), drop_first=False)

In [None]:
def sel_split(df, test_size=0.2, random_state=None):
    """Select the model features and split them into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'cnt' plus every column dropped below ('instant',
        'dteday', 'mnth', 'workingday', 'casual', 'registered').
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    X = df.drop(columns=['instant', 'dteday', 'mnth',
                         'workingday', 'cnt', 'casual', 'registered'])
    y = df.cnt

    return train_test_split(X, y, test_size=test_size,
                            random_state=random_state)

## Evaluacion (R2, RMSE)

In [None]:
# Raw hourly data -> lag features -> one-hot encoding
df = (
    pd.read_csv('data/hour.csv')
    .pipe(mod_temp)
    .pipe(one_hot)
)
# Feature selection and train/test split
X_train, X_test, y_train, y_test = sel_split(df)

In [None]:
linreg = LinReg()


# Fit a one-feature linear model per column and report its test R^2 and RMSE
for feature in X_train.columns:
    train_col = X_train[[feature]]
    test_col = X_test[[feature]]
    y_pred = linreg.fit(train_col, y_train).predict(test_col)

    r2 = r2_score(y_test, y_pred)
    rmse = mse(y_test, y_pred) ** .5
    print('R^2 para {} es {:.2f}'.format(feature, r2))
    print('RMSE para {} = {:.2f}'.format(feature, rmse))
    print()

**coeficientes**

In [None]:
# Fit on all features at once
y_pred = linreg.fit(X_train, y_train).predict(X_test)

# Root mean squared error on the test split, then the intercept
rmse = mse(y_test, y_pred) ** .5
print('RMSE = {:.2f}'.format(rmse))
print()
print('Intercept = {}'.format(linreg.intercept_))

# Coefficient table, sorted ascending, with a fresh 0..n-1 index
coefs = pd.DataFrame({'coeficientes': linreg.coef_,
                      'caracteristica': X_train.columns.values})
coefs = coefs.sort_values('coeficientes').reset_index(drop=True)


coefs

# Codigo para Web-App (main.py)

El modelo va a estar basado en la temperatura, estacion del año, si es fiesta o no y la hora del dia.


**Preparacion de los datos**

In [None]:
# funcion auxiliar para cargar los datos
from sklearn.linear_model import LinearRegression as LinReg
import pandas as pd

def data():
    """Fit the linear model on the full dataset and persist it for the web app.

    Reads 'data/hour.csv', adds the two lag features, one-hot encodes
    'season', 'weekday' and 'weathersit', fits a linear regression on every
    remaining feature, and writes two files:

    - 'data/coefs.csv': one row per feature ('carac') with its coefficient
      ('coef'), plus a final 'intercept' row.
    - 'data/means.csv': header-less ``feature,mean`` rows, used by the app as
      default input values.

    Returns nothing; the outputs are the two CSV files.
    """
    df = pd.read_csv('data/hour.csv')

    m_df = df[['dteday', 'hr', 'cnt']].groupby(['dteday', 'hr']).sum().reset_index()
    # sort_values returns a new frame; the result used to be discarded.
    m_df = m_df.sort_values(['dteday', 'hr'])
    m_df['sum_hr_1'] = m_df.cnt.shift(+1)
    m_df['sum_hr_2'] = m_df.cnt.shift(+2)

    merged = pd.merge(df, m_df[['dteday', 'hr',
                                'sum_hr_1',
                                'sum_hr_2']],
                      how='inner',
                      on=['dteday', 'hr']).dropna()

    dummy = pd.get_dummies(merged,
                           columns=['season', 'weekday', 'weathersit'],
                           drop_first=False)

    # Cast to float so the indicator columns are numeric whatever dtype
    # get_dummies produced; otherwise boolean indicators could be left out of
    # the feature means written below.
    X = dummy.drop(columns=['instant', 'dteday', 'mnth',
                            'workingday', 'cnt', 'casual',
                            'registered']).astype(float)
    y = dummy.cnt

    linreg = LinReg().fit(X, y)

    # Use X's own columns: the previous code read the notebook global X_train,
    # which does not exist when this runs outside the notebook.
    coefs = pd.DataFrame({'coef': linreg.coef_,
                          'carac': X.columns.values})\
              .sort_values('coef')\
              .reset_index(drop=True)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    intercept_row = pd.DataFrame({'coef': [linreg.intercept_],
                                  'carac': ['intercept']})
    coefs = pd.concat([coefs, intercept_row], ignore_index=True)

    coefs.to_csv('data/coefs.csv')
    # header=False: the app reads this file with explicit column names, so a
    # header line would be parsed as a data row.
    X.mean().to_csv('data/means.csv', header=False)


data()

# main.py

In [None]:
from flask import Flask
from flask import render_template


import pandas as pd

    
# Flask application object
app = Flask(__name__)


# Module-level state, filled in by startup():
# model coefficients keyed by feature name
coefs = None
# feature means keyed by feature name, used as default input values
input_data = None


def startup():
    """Load the model coefficients and the default inputs into module globals.

    Fills ``coefs`` from 'data/coefs.csv' and ``input_data`` from
    'data/means.csv', each as a ``{feature name: value}`` dict.
    """
    global coefs, input_data

    coefs = pd.read_csv('data/coefs.csv')\
              .set_index('carac')['coef']\
              .to_dict()

    # means.csv is read as header-less `feature,value` rows. Depending on the
    # pandas version that wrote it, the file may start with a header line;
    # that line has an empty feature name and a non-numeric value, so coercing
    # to numeric and dropping NaN rows discards it and keeps every value a
    # float.
    means = pd.read_csv('data/means.csv', header=None,
                        names=['carac', 'coef'])
    means['coef'] = pd.to_numeric(means['coef'], errors='coerce')
    input_data = means.dropna()\
                      .set_index('carac')['coef']\
                      .to_dict()


# Flask 2.3 removed the before_first_request decorator, so the data is loaded
# once here, when the module is executed, instead of on the first request.
startup()
    

# On load the page shows default values. The prediction is driven by four
# characteristics, but the model has to be evaluated with all of them, so
# every input and every coefficient is passed to the template.
@app.route("/", methods=['POST', 'GET'])
def main():
    """Render index.html with the default inputs and the model coefficients."""
    # Inputs whose default value is the stored feature mean
    mean_inputs = ['holiday', 'hr', 'yr', 'hum', 'temp', 'atemp', 'windspeed',
                   'weathersit_1', 'weathersit_2', 'weathersit_3',
                   'weathersit_4',
                   'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
                   'weekday_5', 'weekday_6', 'weekday_0',
                   'sum_hr_1', 'sum_hr_2']
    seasons = ['season_1', 'season_2', 'season_3', 'season_4']

    context = {name: input_data[name] for name in mean_inputs}
    # The season defaults are fixed: season_1 selected, the others off
    context.update(season_1=1, season_2=0, season_3=0, season_4=0)

    # Every coefficient is exposed to the template as coef_<feature>
    for name in ['intercept'] + mean_inputs + seasons:
        context['coef_' + name] = coefs[name]

    return render_template('index.html', **context)




# Start Flask's built-in server when the file is run directly (local use)
if __name__ == '__main__':
    app.run(debug=False)

## Codigo HTML