# Modelo 1

Vamos fazer um modelo simples de regressão linear usando a quantidade de funcionários e o faturamento da empresa para prever o volume de resíduo solicitado dentro de um ano.

As features serão normalizadas para evitar que as escalas façam alguma diferença. Também será necessário retirar outliers tanto da variável alvo quanto das duas features

In [1]:
import sqlalchemy as sql

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
target_col = 'ton | m3 - ano'

# Buscando dados
Vamos buscar os dados nos nossos bancos postgres e cadri

In [3]:
conn_empresas = sql.create_engine('postgresql://postgres:%24h%5B6%3B%3AsqyA4%25f6nL%5Bg@postgres.driva.io:5433/postgres')
conn_cadri = sql.create_engine('postgresql://postgres:%24h%5B6%3B%3AsqyA4%25f6nL%5Bg@postgres.driva.io:5432/cadri')

df_empresas = pd.read_sql(
    sql.text(f'SELECT * FROM clientes.verdera_empresas WHERE cnae_principal_subclasse IN {(2219600, 4520006, 2342701, 2342702)}'),
    con=conn_empresas.connect()
)
df_cadri = pd.read_sql(sql.text('SELECT * FROM cadri.cadri_e_pareceres_com_residuos'), con=conn_cadri.connect())

# Pré-processamento

In [4]:
df_empresas['tipo'] = df_empresas['cnae_principal_subclasse'].apply(lambda cnae: 'borracha' if cnae in [2219600, 4520006] else 'piso' if cnae in [2342701, 2342702] else 'desconhecido')

df_cadri['cadri_data'] = pd.to_datetime(df_cadri['cadri_data'], format='%d/%m/%Y')
df_cadri['cadri_ano_solicitado'] = df_cadri['cadri_data'].dt.year
df_residuo_solicitado_por_cnpj_ano = df_cadri[['geradora_cnpj', 'cadri_ano_solicitado', target_col]].groupby(['geradora_cnpj', 'cadri_ano_solicitado']).sum().reset_index()

df = pd.merge(
    df_empresas,
    df_residuo_solicitado_por_cnpj_ano,
    left_on='cnpj', right_on='geradora_cnpj', how='inner'
)[['qtde_funcionarios', 'faturamento', 'cadri_ano_solicitado', target_col]]

In [8]:
pd.merge(
    df_empresas,
    df_residuo_solicitado_por_cnpj_ano,
    left_on='cnpj', right_on='geradora_cnpj', how='inner'
)[['cnpj']].drop_duplicates().to_csv('empresas.csv', index=False)

In [73]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [74]:
def retirar_outliers(df, col):
    '''Retira os outliers pelo método do IQR'''
    quant25 = df[col].quantile(0.25)
    quant75 = df[col].quantile(0.75)
    iqr = quant75 - quant25
    
    return df[(quant25 - 1.5*iqr < df[col]) & (df[col] < quant75 + 1.5*iqr)]

In [75]:
df_train = retirar_outliers(df_train, 'qtde_funcionarios')
df_train = retirar_outliers(df_train, 'faturamento')
df_train = retirar_outliers(df_train, target_col)

In [76]:
def normalizar_minmax(df, col):
    '''Normaliza os dados pelo método min max'''
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    return df

In [77]:
df_train = normalizar_minmax(df_train, 'qtde_funcionarios')
df_train = normalizar_minmax(df_train, 'faturamento')
df_train = normalizar_minmax(df_train, 'cadri_ano_solicitado')
df_train = normalizar_minmax(df_train, target_col)

In [78]:
df_train = df_train.dropna()

In [79]:
df_train.head()

Unnamed: 0,qtde_funcionarios,faturamento,cadri_ano_solicitado,ton | m3 - ano
269,0.278472,0.175572,0.842105,0.092529
87,0.086701,0.13981,0.473684,0.016869
336,0.005143,0.008294,0.368421,0.058208
75,0.16385,0.264218,0.842105,0.333244
73,0.003674,0.005924,0.947368,0.003867


In [80]:
X_train = df_train.drop(target_col, axis=1)
X_train = sm.add_constant(X_train)
y_train = df_train[target_col]

In [81]:
results = sm.OLS(y_train, X_train).fit()

In [82]:
results.summary()

0,1,2,3
Dep. Variable:,ton | m3 - ano,R-squared:,0.054
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,6.696
Date:,"Wed, 22 Feb 2023",Prob (F-statistic):,0.000209
Time:,16:32:58,Log-Likelihood:,16.84
No. Observations:,356,AIC:,-25.68
Df Residuals:,352,BIC:,-10.18
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1487,0.040,3.705,0.000,0.070,0.228
qtde_funcionarios,0.0860,0.116,0.744,0.457,-0.141,0.313
faturamento,0.1505,0.094,1.608,0.109,-0.034,0.335
cadri_ano_solicitado,-0.0464,0.057,-0.808,0.420,-0.159,0.066

0,1,2,3
Omnibus:,113.549,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,245.563
Skew:,1.665,Prob(JB):,4.75e-54
Kurtosis:,5.337,Cond. No.,14.4


In [92]:
def preprocess(df):
    df = retirar_outliers(df, 'qtde_funcionarios')
    df = retirar_outliers(df, 'faturamento')
    df = retirar_outliers(df, target_col)

    df = normalizar_minmax(df, 'qtde_funcionarios')
    df = normalizar_minmax(df, 'faturamento')
    df = normalizar_minmax(df, 'cadri_ano_solicitado')
    df = normalizar_minmax(df, target_col)

    df = df.dropna()
    X = df.drop(target_col, axis=1)
    X = sm.add_constant(X)
    y = df[target_col]
    
    return X, y

In [84]:
X, y = preprocess(df_test)

In [85]:
preds = results.predict(X)

In [86]:
def mse(y, preds):
    return sum((preds_i - y_i)**2 for y_i, preds_i in zip(y, preds)) / len(preds)

In [87]:
mse(y, preds)

0.05460060295783258

In [88]:
lr = LinearRegression()

In [89]:
lr.fit(X_train, y_train)

In [90]:
lr.score(X, y)

0.12877425112575291

In [93]:
def preprocess2(df):
    df = retirar_outliers(df, 'qtde_funcionarios')
    df = retirar_outliers(df, 'faturamento')
    df = retirar_outliers(df, target_col)

    df = normalizar_minmax(df, 'qtde_funcionarios')
    df = normalizar_minmax(df, 'faturamento')
    df = normalizar_minmax(df, target_col)

    df = df.dropna()
    X = df.drop(target_col, axis=1)
    X = sm.add_constant(X)
    y = df[target_col]
    
    return X, y

lr2 = LinearRegression()
X_train2, y_train2 = preprocess2(df_train[['qtde_funcionarios', 'faturamento', target_col]])
lr2.fit(X_train2, y_train2)
X_test2, y_test2 = preprocess2(df_test[['qtde_funcionarios', 'faturamento', target_col]])
lr2.score(X_test2, y_test2)

0.09981048280605498