# Modelo 1

Vamos fazer um modelo simples de regressão linear usando a quantidade de funcionários e o faturamento da empresa para prever o volume de resíduo solicitado dentro de um ano.

As features serão normalizadas para evitar que as escalas façam alguma diferença. Também será necessário retirar outliers tanto da variável alvo quanto das duas features

In [4]:
import sqlalchemy as sql

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [5]:
target_col = 'ton | m3 - ano'

# Buscando dados
Vamos buscar os dados nos nossos bancos postgres e cadri

In [6]:
conn_empresas = sql.create_engine('postgresql://postgres:%24h%5B6%3B%3AsqyA4%25f6nL%5Bg@postgres.driva.io:5433/postgres')
conn_cadri = sql.create_engine('postgresql://postgres:%24h%5B6%3B%3AsqyA4%25f6nL%5Bg@postgres.driva.io:5432/cadri')

df_empresas = pd.read_sql(
    sql.text(f'SELECT * FROM clientes.verdera_empresas WHERE cnae_principal_subclasse IN {(2219600, 4520006, 2342701, 2342702)}'),
    con=conn_empresas.connect()
)
df_cadri = pd.read_sql(sql.text('SELECT * FROM cadri.cadri_e_pareceres_com_residuos'), con=conn_cadri.connect())

# Pré-processamento

In [7]:
df_empresas['tipo'] = df_empresas['cnae_principal_subclasse'].apply(lambda cnae: 'borracha' if cnae in [2219600, 4520006] else 'piso' if cnae in [2342701, 2342702] else 'desconhecido')

df_cadri['cadri_data'] = pd.to_datetime(df_cadri['cadri_data'], format='%d/%m/%Y')
df_cadri['cadri_ano_solicitado'] = df_cadri['cadri_data'].dt.year
df_residuo_solicitado_por_cnpj_ano = df_cadri[['geradora_cnpj', 'cadri_ano_solicitado', target_col]].groupby(['geradora_cnpj', 'cadri_ano_solicitado']).sum().reset_index()

df = pd.merge(
    df_empresas,
    df_residuo_solicitado_por_cnpj_ano,
    left_on='cnpj', right_on='geradora_cnpj', how='inner'
)[['qtde_funcionarios', target_col]]

In [9]:
df.head()

Unnamed: 0,qtde_funcionarios,ton | m3 - ano
0,1321.0,8.82
1,1321.0,6.36
2,1321.0,183.96
3,1321.0,60.0
4,1321.0,56.2


In [10]:
def categorizar_faixa_volume(y: float):
    limites = [
        (0, 1000),
        (1000, 5000),
        (5000,10000),
        (10000,20000),
        (20000,40000),
        (40000,80000),
        (80000,160000),
        (160000,320000),
        (320000,640000),
        (640000,float('inf')),
    ]
    
    for i, limite in enumerate(limites):
        lim_inferior, lim_superior = limite
        if lim_inferior < y < lim_superior:
            return i
    return -1

In [11]:
df['target'] = df[target_col].apply(categorizar_faixa_volume)
df[[target_col, 'target']].groupby('target').count()

Unnamed: 0_level_0,ton | m3 - ano
target,Unnamed: 1_level_1
0,592
1,64
2,5
3,4
4,3
5,1
6,1


In [30]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [31]:
df_train.shape, df_test.shape

((536, 5), (134, 5))

In [32]:
def retirar_outliers(df, col):
    '''Retira os outliers pelo método do IQR'''
    quant25 = df[col].quantile(0.25)
    quant75 = df[col].quantile(0.75)
    iqr = quant75 - quant25
    
    return df[(quant25 - 1.5*iqr < df[col]) & (df[col] < quant75 + 1.5*iqr)]

In [33]:
df_train = retirar_outliers(df_train, 'qtde_funcionarios')
df_train = retirar_outliers(df_train, 'faturamento')
# df_train = retirar_outliers(df_train, target_col)

In [35]:
def normalizar_minmax(df, col):
    '''Normaliza os dados pelo método min max'''
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    return df

In [36]:
df_train = normalizar_minmax(df_train, 'qtde_funcionarios')
df_train = normalizar_minmax(df_train, 'faturamento')
df_train = normalizar_minmax(df_train, 'cadri_ano_solicitado')
df_train = normalizar_minmax(df_train, target_col)

In [37]:
df_train = df_train.dropna()

In [39]:
df_train.head()

Unnamed: 0,qtde_funcionarios,faturamento,cadri_ano_solicitado,ton | m3 - ano,target
84,0.130786,0.188161,0.631579,0.0,0
488,0.03086,0.044397,0.526316,0.0,0
569,1.0,0.569108,0.789474,0.0,0
477,0.224835,0.323467,1.0,0.0,0
217,0.223365,0.321353,0.368421,0.0,0


In [40]:
X_train = df_train.drop(target_col, axis=1)
X_train = sm.add_constant(X_train)
y_train = df_train[target_col]

In [41]:
results = sm.OLS(y_train, X_train).fit()

In [42]:
results.summary()

0,1,2,3
Dep. Variable:,ton | m3 - ano,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,5.38e+31
Date:,"Tue, 28 Feb 2023",Prob (F-statistic):,0.0
Time:,15:12:01,Log-Likelihood:,14718.0
No. Observations:,420,AIC:,-29430.0
Df Residuals:,415,BIC:,-29410.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.544e-17,2.31e-17,-3.700,0.000,-1.31e-16,-4e-17
qtde_funcionarios,9.714e-17,5.95e-17,1.633,0.103,-1.98e-17,2.14e-16
faturamento,-6.939e-18,5.46e-17,-0.127,0.899,-1.14e-16,1e-16
cadri_ano_solicitado,5.464e-17,3.32e-17,1.647,0.100,-1.06e-17,1.2e-16
target,0.2500,1.73e-17,1.44e+16,0.000,0.250,0.250

0,1,2,3
Omnibus:,462.723,Durbin-Watson:,2.055
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22340.661
Skew:,-5.035,Prob(JB):,0.0
Kurtosis:,37.281,Cond. No.,13.5


In [43]:
def preprocess(df):
    df = retirar_outliers(df, 'qtde_funcionarios')
    df = retirar_outliers(df, 'faturamento')
    # df = retirar_outliers(df, target_col)

    df = normalizar_minmax(df, 'qtde_funcionarios')
    df = normalizar_minmax(df, 'faturamento')
    df = normalizar_minmax(df, 'cadri_ano_solicitado')
    df = normalizar_minmax(df, target_col)

    df = df.dropna()
    X = df.drop(target_col, axis=1)
    X = sm.add_constant(X)
    y = df[target_col]
    
    return X, y

In [44]:
X, y = preprocess(df_test)

In [45]:
preds = results.predict(X)

In [46]:
def mse(y, preds):
    return sum((preds_i - y_i)**2 for y_i, preds_i in zip(y, preds)) / len(preds)

In [47]:
mse(y, preds)

0.0009306987399770837

In [48]:
lr = LinearRegression()

In [49]:
lr.fit(X_train, y_train)

In [50]:
lr.score(X, y)

0.9349731848184819

In [51]:
def preprocess2(df):
    df = retirar_outliers(df, 'qtde_funcionarios')
    df = retirar_outliers(df, 'faturamento')
    # df = retirar_outliers(df, target_col)

    df = normalizar_minmax(df, 'qtde_funcionarios')
    df = normalizar_minmax(df, 'faturamento')
    df = normalizar_minmax(df, target_col)

    df = df.dropna()
    X = df.drop(target_col, axis=1)
    X = sm.add_constant(X)
    y = df[target_col]
    
    return X, y

lr2 = LinearRegression()
X_train2, y_train2 = preprocess2(df_train[['qtde_funcionarios', 'faturamento', target_col]])
lr2.fit(X_train2, y_train2)
X_test2, y_test2 = preprocess2(df_test[['qtde_funcionarios', 'faturamento', target_col]])
lr2.score(X_test2, y_test2)

-0.006438503336453794