# Notebook Base de Regressão Logística

In [1]:
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

## Classe de regressão logística
*Faz o trabalho pesado em regressão logística.*

In [2]:
class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient descent.

    Attributes:
        lr: Step size used for every gradient update.
        n_iters: Number of gradient-descent iterations run by ``fit``.
        weights: 1-D array with one coefficient per feature (``None`` until ``fit``).
        bias: Scalar intercept (``None`` until ``fit``).
    """

    def __init__(self, learning_rate=0.001, n_iters=1000):
        """Store the hyper-parameters; the model itself is created by ``fit``."""
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit weights and bias to the training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: Array-like of 0/1 labels with n_samples entries. A column
                vector of shape (n_samples, 1) is accepted and flattened.

        Raises:
            ValueError: If ``y`` does not hold exactly one label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: an (n, 1) column would otherwise broadcast against the
        # (n,) prediction vector and produce an (n, n) error matrix.
        y = np.asarray(y, dtype=float).reshape(-1)
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels."
            )

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # gradient descent
        for _ in range(self.n_iters):
            # linear combination of weights and x, plus bias, squashed to (0, 1)
            y_predicted = self._sigmoid(np.dot(X, self.weights) + self.bias)
            error = y_predicted - y

            # gradients of the mean log-loss with respect to weights and bias
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        """Return a NumPy array of 0/1 class labels for the rows of ``X``.

        A row is labelled 1 when its predicted probability is strictly above 0.5.
        """
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        # Vectorized threshold instead of a Python-level loop over samples.
        return (y_predicted > 0.5).astype(int)

    def plot(self, X, y, legend):
        """Scatter the samples and draw the fitted decision boundary.

        Points are coloured by the true label ``y`` and styled by the
        predicted label.

        Args:
            X: 2-D array-like with exactly two feature columns.
            y: True labels, one per row of ``X``.
            legend: Sequence whose first two items label the x and y axes.

        Raises:
            ValueError: If ``X`` does not have exactly two columns.
        """
        X = np.asarray(X)
        # only plots if X refers to exactly 2 variables
        if X.ndim != 2 or X.shape[1] != 2:
            raise ValueError("Can plot only for X's that refers to exactly 2 vars.")

        predictions = self.predict(X)

        sns.set_style('white')
        sns.scatterplot(
            x=X[:, 0],
            y=X[:, 1],
            hue=np.asarray(y).reshape(-1),
            style=predictions.reshape(-1),
        )

        ax = plt.gca()
        # Freeze the axis limits so the boundary line does not rescale the scatter.
        ax.autoscale(False)

        w0, w1 = self.weights
        if w1 != 0:
            # Boundary w0*x + w1*y + bias = 0 rewritten as y = intercept + slope*x.
            slope = -(w0 / w1)
            intercept = -(self.bias / w1)
            x_vals = np.array(ax.get_xlim())
            ax.plot(x_vals, intercept + (slope * x_vals), c="k")
        elif w0 != 0:
            # The second weight is zero: the boundary is the vertical line x = -bias/w0.
            ax.axvline(-(self.bias / w0), c="k")
        # Both weights zero: the model defines no boundary, so none is drawn.

        plt.xlabel(legend[0])
        plt.ylabel(legend[1])

    def _sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

        Only ``exp`` of non-positive values is computed, so large-magnitude
        inputs saturate to 0 or 1 instead of overflowing.
        """
        x = np.asarray(x, dtype=float)
        z = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + z), z / (1 + z))

## Importando a base
*Importa a base, seleciona as variáveis, limpa, normaliza.*


In [9]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("db_vinho.csv")

# Preview five random rows (no random_state is passed, so the sample changes between runs).
df.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1516,6.1,0.32,0.25,2.3,0.071,23.0,58.0,0.99633,3.42,0.97,10.6,5
358,11.9,0.43,0.66,3.1,0.109,10.0,23.0,1.0,3.15,0.85,10.4,7
1207,9.9,0.72,0.55,1.7,0.136,24.0,52.0,0.99752,3.35,0.94,10.0,5
611,13.2,0.38,0.55,2.7,0.081,5.0,16.0,1.0006,2.98,0.54,9.4,5
745,7.3,0.51,0.18,2.1,0.07,12.0,28.0,0.99768,3.52,0.73,9.5,6


In [10]:
def booleamize(x, threshold=5):
    """Convert a numeric score into a binary label.

    Args:
        x: Numeric score (the notebook applies this to the 'quality' column).
        threshold: Scores strictly greater than this value map to 1.
            The default of 5 gives 0 for scores 0-5 and 1 for scores 6-10.

    Returns:
        int: 1 if ``x > threshold``, otherwise 0.

    Raises:
        ValueError: If ``x`` is NaN, so missing scores are never silently
            turned into a label.
    """
    if x != x:  # NaN is the only value that is not equal to itself
        raise ValueError("cannot convert a NaN score to a binary label")
    # Explicit comparison: the cut-off does not depend on how round() breaks
    # the 0.5 tie, and out-of-range scores can never yield a label above 1.
    return int(x > threshold)

# Overwrite the 'quality' column with booleamize() applied to every value.
# NOTE(review): the column is replaced in place, so re-running this cell feeds
# already-converted values back into booleamize; run it once per fresh load of df.
df['quality'] = df['quality'].apply(booleamize)

In [11]:
# ======================================================== #
# ==================== Data selection ==================== #
# ======================================================== #
# 'X': feature columns, 'y': target column,
# 'normalizada': set to True to skip the mean-scaling step below.
dados = {
    'X' : ['fixed acidity', 'volatile acidity', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
    'y' : 'quality',
    'normalizada' : False
}
# ======================================================== #


# Keep only the selected feature/target columns and drop rows with missing
# values. The explicit copy makes the result an independent frame, so the
# column assignment below is not a write into a slice of another frame.
df = df[dados['X'] + [dados['y']]].dropna().copy()

if not dados['normalizada']:
    # Divide every feature by its own mean in one vectorized step
    # (each feature column ends up with mean 1).
    df[dados['X']] = df[dados['X']] / df[dados['X']].mean()

X = df[dados['X']].to_numpy()
# Selecting the target by name (not by a one-item list) yields a Series,
# so to_numpy() returns a 1-D label vector directly.
y = df[dados['y']].to_numpy()

df.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1212,0.817343,1.231479,0.322804,0.998228,1.011744,0.942036,0.997795,1
1250,0.853403,1.13675,0.79625,0.998388,1.026845,0.926842,1.045766,1
726,0.9736,1.3641,1.054494,1.002662,1.035905,1.093977,1.064954,1
1560,0.937541,1.13675,2.819157,0.999472,0.969462,0.790095,0.949824,0
577,1.057738,0.833617,2.388751,1.001458,0.996644,0.911648,0.911447,0


## Aplicando a regressão logística
*Treina o modelo e avalia a acurácia das previsões sobre os próprios dados de treino.*

In [19]:
# Train with a larger step size and more iterations than the class defaults (0.001 and 1000).
regressor = LogisticRegression(learning_rate=0.1, n_iters=10000)
regressor.fit(X, y)
# Predictions are made on the same X that was used for fitting, so any score
# computed from them measures fit to the training data, not generalization.
predictions = regressor.predict(X)

def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before comparison, so a column vector of shape
    (n, 1) and a 1-D vector of shape (n,) are compared element by element
    instead of being broadcast into an (n, n) matrix.

    Args:
        y_true: Array-like of true labels.
        y_pred: Array-like of predicted labels with the same number of items.

    Returns:
        Fraction of matching positions, between 0 and 1.

    Raises:
        ValueError: If the two inputs hold a different number of labels.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} labels but y_pred has {y_pred.size}."
        )
    return np.mean(y_true == y_pred)

# Report the share of predictions that match y, as computed by accuracy().
# NOTE(review): the message says "precisão" although the value is an accuracy;
# confirm the intended wording before changing the user-facing text.
print(f"A precisão do modelo é: {accuracy(y, predictions)}")

A precisão do modelo é: 0.7460913070669168


## Plotando os resultados visualmente
*~ disponível apenas se você estiver analisando exatamente 2 variáveis numéricas ~*

In [7]:
try:
    regressor.plot(X, y, dados['X'])
except ValueError:
    # plot() raises ValueError when X does not have exactly two feature
    # columns. Only that case is treated as "nothing to show"; any other
    # error (or a keyboard interrupt) propagates instead of being hidden.
    print("Sem visualização disponível.")

Sem visualização disponível.


## Mostrando os pesos
*Cria uma tabela mostrando os pesos de avaliação das variáveis numéricas.*

In [20]:
# Express each fitted coefficient as a signed share of the total absolute
# weight, so the absolute values add up to 1.
# NOTE(review): the column is labelled 'Pesos(%)' but the values are fractions,
# not percentages; confirm which one is intended.
norma_pesos = pd.Series(regressor.weights)
norma_pesos = norma_pesos.div(norma_pesos.abs().sum()).tolist()

# One row per feature name, in the same order as the weights.
dfpesos = pd.Series(norma_pesos, index=dados['X']).to_frame(name='Pesos(%)')

dfpesos

Unnamed: 0,Pesos(%)
fixed acidity,-0.023601
volatile acidity,-0.130716
total sulfur dioxide,-0.046467
density,-0.136581
pH,-0.088948
sulphates,0.091823
alcohol,0.481864
