In [1]:
import janitor
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler  # feature scaling applied to the data before fitting the model
from sklearn.linear_model import LinearRegression  # linear regression model; NOTE(review): not referenced in the cells visible here

### Regresión Logística
La regresión logística es un tipo de modelo de aprendizaje automático supervisado que se utiliza para predecir la probabilidad de que un evento ocurra. La regresión logística se basa en la función logística, una función sigmoide cuya salida siempre está entre 0 y 1, por lo que puede interpretarse como una probabilidad. Aunque se llama regresión, se utiliza comúnmente para problemas de clasificación binaria (dos clases), aunque también se puede utilizar para clasificación multiclase.

La regresión logística se puede utilizar en los siguientes casos:

* **Cuando la variable dependiente es categórica.** La regresión logística se utiliza para predecir variables dependientes que pueden tomar un número limitado de valores.
* **Probabilidad de eventos.** Cuando se desea modelar la probabilidad de un evento en función de variables predictoras.


In [2]:
# Read the raw Social Network Ads CSV (path is relative to the notebook)
# and preview the first rows.
raw_csv_path = '../../data/raw/Social_Network_Ads.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
# Keep only the object-dtype columns and remember their labels; these are
# the columns handed to the one-hot encoder further down.
object_dtype_frame = data.select_dtypes(include='object')
categorical_columns = object_dtype_frame.columns
categorical_columns

Index(['Gender'], dtype='object')

### Transformación de las variables categóricas

In [5]:
from sklearn.preprocessing import OneHotEncoder
import sklearn.compose


In [6]:
import sklearn.preprocessing


# Column transformer: one-hot encode the categorical columns and pass every
# other column through unchanged.
onehot_step = sklearn.preprocessing.OneHotEncoder()
transformer = sklearn.compose.make_column_transformer(
    (onehot_step, categorical_columns),
    remainder='passthrough',
)

In [7]:
# Fit the transformer and apply it to the raw frame first, then read the
# generated feature names (same call order as before).
encoded_values = transformer.fit_transform(data)

# Drop the step prefixes added by the column transformer so the columns keep
# readable names (e.g. 'Gender_Male', 'Age').
encoded_labels = [
    label.removeprefix('onehotencoder__').removeprefix('remainder__')
    for label in transformer.get_feature_names_out()
]

encoded_df = pd.DataFrame(
    encoded_values,
    columns=encoded_labels,
    index=data.index,
)
encoded_df

Unnamed: 0,Gender_Female,Gender_Male,User ID,Age,EstimatedSalary,Purchased
0,0.0,1.0,15624510.0,19.0,19000.0,0.0
1,0.0,1.0,15810944.0,35.0,20000.0,0.0
2,1.0,0.0,15668575.0,26.0,43000.0,0.0
3,1.0,0.0,15603246.0,27.0,57000.0,0.0
4,0.0,1.0,15804002.0,19.0,76000.0,0.0
...,...,...,...,...,...,...
395,1.0,0.0,15691863.0,46.0,41000.0,1.0
396,0.0,1.0,15706071.0,51.0,23000.0,1.0
397,1.0,0.0,15654296.0,50.0,20000.0,1.0
398,0.0,1.0,15755018.0,36.0,33000.0,0.0


In [8]:
# Select features and target by column label rather than by position.
# Positional indices depend on how many one-hot columns the encoder emits,
# so they would silently point at the wrong columns if that number changed.
feature_columns = ['Age', 'EstimatedSalary']
target_column = 'Purchased'

# X stays a DataFrame with the two predictors; y is a NumPy array of labels.
X = encoded_df[feature_columns]

y = encoded_df[target_column].values


In [9]:
from sklearn.model_selection import train_test_split


In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Report the (rows, columns) shape of the train split, then the test split.
for split_features in (X_train, X_test):
    print(split_features.shape)

(320, 2)
(80, 2)


In [12]:
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [13]:
# Fit the scaler on the training features only, then apply that same fitted
# scaler to the test features.
# NOTE: X_train and X_test are overwritten with their scaled versions, so
# re-running this cell on its own would fit on already-scaled data.
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

In [14]:
from sklearn.linear_model import LogisticRegression


In [15]:
# Build the logistic regression classifier with a fixed seed and fit it on
# the scaled training split.
log_reg = LogisticRegression(random_state=0)
log_reg.fit(X=X_train, y=y_train)

In [16]:
# Predict labels for the scaled test features with the fitted model.
y_pred = log_reg.predict(X_test)


In [17]:
# Show the first ten true labels next to the first ten predictions.
preview_rows = slice(None, 10)
print('Reales:', y_test[preview_rows], 'Predicción:', y_pred[preview_rows])


Reales: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] Predicción: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]


In [18]:
from sklearn.metrics import confusion_matrix


In [19]:
# Tabulate the true test labels against the predicted ones.
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[57,  1],
       [ 5, 17]])

In [20]:

# Precision, recall and F1 of the predictions on the held-out test split.
from sklearn.metrics import precision_score, recall_score, f1_score

# The recall line was previously labelled 'Memoria', a literal mistranslation
# of "recall"; the metric is the sensitivity of the classifier, so it is
# labelled as such (with the English term kept for reference).
print('Precisión:', precision_score(y_test, y_pred))
print('Sensibilidad (recall):', recall_score(y_test, y_pred))
print('F1_score:', f1_score(y_test, y_pred))
     

Precisión: 0.9444444444444444
Memoria: 0.7727272727272727
F1_score: 0.85
