In [1]:
import os

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import plotly.express as px

from sklearn import preprocessing
from sklearn.cluster import KMeans

In [2]:
# Get the PATH of an ancestor folder of the current working directory
def get_path(prev_folders:int=0):
    """Return the path of an ancestor of the current working directory.

    The working directory itself is never changed, so calling this function
    (or re-running the cell that calls it) always yields the same result.

    Parameters
    ----------
    prev_folders : int, default 0
        Number of folder levels to go up from the current working directory.
        Values below 1 are treated as 1, i.e. the parent folder is returned.

    Returns
    -------
    str
        The ancestor folder, using '/' as separator and ending with a
        single '/'.
    """
    levels_up = max(prev_folders, 1)  # at least the parent folder is returned
    path = os.getcwd()
    for _ in range(levels_up):
        path = os.path.dirname(path)  # dirname of a root folder is the root itself
    path = path.replace('\\', '/')
    # Strip before appending so a root folder does not end with '//'
    return path.rstrip('/') + '/'

# Environment settings

In [3]:
# Display every column when a dataframe is shown (no "..." truncation)
pd.set_option('display.max_columns', None)
# Folder returned by get_path; used as the prefix for file locations
PATH = get_path(prev_folders=2)

Adjunto encontrará el archivo “202306_Prueba_CD_prob_acuaticos.parquet”. En este documento se presenta información de trabajadores (afiliados cotizantes) y beneficiarios de la caja de compensación familiar Compensar con la siguiente información:
- Id (identificación)
- fuente (indica si el registro corresponde a un trabajador (afiliado) o un beneficiario)
- parentesco_beneficiario (indica la relación que tiene el beneficiario con el trabajador cotizante)
- PeriodoVinculación (fecha de vinculación a la caja)
- genero
- nivel_socioeconomico
- Categoria
- descripcion_familia (tipo de familia)
- edad
- Educación (indica si la persona ha tomado servicios de educación)
- Recreación (indica si la persona ha tomado servicios de recreación)
- Acondicionamiento Físico (indica si la persona ha tomado servicios de acondicionamiento físico)
- Motricidad (indica si la persona ha tomado servicios de motricidad)
- Deportes Terrestres (indica si la persona ha tomado servicios de deportes terrestres)
- Recompra (indica si la persona ha tomado servicios de deportes acuáticos en años anteriores)
- Objetivo (indica si la persona ha tomado servicios de deportes acuáticos en el último año)

# Read data

In [4]:
# Location of the parquet file, built from the PATH folder
parquet_file = PATH + 'data/202306_Prueba_CD_prob_acuaticos.parquet'
df = pd.read_parquet(parquet_file)

# Preprocessing

In [5]:
# Original header -> snake_case name without accents or spaces
column_names = {
    'PeriodoVinculación': 'periodo_vinculacion',
    'Categoria': 'categoria',
    'Educación': 'educacion',
    'Recreación': 'recreacion',
    'Acondicionamiento Físico': 'acondicionamiento_fisico',
    'Motricidad': 'motricidad',
    'Deportes Terrestres': 'deportes_terrestres',
    'Recompra': 'recompra',
    'Objetivo': 'objetivo',
}
df = df.rename(columns=column_names)

df = df.assign(
    # Missing relationship values get the explicit label 'Ninguno'
    parentesco_beneficiario=df['parentesco_beneficiario'].fillna('Ninguno'),
    # Parse the affiliation date, given as year-month-day, into datetime
    periodo_vinculacion=pd.to_datetime(df['periodo_vinculacion'], format='%Y-%m-%d'),
)

# Descriptive analysis

a. Realice un análisis descriptivo de la información presentada.

In [6]:
# Column names, non-null counts and dtypes of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        1000 non-null   int32         
 1   fuente                    1000 non-null   object        
 2   parentesco_beneficiario   1000 non-null   object        
 3   periodo_vinculacion       1000 non-null   datetime64[ns]
 4   genero                    1000 non-null   object        
 5   nivel_socioeconomico      1000 non-null   int32         
 6   categoria                 1000 non-null   object        
 7   descripcion_familia       787 non-null    object        
 8   edad                      1000 non-null   int32         
 9   educacion                 1000 non-null   object        
 10  recreacion                1000 non-null   int32         
 11  acondicionamiento_fisico  1000 non-null   int32         
 12  motricidad           

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

Unnamed: 0,id,nivel_socioeconomico,edad,recreacion,acondicionamiento_fisico,motricidad,deportes_terrestres,recompra,objetivo
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5095164.0,1.782,32.567,0.027,0.026,0.012,0.036,0.091,0.1
std,2924070.0,1.728428,19.367979,0.162164,0.159215,0.10894,0.186383,0.287753,0.30015
min,115419.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2439276.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5137560.0,2.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7717380.0,3.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9994874.0,6.0,101.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Number of missing values in each column
df.isna().sum()

id                            0
fuente                        0
parentesco_beneficiario       0
periodo_vinculacion           0
genero                        0
nivel_socioeconomico          0
categoria                     0
descripcion_familia         213
edad                          0
educacion                     0
recreacion                    0
acondicionamiento_fisico      0
motricidad                    0
deportes_terrestres           0
recompra                      0
objetivo                      0
dtype: int64

In [9]:
# Random sample of 15 rows; the fixed seed keeps the displayed rows the same
# on every run (the sample already has 15 rows, so no extra head() is needed)
df.sample(n=15, random_state=42)

Unnamed: 0,id,fuente,parentesco_beneficiario,periodo_vinculacion,genero,nivel_socioeconomico,categoria,descripcion_familia,edad,educacion,recreacion,acondicionamiento_fisico,motricidad,deportes_terrestres,recompra,objetivo
325,3293377,Beneficiario,Hijos,2019-04-01,Masculino,5,A,MonoParental,22,NO,0,0,0,0,0,0
858,2247803,Afiliado,Ninguno,2023-02-08,Femenino,3,A,,25,NO,0,0,0,0,0,0
847,3238276,Afiliado,Ninguno,2022-05-10,Femenino,3,A,Unipersonal,31,NO,0,0,0,0,0,0
704,5290572,Afiliado,Ninguno,2023-04-01,Femenino,4,C,,48,NO,0,0,0,0,0,0
380,2044722,Afiliado,Ninguno,2022-01-03,Femenino,0,A,Extensas,39,NO,0,0,0,1,0,0
745,9070878,Beneficiario,Hijos,2022-01-17,Masculino,0,B,Extensas,3,NO,1,0,0,0,0,0
519,5214880,Afiliado,Ninguno,2023-01-26,Masculino,2,A,,35,NO,0,0,0,0,0,0
93,9547560,Afiliado,Ninguno,2023-03-15,Masculino,4,A,,45,NO,0,0,0,0,0,0
827,2690422,Beneficiario,Hijos,2021-02-01,Masculino,4,A,MonoParental,17,NO,0,0,0,0,0,0
21,9572454,Afiliado,Ninguno,2021-09-25,Masculino,0,A,Extensas,35,NO,0,0,0,0,0,0


# Identification model for water sports

b. Desarrolle un modelo que permita identificar clientes potenciales para adquirir servicios de deportes acuáticos.

## Imputation Hot-Deck (variation)

In [10]:
# Encode each categorical column as integer codes in a new 'ENC_<column>' column.
# The same encoder object is refitted for every column, so after the loop it
# holds the classes of the last column only.
label_encoder = preprocessing.LabelEncoder()

for column in ('fuente', 'parentesco_beneficiario', 'categoria'):
    df['ENC_' + column] = label_encoder.fit_transform(df[column])

In [11]:
# Variables related to the category to be predicted
cluster_features = [
    'ENC_fuente', 'ENC_parentesco_beneficiario',
    'nivel_socioeconomico',
    'ENC_categoria'
    ]
X = df[cluster_features].values

# Clustering
# random_state makes the cluster assignment reproducible between runs;
# n_init is given explicitly so the result does not depend on the library
# default (which also removes the warning about that default).
kmeans_model = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans_model = kmeans_model.fit(X)
labels = kmeans_model.predict(X)  # one cluster id per row of X, in row order
centroids = kmeans_model.cluster_centers_  # centroids values

# Store the cluster id of every row in a single vectorized assignment instead
# of predicting each row again inside a Python loop.
# NOTE(review): these are cluster ids, not family descriptions, and the
# missing values of 'descripcion_familia' are not filled in this cell.
df['PRED_descripcion_familia'] = labels

  super()._check_params_vs_input(X, default_n_init=10)
found 0 physical cores < 1
  File "c:\Users\almontao\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


# Models

## Ordinary Least Squares regression (OLS)

In [12]:
# Explanatory variables of the linear model fitted on 'objetivo'
predictors = [
    'fuente', 'parentesco_beneficiario', 'genero', 'nivel_socioeconomico',
    'categoria', 'descripcion_familia', 'recreacion',
    'acondicionamiento_fisico', 'motricidad', 'recompra',
]
formula = 'objetivo ~ ' + ' + '.join(predictors)

regresion = smf.ols(formula, data=df)
results = regresion.fit()

# Text report with fit statistics and the coefficient table
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               objetivo   R-squared:                       0.507
Model:                            OLS   Adj. R-squared:                  0.495
Method:                 Least Squares   F-statistic:                     43.81
Date:                Sun, 29 Oct 2023   Prob (F-statistic):          8.76e-105
Time:                        15:27:55   Log-Likelihood:                 26.570
No. Observations:                 787   AIC:                            -15.14
Df Residuals:                     768   BIC:                             73.56
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

## 