#### CARGA DE DATOS

In [1]:
import pandas as pd
import numpy as np
from funpymodeling.exploratory import status

In [2]:
data = pd.read_csv("marketing_campaign.csv", sep=';', index_col=0)
data.head(5)

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,...,7,0,0,0,0,0,0,3,11,1
2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,...,5,0,0,0,0,0,0,3,11,0
4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,...,4,0,0,0,0,0,0,3,11,0
6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,...,6,0,0,0,0,0,0,3,11,0
5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,...,5,0,0,0,0,0,0,3,11,0


In [3]:
status(data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,Year_Birth,0,0.0,0,0.0,59,int64
1,Education,0,0.0,0,0.0,5,object
2,Marital_Status,0,0.0,0,0.0,8,object
3,Income,24,0.010714,0,0.0,1974,float64
4,Kidhome,0,0.0,1293,0.577232,3,int64
5,Teenhome,0,0.0,1158,0.516964,3,int64
6,Dt_Customer,0,0.0,0,0.0,663,object
7,Recency,0,0.0,28,0.0125,100,int64
8,MntWines,0,0.0,13,0.005804,776,int64
9,MntFruits,0,0.0,400,0.178571,158,int64


#### PREPARACIÓN DE LA DATA

##### --- Uso unas funciones para reemplazar las numéricas sin comprometer (tanto) el valor promedio y la desviación estándar

In [4]:
def minmax(x, y):
    #Calcula un rango mínimo y máximo.
    resul1 = x - y
    resul2 = x + y
    resultados = {'min': resul1, 'max': resul2}
    return resultados

In [5]:
def imp_numericas(f):
    #Imputa valores faltantes (NaN) de forma vectorizada.
    #Se espera que la entrada 'f' sea un objeto tipo Series de pandas.
    if not isinstance(f, pd.Series):
        try:
            f = pd.Series(f, dtype=float)
        except ValueError:
            return f

    nan_mask = f.isna()
    
    if not np.any(nan_mask):
        return f
        
    mean_val = f.mean(skipna=True)
    std_val = f.std(skipna=True)
    
    mn_sd = minmax(x=round(mean_val), y=round(std_val))
    
    num_nan = np.sum(nan_mask)
    aleatorios = np.random.randint(mn_sd['min'], mn_sd['max'] + 1, size=num_nan)

    f_imputado = f.copy()
    
    f_imputado[nan_mask] = aleatorios

    f_imputado[nan_mask & (f_imputado < 1)] = 1
    f_imputado[nan_mask & (f_imputado > mn_sd['max'])] = mn_sd['max']
    
    return f_imputado

In [6]:
def imp_data(data):
    """
    Args:
        df (pd.DataFrame): El dataframe a procesar.

    Returns:
        pd.DataFrame: Un nuevo dataframe con las columnas numéricas imputadas.
    """
    df_imputado = data.copy()
    
    for column in df_imputado.columns:
        if pd.api.types.is_numeric_dtype(df_imputado[column]):
            #print(f"Procesando la columna numérica: '{column}'")
            df_imputado[column] = imp_numericas(df_imputado[column])
            
    return df_imputado

In [7]:
data_imp = imp_data(data)

In [8]:
# Comprobamos que no hay valores faltantes
status(data_imp)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,Year_Birth,0,0.0,0,0.0,59,int64
1,Education,0,0.0,0,0.0,5,object
2,Marital_Status,0,0.0,0,0.0,8,object
3,Income,0,0.0,0,0.0,1998,float64
4,Kidhome,0,0.0,1293,0.577232,3,int64
5,Teenhome,0,0.0,1158,0.516964,3,int64
6,Dt_Customer,0,0.0,0,0.0,663,object
7,Recency,0,0.0,28,0.0125,100,int64
8,MntWines,0,0.0,13,0.005804,776,int64
9,MntFruits,0,0.0,400,0.178571,158,int64


In [9]:
print("### Promedio data")
print(round(data['Income'].mean(), 2))

print("-"*40)
print("### Promedio data imputada")
print(round(data_imp['Income'].mean(), 2))

### Promedio data
52247.25
----------------------------------------
### Promedio data imputada
52268.56


In [10]:
print("### Desviación estándar data")
print(round(data['Income'].std(), 2))

print("-"*40)
print("### Desviación estándar data imputada")
print(round(data_imp['Income'].std(), 2))

### Desviación estándar data
25173.08
----------------------------------------
### Desviación estándar data imputada
25087.99


#### ELIMINAMOS COLUMNAS FECHA Y CON VALORES ÚNICOS

In [11]:
columnas_a_eliminar = ['Year_Birth', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue']
data_imp2 = data_imp.drop(columns=columnas_a_eliminar)

In [12]:
status(data_imp2)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,Education,0,0.0,0,0.0,5,object
1,Marital_Status,0,0.0,0,0.0,8,object
2,Income,0,0.0,0,0.0,1998,float64
3,Kidhome,0,0.0,1293,0.577232,3,int64
4,Teenhome,0,0.0,1158,0.516964,3,int64
5,Recency,0,0.0,28,0.0125,100,int64
6,MntWines,0,0.0,13,0.005804,776,int64
7,MntFruits,0,0.0,400,0.178571,158,int64
8,MntMeatProducts,0,0.0,1,0.000446,558,int64
9,MntFishProducts,0,0.0,384,0.171429,182,int64


#### TRANSFORMAMOS COLUMNAS OBJECT A NUMÉRICAS, YA QUE SON POCOS VALORES

In [13]:
data_imp2['Marital_Status'].unique()

array(['Single', 'Together', 'Married', 'Divorced', 'Widow', 'Alone',
       'Absurd', 'YOLO'], dtype=object)

In [14]:
class_map = {'Single':0, 'Married':1, 'Together':1, 'Divorced':2, 'Widow':3, 'Alone':0, 'Absurd':0, 'YOLO':0}
data_imp2['Marital_Status'] = data_imp2['Marital_Status'].map(class_map)

In [15]:
data_imp2['Education'].unique()

array(['Graduation', 'PhD', 'Master', 'Basic', '2n Cycle'], dtype=object)

In [16]:
class_map = {'Graduation':0, 'PhD':1, 'Master':2, 'Basic':3, '2n Cycle':4}
data_imp2['Education'] = data_imp2['Education'].map(class_map)

In [17]:
status(data_imp2)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,Education,0,0.0,1127,0.503125,5,int64
1,Marital_Status,0,0.0,487,0.217411,4,int64
2,Income,0,0.0,0,0.0,1998,float64
3,Kidhome,0,0.0,1293,0.577232,3,int64
4,Teenhome,0,0.0,1158,0.516964,3,int64
5,Recency,0,0.0,28,0.0125,100,int64
6,MntWines,0,0.0,13,0.005804,776,int64
7,MntFruits,0,0.0,400,0.178571,158,int64
8,MntMeatProducts,0,0.0,1,0.000446,558,int64
9,MntFishProducts,0,0.0,384,0.171429,182,int64


#### CLASIFICACIÓN

In [18]:
data_x = data_imp2.drop('Response', axis=1)
data_y = data_imp2['Response']

In [19]:
data_x = data_x.values
data_y = data_y.values

In [20]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)

##### Modelo predictivo

In [21]:
# Entrenamos el modelo
from sklearn.ensemble import RandomForestClassifier
# Creamos 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 99)
rf.fit(x_train, y_train)

#### Predicción clase y score

In [22]:
rf.predict(x_train)

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
pred_probs = rf.predict_proba(x_train)
pred_probs

array([[0.815     , 0.185     ],
       [0.96342857, 0.03657143],
       [0.998     , 0.002     ],
       ...,
       [0.986     , 0.014     ],
       [0.8515    , 0.1485    ],
       [0.993     , 0.007     ]])

In [24]:
y_prob_tr = pred_probs[:,1]
y_prob_tr

array([0.185     , 0.03657143, 0.002     , ..., 0.014     , 0.1485    ,
       0.007     ])

#### ------------------------------------------------------------------

##### Regresión

In [25]:
from sklearn.linear_model import LinearRegression

# a.Creamos modelo
model = LinearRegression()

# b. fiteamos
model.fit(x_train, y_train)

# c. obtenemos predicciónes para tr y ts
pred_tr = model.predict(x_train)
pred_ts = model.predict(x_test)

In [26]:
pred_tr[0:6]

array([0.23817743, 0.10259729, 0.11550038, 0.0115609 , 0.08069571,
       0.29737031])

In [27]:
# Guardar el modelo

#### RandomForest

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor 

model_rf = RandomForestRegressor()

In [29]:
params = {
    'n_estimators' : [10, 100, 300, 500,1000],
    'max_features': [50,100],
    #'bootstrap': [False, True],
    #'max_depth': [50, 500],
    #'min_samples_leaf': [3, 50],
    #'min_samples_split': [10, 50],
}

grid_rf = GridSearchCV(estimator = model_rf,
                        param_grid = params,
                        scoring = 'neg_mean_absolute_error',
                        cv = 5, 
                        verbose = 1
                        )

In [30]:
grid_rf.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [31]:
grid_rf.best_estimator_

In [32]:
grid_rf.predict(x_train)
grid_rf.predict(x_test)

array([0.034     , 0.53183333, 0.57508333, 0.55      , 0.026     ,
       0.32      , 0.825     , 0.0542    , 0.844     , 0.48833333,
       0.25366667, 0.114     , 0.568     , 0.0475    , 0.33733254,
       0.004     , 0.232     , 0.59333333, 0.026     , 0.078     ,
       0.75933333, 0.00933333, 0.018     , 0.0507    , 0.07      ,
       0.01      , 0.196     , 0.234     , 0.218     , 0.008     ,
       0.        , 0.416     , 0.034     , 0.60033333, 0.04983333,
       0.084     , 0.522     , 0.124     , 0.43      , 0.022     ,
       0.036     , 0.04916667, 0.048     , 0.078     , 0.108     ,
       0.        , 0.303     , 0.01866667, 0.07      , 0.002     ,
       0.        , 0.        , 0.012     , 0.1885    , 0.35313333,
       0.012     , 0.016     , 0.122     , 0.461     , 0.14233333,
       0.        , 0.002     , 0.041     , 0.232     , 0.39666667,
       0.256     , 0.214     , 0.276     , 0.004     , 0.        ,
       0.012     , 0.        , 0.252     , 0.156     , 0.008  

In [33]:
grid_rf.best_params_

{'max_features': 100, 'n_estimators': 500}

In [34]:
# Guardar el modelo
# rfc.pkl

In [36]:
import pickle

with open('rfc.pickle', 'wb') as handle:
    pickle.dump(grid_rf.best_params_, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#new_data = pd.read_csv("new_data.csv", sep=',')

In [37]:
with open('rfc.pickle', 'rb') as handle:
    rfc_tr = pickle.load(handle)

#### Regresión Lineal

In [41]:
x_data_reg = data_imp2.drop('Income', axis=1)
y_data_reg = data_imp2['Income']

In [42]:
x_data_reg = x_data_reg.values
y_data_reg = y_data_reg.values

In [43]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_data_reg, y_data_reg, test_size=0.3)

In [44]:
from sklearn.linear_model import LinearRegression

# a.Creamos modelo
model = LinearRegression()

# b. fiteamos
model.fit(x_train, y_train)

# c. obtenemos predicciónes para tr y ts
pred_tr = model.predict(x_train)
pred_ts = model.predict(x_test)

In [45]:
pred_tr[0:6]

array([41679.27953425, 39030.05444843, 50134.60351366, 77458.20050074,
       48356.17262167, 45794.04780172])

In [46]:
# Guardar el modelo
# lr.pkl
with open('lr.pkl', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### RandomForest Columna Income

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor 

model_rf = RandomForestRegressor()

In [48]:
params = {
    'n_estimators' : [10, 100, 300, 500,1000],
    'max_features': [50,100],
    #'bootstrap': [False, True],
    #'max_depth': [50, 500],
    #'min_samples_leaf': [3, 50],
    #'min_samples_split': [10, 50],
}

grid_rf = GridSearchCV(estimator = model_rf,
                        param_grid = params,
                        scoring = 'neg_mean_absolute_error',
                        cv = 5, 
                        verbose = 1
                        )

In [49]:
grid_rf.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [50]:
grid_rf.best_estimator_

In [51]:
grid_rf.predict(x_train)
grid_rf.predict(x_test)

array([ 53835.82      ,  55272.962     ,  60321.796     ,  76647.19      ,
        76751.836     ,  33580.004     ,  31347.25466667,  65858.082     ,
        65374.234     ,  55819.338     ,  54146.806     ,  37125.09266667,
        22616.138     ,  52727.468     ,  54733.728     ,  40950.376     ,
        80983.37      ,  26131.95      ,  77459.786     ,  74583.222     ,
        53112.796     ,  74813.646     ,  21288.254     ,  65154.442     ,
        37043.782     ,  31180.34      ,  33435.176     ,  49806.756     ,
        29849.375     ,  30530.762     ,  72219.562     ,  53093.6       ,
        23825.836     ,  28542.734     ,  69288.16      ,  24812.13      ,
        41772.196     ,  59344.044     ,  82369.104     ,  75223.334     ,
        71247.518     ,  22547.504     ,  67337.666     ,  58074.922     ,
        52682.858     ,  55944.798     ,  89699.71      ,  23051.868     ,
        73143.102     ,  35224.432     ,  69525.438     ,  45965.354     ,
        24251.124     ,  

In [52]:
grid_rf.best_params_

{'max_features': 100, 'n_estimators': 500}

#### Métricas de error

In [53]:
pd.concat([pd.DataFrame(grid_rf.cv_results_["params"]),
           pd.DataFrame(grid_rf.cv_results_["mean_test_score"], 
                        columns=["neg_mean_absolute_error"])],axis=1).sort_values('neg_mean_absolute_error', ascending=False)

Unnamed: 0,max_features,n_estimators,neg_mean_absolute_error
8,100,500,-6860.011024
3,50,500,-6865.17123
4,50,1000,-6892.134207
9,100,1000,-6904.059438
2,50,300,-6906.235653
6,100,100,-6939.442306
7,100,300,-6948.202464
1,50,100,-7019.808752
0,50,10,-7296.130348
5,100,10,-7402.531618


In [54]:
grid_rf.score(x_train, y_train)

-2457.2977136358113

In [55]:
grid_rf.score(x_test, y_test)

-6488.981232284579

In [56]:
# Guardar el modelo
# rfr.pkl
with open('rfr.pkl', 'wb') as handle:
    pickle.dump(grid_rf.best_estimator_, handle, protocol=pickle.HIGHEST_PROTOCOL)


#### CARGAR PROYECTO EN GITHUB

In [None]:
# Usar git y git-lfs para los .csv y .pkl

- Repliquen este notebook para la resolución del ejercicio.
- Consideren las etapas: 1) Cargamos los datos, 2) Preparación de la data, 3) Clasificación, 4) Regresión y 5) Guardar un modelo.

**Son libres de decidir:**
- Cómo preparar y acondicionar el dataset.
- Pueden agregar y eliminar columnas del dataset.
- Decidir parámetros para ajustar en los modelos de clasificación y regresión.

##

- Creen un modelo de clasificación utilizando Random Forest para la columna `Response`. 
- Guarden el modelo de clasificación Random forest como `rfc.pkl`.
- Creen un modelo con regresión lineal y con Random Forest + GridsearchCV para predecir la columna `Income`.
- Guardar ambos modelos de regresion en pkl `lr.pkl` y `rfr.pkl`
- Cargar proyecto en Github / Gitlab, usen git y git-lfs para los `.csv` y `.pkl`.