In [1]:
## IMPORTS
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances 
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
## Train Data Reading & Preparation

# Explicit dtypes for the product-ownership flags and the customer id.
# NOTE(review): the two 'Int64' (nullable) entries presumably exist because
# those columns contain missing values in the raw file — confirm.
dtypes = {
    'ind_cco_fin_ult1': 'uint8', 'ind_deme_fin_ult1': 'uint8',
    'ind_aval_fin_ult1': 'uint8', 'ind_valo_fin_ult1': 'uint8',
    'ind_reca_fin_ult1': 'uint8', 'ind_ctju_fin_ult1': 'uint8',
    'ind_cder_fin_ult1': 'uint8', 'ind_plan_fin_ult1': 'uint8',
    'ind_fond_fin_ult1': 'uint8', 'ind_hip_fin_ult1': 'uint8',
    'ind_pres_fin_ult1': 'uint8', 'ind_nomina_ult1': 'Int64',
    'ind_cno_fin_ult1': 'uint8', 'ind_ctpp_fin_ult1': 'uint8',
    'ind_ahor_fin_ult1': 'uint8', 'ind_dela_fin_ult1': 'uint8',
    'ind_ecue_fin_ult1': 'uint8', 'ind_nom_pens_ult1': 'Int64',
    'ind_recibo_ult1': 'uint8', 'ind_deco_fin_ult1': 'uint8',
    'ind_tjcr_fin_ult1': 'uint8', 'ind_ctop_fin_ult1': 'uint8',
    'ind_viv_fin_ult1': 'uint8', 'ind_ctma_fin_ult1': 'uint8',
    'ncodpers': 'uint32',
}

# Placeholders that are not used inside this cell; kept because other cells
# of the notebook may reference them.
cols_to_cast = {"renta": float}
group_to_binary = ['indfall', 'conyuemp', 'indext', 'indresi']
kmeans_prediction = []

# Columns parsed as datetimes while reading.
parse_dates = ['fecha_dato', 'fecha_alta']

df = pd.read_csv("./datasets/train_ver2.csv", dtype=dtypes, parse_dates=parse_dates)

print('Dataframe initialized')

print('--------------------------- (BEFORE) ---------------------------')
# A bare `df.shape` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is displayed), so print it explicitly.
print(df.shape)
# For a deeper look run df.describe(), df.info() or df.isna().sum().sum()
# in a scratch cell.

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Dataframe initialized
--------------------------- (BEFORE) ---------------------------


In [3]:
## Data Cleaning

# remove any duplicate rows
df = df.drop_duplicates()

## Split every date column into <column>_day / _month / _year features.
# The previous hand-written version assigned the year of `fecha_dato` to
# `fecha_dato_day`, which destroyed the day feature and never created
# `fecha_dato_year`; generating the names in a loop rules that out.
seperation_cols = ['day', 'month', 'year']
date_cols = ['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t']

for date_col in date_cols:
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")
    for part in seperation_cols:
        # Missing dates (NaT) yield NaN in the derived column.
        df[f"{date_col}_{part}"] = getattr(df[date_col].dt, part)

## Convert age to numeric and replace implausible / unknown values.
df["age"] = pd.to_numeric(df["age"], errors="coerce")
# Ages below 18 are replaced by the mean age of the 18-30 group ...
df.loc[df["age"] < 18, "age"] = df.loc[df["age"].between(18, 30), "age"].mean(skipna=True)
# ... and ages above 100 by the mean age of the 30-100 group.
df.loc[df["age"] > 100, "age"] = df.loc[df["age"].between(30, 100), "age"].mean(skipna=True)
# Assign the result back instead of `df["age"].fillna(..., inplace=True)`:
# in-place fillna on a column selection is deprecated and does not modify
# `df` under copy-on-write, which would make the int cast below fail on NaN.
df["age"] = df["age"].fillna(df["age"].mean()).astype(int)

## fill missing cod_prov with 0
df["cod_prov"] = df["cod_prov"].fillna(0)

# Drop columns that are no longer needed: the raw dates were expanded above,
# tipodom is not useful, and nomprov duplicates the province code in cod_prov.
df = df.drop(columns=['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t', 'tipodom', 'nomprov'])

print('--------------------------- (AFTER) ---------------------------')
print('##################### Null Count ')
# Total number of missing cells left in the frame (displayed as cell output).
df.isna().sum().sum()

--------------------------- (AFTER) ---------------------------
##################### Null Count 


58347484

In [4]:
## Models and Helper Functions
# User-user cosine similarity matrix; stays None until it is computed.
cosine_sim = None

# Product column name -> human readable product label.
product_names = {
    "ind_ahor_fin_ult1": "Saving Account",
    "ind_aval_fin_ult1": "Guarantees",
    "ind_cco_fin_ult1": "Current Accounts",
    "ind_cder_fin_ult1": "Derivada Account",
    "ind_cno_fin_ult1": "Payroll Account",
    "ind_ctju_fin_ult1": "Junior Account",
    "ind_ctma_fin_ult1": "Más particular Account",
    "ind_ctop_fin_ult1": "particular Account",
    "ind_ctpp_fin_ult1": "particular Plus Account",
    "ind_deco_fin_ult1": "Short-term deposits",
    "ind_deme_fin_ult1": "Medium-term deposits",
    "ind_dela_fin_ult1": "Long-term deposits",
    "ind_ecue_fin_ult1": "e-account",
    "ind_fond_fin_ult1": "Funds",
    "ind_hip_fin_ult1": "Mortgage",
    "ind_plan_fin_ult1": "Pensions",
    "ind_pres_fin_ult1": "Loans",
    "ind_reca_fin_ult1": "Taxes",
    "ind_tjcr_fin_ult1": "Credit Card",
    "ind_valo_fin_ult1": "Securities",
    "ind_viv_fin_ult1": "Home Account",
    "ind_nomina_ult1": "Payroll",
    "ind_nom_pens_ult1": "Pensions",
    "ind_recibo_ult1": "Direct Debit",
}

# Province code -> province name; code 0 stands for an unknown province.
province_names = {
    29: 'MALAGA',
    13: 'CIUDAD REAL',
    50: 'ZARAGOZA',
    45: 'TOLEDO',
    24: 'LEON',
    20: 'GIPUZKOA',
    10: 'CACERES',
    17: 'GIRONA',
    49: 'ZAMORA',
    8: 'BARCELONA',
    37: 'SALAMANCA',
    9: 'BURGOS',
    22: 'HUESCA',
    31: 'NAVARRA',
    5: 'AVILA',
    40: 'SEGOVIA',
    27: 'LUGO',
    25: 'LERIDA',
    28: 'MADRID',
    3: 'ALICANTE',
    42: 'SORIA',
    41: 'SEVILLA',
    39: 'CANTABRIA',
    7: 'BALEARS, ILLES',
    47: 'VALLADOLID',
    36: 'PONTEVEDRA',
    46: 'VALENCIA',
    44: 'TERUEL',
    15: 'CORUÑA, A',
    32: 'OURENSE',
    23: 'JAEN',
    16: 'CUENCA',
    48: 'BIZKAIA',
    12: 'CASTELLON',
    26: 'RIOJA, LA',
    2: 'ALBACETE',
    6: 'BADAJOZ',
    30: 'MURCIA',
    11: 'CADIZ',
    4: 'ALMERIA',
    19: 'GUADALAJARA',
    34: 'PALENCIA',
    35: 'PALMAS, LAS',
    14: 'CORDOBA',
    21: 'HUELVA',
    18: 'GRANADA',
    33: 'ASTURIAS',
    38: 'SANTA CRUZ DE TENERIFE',
    52: 'MELILLA',
    43: 'TARRAGONA',
    1: 'ALAVA',
    51: 'CEUTA',
    0: 'UNKNOWN',
}

def change_names(col_names, map_products):
    '''
    Translate each entry of `col_names` through the `map_products` lookup
    (e.g. "ind_recibo_ult1" -> "Direct Debit") and return the results as a
    list in the same order. A name missing from the mapping raises KeyError.
    '''
    return [map_products[name] for name in col_names]


def popularity_based(df):
    """
    Compute, for every product column, the share of rows that own the product.

    The first column of `df` is skipped (presumably the customer id — verify
    against the caller); every other column is expected to hold 0/1 ownership
    flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns after the first one are product flags.

    Returns
    -------
    dict
        {column name: fraction of rows equal to 1}, rounded to 4 decimals and
        listed in column order (not sorted by popularity). Range is <0, 1>.
        Products nobody owns map to 0.0; an empty frame yields 0.0 everywhere.
    """
    n_rows = df.shape[0]
    top_col = {}

    for col in df.columns[1:]:
        if n_rows == 0:
            # Nothing to count and nothing to divide by.
            top_col[col] = 0.0
            continue

        # Count the owners directly: `value_counts()[1]` raises KeyError when
        # the value 1 never occurs in the column.
        owners = (df[col] == 1).sum()
        top_col[col] = np.around(owners / n_rows, decimals=4)

    return top_col


def useritem(user_id, df, sim_matrix=None):
    """
    Recommend products for one user from the ownership of similar users.

    The similarity threshold starts at 0.79 and is lowered in steps of 0.025
    until at least 20 distinct users fall inside (threshold, 0.99), or until
    the next step would take the threshold below 0.65. All users inside the
    final band are used, so more than 20 neighbours may contribute.

    Parameters
    ----------
    user_id : hashable
        Label of the user in `df.index`. If the label occurs several times the
        first occurrence is used.
    df : pandas.DataFrame
        One row per user, one column per product (ownership flags).
    sim_matrix : 2-D array-like, optional
        User-user similarity matrix whose rows/columns follow the row order of
        `df`. When omitted, the module-level `cosine_sim` is looked up at call
        time. (A default of `sim_matrix=cosine_sim` is evaluated only once,
        when the function is defined, and would stay None forever.)

    Returns
    -------
    dict
        {product column: mean ownership among the similar users}, range
        <0, 1>. Every value is 0 when no sufficiently similar user exists.

    Raises
    ------
    TypeError
        If no similarity matrix is available.
    ValueError
        If `user_id` is not in `df.index`.
    """
    if sim_matrix is None:
        sim_matrix = globals().get("cosine_sim")
    if sim_matrix is None:
        raise TypeError(
            "sim_matrix is None: pass a similarity matrix or compute the "
            "module-level cosine_sim first"
        )

    # Positional index of the user inside the similarity matrix.
    positions = np.flatnonzero(np.asarray(df.index == user_id))
    if positions.size == 0:
        raise ValueError(f"{user_id!r} is not in the index of df")
    cos_id = int(positions[0])

    similarities = np.ravel(np.asarray(sim_matrix)[cos_id])[:len(df)]

    min_neighbours = 20   # stop widening once this many users are found
    sim_upper = 0.99      # >= 0.99 is treated as the user themself
    sim_min = 0.79        # initial lower bound of the similarity band
    sim_step = 0.025
    sim_floor = 0.65

    while True:
        in_band = (similarities > sim_min) & (similarities < sim_upper)
        # Count distinct users. Incrementing a counter on every pass counts
        # the same users again after each threshold step and ends the search
        # before `min_neighbours` different users have been collected.
        if np.count_nonzero(in_band) >= min_neighbours:
            break
        sim_min -= sim_step
        if sim_min < sim_floor:
            break

    neighbour_positions = np.flatnonzero(in_band)

    if neighbour_positions.size == 0:
        # Nobody is similar enough: recommendation probability is 0.
        return {col: 0 for col in df.columns}

    df_user_k = df.iloc[neighbour_positions]

    # Mean ownership per product; the order of the neighbours is irrelevant
    # for a mean, so they are not sorted by similarity.
    return {col: np.mean(df_user_k[col].to_numpy()) for col in df.columns}

def modelbased(user_id, df, model=None):
    """
    Recommend products for one user with a per-product classifier.

    For every column of `df` the model is fitted with that column as target
    and all remaining columns as features; the predicted probability of
    class 1 for the row of `user_id` is recorded.

    Parameters
    ----------
    user_id : hashable
        Label of the user in `df.index`. If several rows carry the label the
        first one is used.
    df : pandas.DataFrame
        Numeric frame; every column must be castable to int because each one
        is used as a classification target in turn.
    model : estimator, optional
        Object with `fit`, `predict_proba` and a fitted `classes_` attribute.
        It is refitted in place once per column. Defaults to a fresh
        `DecisionTreeClassifier(max_depth=9)` created on every call, so no
        estimator instance is shared between calls through the signature.

    Returns
    -------
    dict
        {column: probability that the user owns the product}, range <0, 1>.
        A column in which the value 1 never occurs gets probability 0.0.

    Raises
    ------
    ValueError
        If `user_id` is not in `df.index`.
    """
    if model is None:
        model = DecisionTreeClassifier(max_depth=9)

    # The user's row selector does not depend on the target column.
    user_mask = np.asarray(df.index == user_id)
    if not user_mask.any():
        raise ValueError(f"{user_id!r} is not in the index of df")

    mdbs = {}

    for c in df.columns:
        y_train = df[c].astype('int')
        x_train = df.drop([c], axis=1)
        model.fit(x_train, y_train)

        # Locate the probability column of class 1 through `classes_`:
        # a target with a single class yields a one-column predict_proba
        # output, so blindly taking column 1 would raise IndexError.
        classes = list(model.classes_)
        if 1 in classes:
            proba = np.asarray(model.predict_proba(x_train[user_mask]))
            mdbs[c] = proba[0, classes.index(1)]
        else:
            mdbs[c] = 0.0

    return mdbs

In [21]:
## PLAYGROUND
 
#df_light = df[:10000]
# cosine_sim = 1 - pairwise_distances(df_light, metric="cosine")

#dum=pd.get_dummies(df_light)

#dum.replace([np.inf, -np.inf], 0, inplace=True)
# dum.fillna(0.0, inplace=True)
# df_light
#df.ind_empleado.unique()
# popularity_based(df)
#modelbased(user_id=1061608,df=dum)

# change_names([29.0],province_names)

# change_names(['ind_ecue_fin_ult1'],product_names)