In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

#Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import math

#Memory management 
import gc
import random
import time
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, MinMaxScaler    
# Shared, not-yet-fitted scaler instances created once at import time.
scaler, minmax = StandardScaler(), MinMaxScaler()


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Locations of the train / test CSV files.
train_file = "/kaggle/input/santander-pr/train.csv"
test_file = "/kaggle/input/santander-pr/test.csv"

# The 24 product-indicator columns, listed one per line.
targetcols = [
    "ind_ahor_fin_ult1",
    "ind_aval_fin_ult1",
    "ind_cco_fin_ult1",
    "ind_cder_fin_ult1",
    "ind_cno_fin_ult1",
    "ind_ctju_fin_ult1",
    "ind_ctma_fin_ult1",
    "ind_ctop_fin_ult1",
    "ind_ctpp_fin_ult1",
    "ind_deco_fin_ult1",
    "ind_deme_fin_ult1",
    "ind_dela_fin_ult1",
    "ind_ecue_fin_ult1",
    "ind_fond_fin_ult1",
    "ind_hip_fin_ult1",
    "ind_plan_fin_ult1",
    "ind_pres_fin_ult1",
    "ind_reca_fin_ult1",
    "ind_tjcr_fin_ult1",
    "ind_valo_fin_ult1",
    "ind_viv_fin_ult1",
    "ind_nomina_ult1",
    "ind_nom_pens_ult1",
    "ind_recibo_ult1",
]

# Column -> dtype mapping passed to pd.read_csv.  Every product indicator is
# read as uint8 except ind_nomina_ult1 and ind_nom_pens_ult1, which are read
# as float64 (presumably because they contain missing values -- TODO confirm).
dtype_list = {
    'ind_cco_fin_ult1': 'uint8', 'ind_deme_fin_ult1': 'uint8', 'ind_aval_fin_ult1': 'uint8',
    'ind_valo_fin_ult1': 'uint8', 'ind_reca_fin_ult1': 'uint8', 'ind_ctju_fin_ult1': 'uint8',
    'ind_cder_fin_ult1': 'uint8', 'ind_plan_fin_ult1': 'uint8', 'ind_fond_fin_ult1': 'uint8',
    'ind_hip_fin_ult1': 'uint8', 'ind_pres_fin_ult1': 'uint8',
    'ind_nomina_ult1': 'float64',
    'ind_cno_fin_ult1': 'uint8', 'ind_ctpp_fin_ult1': 'uint8', 'ind_ahor_fin_ult1': 'uint8',
    'ind_dela_fin_ult1': 'uint8', 'ind_ecue_fin_ult1': 'uint8',
    'ind_nom_pens_ult1': 'float64',
    'ind_recibo_ult1': 'uint8', 'ind_deco_fin_ult1': 'uint8', 'ind_tjcr_fin_ult1': 'uint8',
    'ind_ctop_fin_ult1': 'uint8', 'ind_viv_fin_ult1': 'uint8', 'ind_ctma_fin_ult1': 'uint8',
    'ncodpers': 'uint32',
}

# Author's note of two column names: ['fecha_alta','canal_entrada']
# (only 'canal_entrada' appears in feature_cols below).

# Customer-attribute columns, grouped a few per line.
feature_cols = [
    'ncodpers', 'fecha_dato', 'age', 'renta', 'nomprov',
    'ind_nuevo', 'segmento', 'ind_actividad_cliente',
    'pais_residencia', 'ind_empleado', 'sexo',
    'tiprel_1mes', 'indrel_1mes', 'antiguedad',
    'indrel', 'indext', 'indresi', 'indfall',
    'canal_entrada',
]

In [None]:
from tqdm import tqdm

def string_num_age(x):
    """Convert one raw ``age`` cell into a number.

    Parameters
    ----------
    x : object
        A raw cell value.  Strings are expected to hold either an integer
        (possibly padded with whitespace, e.g. ``' 23'``) or the missing-value
        marker ``NA`` (again possibly padded, e.g. ``' NA'``).

    Returns
    -------
    int, float or object
        ``int`` for numeric strings, ``np.nan`` for the ``NA`` marker or a
        blank string, and ``x`` unchanged when it is not a string.

    Raises
    ------
    ValueError
        If ``x`` is a non-blank string that is neither ``NA`` nor an integer.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if isinstance(x, str):
        text = x.strip()
        # Compare after stripping so 'NA', ' NA' and ' NA ' are all treated as
        # missing instead of reaching int() and raising.
        if text in ('NA', ''):
            return np.nan
        return int(text)
    return x

# Load only the columns needed for the per-month product totals: the customer
# age, the snapshot date and every product-indicator column in `targetcols`.
y_train = pd.read_csv(
    train_file,
    usecols=['age', 'fecha_dato'] + targetcols,
    dtype=dtype_list,
    parse_dates=['fecha_dato'],
)

# Normalise `age`: numeric strings become ints, the NA marker becomes NaN.
y_train['age'] = y_train['age'].apply(string_num_age)

# Encode the snapshot date as a YYYYMM integer.  Done with the vectorised .dt
# accessor and *before* the row filter below, so the column is assigned on the
# freshly-read frame (no SettingWithCopyWarning, no per-row Python lambda).
# The explicit int64 cast keeps the dtype independent of the pandas version.
y_train['fecha_dato'] = (
    y_train['fecha_dato'].dt.year * 100 + y_train['fecha_dato'].dt.month
).astype('int64')

# Selection of rows: keep only records with a known age.
y_train = y_train.loc[y_train['age'].notna()]

# Remaining missing values (the float64 indicator columns) are treated as 0.
y_train = y_train.fillna(0)

# Shrink numeric columns to the smallest unsigned dtype that holds their values.
# pd.to_numeric returns a new object, so the result has to be assigned back;
# it only downcasts when the conversion is lossless.  This runs after fillna
# because columns containing NaN cannot be downcast to an integer dtype.
for col in y_train.select_dtypes(include=['number']).columns:
    y_train[col] = pd.to_numeric(y_train[col], downcast='unsigned')

# Per-month totals of every product indicator (index = YYYYMM).
dum = y_train.drop(columns=['age']).groupby('fecha_dato').sum()

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf

# Month labels (YYYYMM) are taken from the aggregated frame itself so the
# number of tick labels always equals the number of plotted points; a
# hard-coded list can fall out of step with the tick count, which recent
# matplotlib versions reject with a ValueError.
date = dum.index.tolist()
plt.style.use('ggplot')

# The autocorrelation plot cannot use more lags than there are observations
# minus one; 15 is the upper bound used for the full data set.
n_lags = min(15, len(dum) - 1)

for i in targetcols:
    fig, ax = plt.subplots(ncols=2, figsize=(30, 5))

    # Left panel: autocorrelation of the monthly totals for this product.
    plot_acf(dum[i], lags=n_lags, title=i + ' autocorrelation', ax=ax[0])

    # Right panel: the monthly totals themselves.  `dum` already holds the
    # per-month sums, so there is no need to re-group the full frame for every
    # column; dropping the index plots against positions 0..n-1.
    sns.lineplot(data=dum[i].reset_index(drop=True), ax=ax[1])
    ax[1].set_title(i + " Trends")
    ax[1].set_xticks(np.arange(len(date)))
    ax[1].set_xticklabels(date)

    # Save before showing, then close the figure so the 24 figures created by
    # this loop do not accumulate in memory.
    name = i + '_autocorr.png'
    fig.savefig(name)
    plt.show()
    plt.close(fig)