In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Lars, ElasticNet, Lasso, Ridge, BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


%matplotlib inline
import pygal
import math
import numpy as np
from IPython.display import display, HTML
base_html = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js""></script>
  </head>
  <body>
    <figure>
      {rendered_chart}
    </figure>
  </body>
</html>
"""

# Regresión

Pruebe los modelos de regresi ́on vistos en clase para estimar de la mejor forma posible las siguientes
variables objetivo:

- PAY AMT1: Cantidad pagada en Septiembre, 2005
- PAY AMT2: Cantidad pagada en Agosto, 2005
- PAY AMT3: Cantidad pagada en Julio, 2005

Considere que hay variables que no pueden utilizarse al ocurrir despu ́es de la variable objetivo.

In [2]:
amt1_train=pd.read_csv('./Datos/train_PAY_AMT1.csv')
amt1_val=pd.read_csv('./Datos/val_PAY_AMT1.csv')

amt2_train=pd.read_csv('./Datos/train_PAY_AMT2.csv')
amt2_val=pd.read_csv('./Datos/val_PAY_AMT2.csv')

amt3_train=pd.read_csv('./Datos/train_PAY_AMT3.csv')
amt3_val=pd.read_csv('./Datos/val_PAY_AMT3.csv')

In [3]:
amt1_val.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8990,8991,8992,8993,8994,8995,8996,8997,8998,8999
ID,19943.0,7848.0,14301.0,17291.0,13027.0,13541.0,17622.0,10860.0,7431.0,14836.0,...,19125.0,16363.0,2554.0,7128.0,26872.0,8396.0,5187.0,21221.0,12268.0,15196.0
LIMIT_BAL,360000.0,100000.0,700000.0,230000.0,80000.0,100000.0,20000.0,360000.0,210000.0,80000.0,...,150000.0,360000.0,210000.0,500000.0,290000.0,360000.0,230000.0,200000.0,200000.0,120000.0
SEX,2.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,...,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0
EDUCATION,2.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0,3.0,...,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,3.0,1.0
MARRIAGE,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,...,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0
AGE,45.0,25.0,36.0,34.0,40.0,28.0,31.0,38.0,30.0,38.0,...,32.0,33.0,26.0,40.0,45.0,29.0,39.0,41.0,36.0,26.0
PAY_0,0.0,-1.0,0.0,0.0,1.0,-1.0,-1.0,1.0,-2.0,0.0,...,0.0,1.0,0.0,-1.0,-2.0,0.0,0.0,2.0,-1.0,-2.0
PAY_2,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,0.0,...,0.0,-1.0,0.0,-1.0,-2.0,0.0,0.0,2.0,-1.0,-2.0
PAY_3,-2.0,0.0,0.0,-1.0,-1.0,-2.0,-1.0,-2.0,-2.0,0.0,...,0.0,2.0,0.0,-1.0,-2.0,0.0,0.0,2.0,-1.0,-2.0
PAY_4,-2.0,0.0,0.0,-1.0,-1.0,-2.0,-1.0,-1.0,-2.0,0.0,...,0.0,2.0,0.0,-1.0,-2.0,0.0,0.0,2.0,-1.0,-2.0


In [4]:
amt1_train.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT2', 'BILL_AMT3',
       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [5]:
def plot_hist(df,value,route=""):
    info=df[value].describe()
    r=info["max"]-info["min"]
    k=math.ceil(1+3.322*math.log10(info["count"]))
    a=int(round(r/k))
    ranges=[]
    for i in range(int(info["min"]),int(info["max"]+2),a):
        ranges.append(i)
    aux=np.histogram(df[value], bins=ranges)
    histograma=[]
    for i in range(len(aux[0])):
        histograma.append([aux[0][i],aux[1][i],aux[1][i+1]])
    hist = pygal.Histogram()
    hist.add(value,histograma)
    if route!='':
        return hist.render_to_png(route)
    display(HTML(base_html.format(rendered_chart=hist.render(is_unicode=True))))

In [6]:
def iqr(df,variable):
    desc = df[variable].describe()
    iqr_value= desc["75%"]- desc["25%"]
    bandaIzq = desc["25%"] - (3*iqr_value)
    bandaDer = desc["75%"] + (3*iqr_value)
    aux = df.copy()
    return aux[(aux[variable]>bandaIzq) & (aux[variable]<bandaDer)]

In [7]:
def percentiles(df,variable,percent):
    desc = df[variable].describe(percentiles=[0.01,0.99])
    aux = df.copy()
    return aux[(aux[variable]>=desc["1%"]) & (aux[variable]<=desc["99%"])]

In [8]:
def media(df,variable,n_sigma):
    media = df[variable].mean()
    sigma = df[variable].std()
    aux = df.copy()
    return aux[(media-n_sigma*sigma <= aux[variable])  & (aux[variable] <= media + n_sigma*sigma )]

In [9]:
comparacion = pd.DataFrame()
comparacion["nombre"]=amt1_train.columns
ls_iqr=[]
ls_mediana=[]
ls_percentiles=[]
for i in amt1_train.columns:
    ls_iqr.append(iqr(amt1_train,i).shape[0]/amt1_train.shape[0])
    ls_mediana.append(media(amt1_train,i,3).shape[0]/amt1_train.shape[0])
    ls_percentiles.append(percentiles(amt1_train,i,[0.01,0.99]).shape[0]/amt1_train.shape[0])
comparacion["iqr"]=ls_iqr
comparacion["mediana"]=ls_mediana
comparacion["percentiles"]=ls_percentiles
comparacion

Unnamed: 0,nombre,iqr,mediana,percentiles
0,ID,1.0,1.0,0.98
1,LIMIT_BAL,0.999952,0.995381,0.99281
2,SEX,1.0,1.0,1.0
3,EDUCATION,0.988381,0.988381,0.997762
4,MARRIAGE,1.0,1.0,0.99819
5,AGE,1.0,0.995381,0.988905
6,PAY_0,0.985095,0.995619,0.995619
7,PAY_2,0.983619,0.995048,0.995048
8,PAY_3,0.987619,0.995381,0.995381
9,PAY_4,0.989238,0.995048,0.995048


In [24]:
ls_disc = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3",
           "PAY_4", "PAY_5", "PAY_6"]
ls_cont = ["LIMIT_BAL", "AGE", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
           "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", 
           "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
tgt = "default.payment.next.month"
def data(df):
    bill=['BILL_AMT2', 'BILL_AMT3','BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
    pay=['PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
    aux2=df.copy()
    aux2=iqr(aux2,"BILL_AMT2")
    aux2=iqr(aux2,"BILL_AMT3")
    aux2=iqr(aux2,"BILL_AMT5")
    aux2=iqr(aux2,"BILL_AMT6")
    aux2=pd.get_dummies(aux2,columns=ls_disc)
    #aux2=percentiles(aux2,"PAY_AMT2",[0.01,0.99])
    #aux2=percentiles(aux2,"PAY_AMT3",[0.01,0.99])
    #aux2=percentiles(aux2,"PAY_AMT4",[0.01,0.99])
    
    for i in range(2,7):
        for j in range(i,7):
            if i==j:
                continue
            aux2["diff_BILL_AMT{}_{}".format(i,j)]=(aux2["BILL_AMT{}".format(i)]-aux2["BILL_AMT{}".format(j)])/aux2["LIMIT_BAL"]
            aux2["diff_PAY_AMT{}_{}".format(i,j)]=aux2["PAY_AMT{}".format(i)]-aux2["PAY_AMT{}".format(j)]
            aux2["PAY_BILL{}_{}/LIMIT_BAL".format(i,j)]=(aux2["BILL_AMT{}".format(i)]-aux2["PAY_AMT{}".format(j)])/aux2["LIMIT_BAL"]
        aux2["LIMBAL*BILL_AMT_{}".format(i)]=aux2["BILL_AMT{}".format(i)]*aux2["LIMIT_BAL"]
        aux2["BILL_AMT_{}**2".format(i)]=aux2["BILL_AMT{}".format(i)]**2
        aux2["PAY{}/LIMIT_BAL".format(i)]=aux2["PAY_AMT{}".format(i)]/aux2["LIMIT_BAL"]
        aux2[f"USE_{i}"] = aux2[f"BILL_AMT{i}"] / aux2["LIMIT_BAL"]


    #aux2["mean_pay"]=aux2[pay].mean(axis=1)
    #aux2["mean_bill"]=aux2[bill].mean(axis=1)
    payarr=['PAY_AMT2']
    for i in range(3,7):
        payarr.append("PAY_AMT{}".format(i))
        aux2["maxpay_2_{}".format(i)]=aux2[payarr].max(axis=1)
        aux2["minpay_2_{}".format(i)]=aux2[payarr].min(axis=1)
        #aux2["meanpay_2_{}".format(i)]=aux2[payarr].mean(axis=1)
        aux2["sumpay_2_{}".format(i)]=aux2[payarr].sum(axis=1)

    billarr=['BILL_AMT2']
    for i in range(3,7):
        billarr.append("BILL_AMT{}".format(i))    
        aux2["maxbill_2_{}".format(i)]=aux2[billarr].max(axis=1)
        aux2["minbill_2_{}".format(i)]=aux2[billarr].min(axis=1)
        aux2["sumbill_2_{}".format(i)]=aux2[billarr].sum(axis=1)
        aux2["sumbill_2_{}".format(i)]=aux2[billarr].sum(axis=1)/aux2["LIMIT_BAL"]
        #aux2["meanbill_2_{}".format(i)]=aux2[billarr].mean(axis=1)
        #aux2["%meanbill_2_{}".format(i)]=(aux2[billarr].mean(axis=1))/aux2["LIMIT_BAL"]


    #aux2["mean_max_min_pay"]=(aux2["max_pay"]+aux2["min_pay"])/2
    #aux2["mean_max_min_bill"]=(aux2["max_bill"]+aux2["min_bill"])/2

    #aux2["mean_max_min_pay**2"]=aux2["mean_max_min_pay"]**2
    #aux2["mean_max_min_bill**2"]=aux2["mean_max_min_bill"]**2
    return aux2

In [25]:
aux2=data(amt1_train)
val=data(amt1_val)

In [26]:
aux2.shape[0]/amt1_train.shape[0]

0.940047619047619

In [27]:
tgt = "PAY_AMT1"
ls_pred = [x for x in aux2.columns if x not in [tgt]]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(aux2[ls_pred], aux2[tgt], test_size=0.33, random_state=42)

In [29]:
reg_grid = {
    'copy_X':[True,False],
    'normalize':[True,False]
}

In [30]:
linReg = LinearRegression()
clf = GridSearchCV(linReg, reg_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2")
clf.fit(X_train, y_train)
print("Best score: " + str(clf.best_score_))



Best score: 0.32949917169039544


In [145]:
rid_grid = {
    'alpha': [.01,.02,.05,.08,.09,.003 ,.004 ,.05, .3, .5, .8 ,.08 ,.09],
    'copy_X':[True,False],
    'normalize':[True,False],
    'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [None]:
ridgereg = Ridge()
clf = GridSearchCV(ridgereg, rid_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2")
clf.fit(X_train, y_train)
print("Best score: " + str(clf.best_score_))

In [None]:
las_reg = {
    'alpha': [.01,.02,.003 ,.004 ,.05, .3, .5, .8 ,.08 ,.09],
    'precompute': [True,False],
    'selection': ["random","cyclic"],
    'positive': [True,False],
    'warm_start': [True,False]
}

In [66]:
Lasso?

In [68]:
lassreg = Lasso()
clf = GridSearchCV(lassreg, las_reg, cv=4, error_score=-1000, n_jobs=-1, scoring="r2")
clf.fit(X_train, y_train)
print("Best score: " + str(clf.best_score_))

Best score: 0.3237529603180102


  tol, rng, random, positive)


In [69]:
elasnet = ElasticNet()
elasnet.fit(X_train, y_train)

  positive)


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [278]:
linReg = LinearRegression()
ridgereg = Ridge()
lassreg = Lasso()
elasnet = ElasticNet()

In [206]:
linReg.fit(X_train,y_train)
ridgereg.fit(X_train, y_train)
lassreg.fit(X_train, y_train)
elasnet.fit(X_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [207]:
reg = cross_val_score(estimator = linReg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
rid = cross_val_score(estimator = ridgereg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
las = cross_val_score(estimator = lassreg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
net = cross_val_score(estimator = elasnet, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")

In [208]:
print("reg",reg.mean())
print("rig",rid.mean())
print("las",las.mean())
print("net",net.mean())

reg 0.20845173546125212
rig 0.2084521996590036
las 0.20846375146757137
net 0.20470234305856147


In [209]:
print("reg",linReg.score(X_train,y_train))
print("rid",ridgereg.score(X_train,y_train))
print("las",lassreg.score(X_train,y_train))
print("net",elasnet.score(X_train,y_train))

reg 0.2513978299783338
rid 0.251397829412753
las 0.2513977678354078
net 0.24696927696885096


In [151]:
for i in val.columns:
    print(i)

ID
LIMIT_BAL
AGE
BILL_AMT2
BILL_AMT3
BILL_AMT4
BILL_AMT5
BILL_AMT6
PAY_AMT2
PAY_AMT3
PAY_AMT4
PAY_AMT5
PAY_AMT6
SEX_1
SEX_2
EDUCATION_0
EDUCATION_1
EDUCATION_2
EDUCATION_3
EDUCATION_4
EDUCATION_5
EDUCATION_6
MARRIAGE_0
MARRIAGE_1
MARRIAGE_2
MARRIAGE_3
PAY_0_-2
PAY_0_-1
PAY_0_0
PAY_0_1
PAY_0_2
PAY_0_3
PAY_0_4
PAY_0_5
PAY_0_6
PAY_0_7
PAY_0_8
PAY_2_-2
PAY_2_-1
PAY_2_0
PAY_2_1
PAY_2_2
PAY_2_3
PAY_2_4
PAY_2_5
PAY_2_6
PAY_2_7
PAY_3_-2
PAY_3_-1
PAY_3_0
PAY_3_1
PAY_3_2
PAY_3_3
PAY_3_4
PAY_3_5
PAY_3_6
PAY_3_7
PAY_4_-2
PAY_4_-1
PAY_4_0
PAY_4_1
PAY_4_2
PAY_4_3
PAY_4_4
PAY_4_5
PAY_4_6
PAY_4_7
PAY_4_8
PAY_5_-2
PAY_5_-1
PAY_5_0
PAY_5_2
PAY_5_3
PAY_5_4
PAY_5_5
PAY_5_6
PAY_5_7
PAY_6_-2
PAY_6_-1
PAY_6_0
PAY_6_2
PAY_6_3
PAY_6_4
PAY_6_5
PAY_6_6
PAY_6_7
diff_BILL_AMT2_3
diff_PAY_AMT2_3
diff_PAY_BILL2_3
PAY_BILL2_3/LIMIT_BAL
PAY_BILL2_3/LIMIT_BAL**2
diff_BILL_AMT2_4
diff_PAY_AMT2_4
diff_PAY_BILL2_4
PAY_BILL2_4/LIMIT_BAL
PAY_BILL2_4/LIMIT_BAL**2
diff_BILL_AMT2_5
diff_PAY_AMT2_5
diff_PAY_BILL2_5
PAY_BILL2_5/

In [138]:
ridgereg.predict(X=val[ls_pred]).shape

KeyError: "['PAY_5_8', 'PAY_6_8', 'PAY_3_8', 'PAY_2_8'] not in index"

In [133]:
val.shape

(8215, 79)

In [68]:
amt2_train=amt2_train.drop(columns=["PAY_AMT1"])

In [88]:
bill=['BILL_AMT3','BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
pay=['PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
aux2=iqr(amt2_train,"BILL_AMT3")
aux2=iqr(aux2,"BILL_AMT5")
aux2=iqr(aux2,"BILL_AMT6")

aux2=percentiles(aux2,"PAY_AMT2",[0.01,0.99])
aux2=percentiles(aux2,"PAY_AMT3",[0.01,0.99])
aux2=percentiles(aux2,"PAY_AMT4",[0.01,0.99])
#aux2=iqr(aux2,"BILL_AMT3")
#aux2=iqr(aux2,"BILL_AMT5")
#aux2=iqr(aux2,"BILL_AMT6")
for i in range(3,7):
    for j in range(i,7):
        if i==j:
            continue
        aux2["diff_BILL_AMT{}_{}".format(i,j)]=aux2["BILL_AMT{}".format(i)]-aux2["BILL_AMT{}".format(j)]
        aux2["diff_PAY_AMT{}_{}".format(i,j)]=aux2["PAY_AMT{}".format(i)]-aux2["PAY_AMT{}".format(j)]
        aux2["diff_PAY_BILL{}_{}".format(i,j)]=aux2["BILL_AMT{}".format(i)]-aux2["PAY_AMT{}".format(j)]
    aux2["LIMBAL*BILL_AMT_{}".format(i)]=aux2["BILL_AMT{}".format(i)]*aux2["LIMIT_BAL"]
    aux2["BILL_AMT_{}**2".format(i)]=aux2["BILL_AMT{}".format(i)]**2
    aux2[f"USE_{i}"] = aux2[f"BILL_AMT{i}"] / aux2["LIMIT_BAL"]


#aux2["mean_pay"]=aux2[pay].mean(axis=1)
#aux2["mean_bill"]=aux2[bill].mean(axis=1)
payarr=['PAY_AMT2']
for i in range(4,7):
    payarr.append("PAY_AMT{}".format(i))
    aux2["maxpay_3_{}".format(i)]=aux2[payarr].max(axis=1)
    aux2["minpay_3_{}".format(i)]=aux2[payarr].min(axis=1)
    aux2["meanpay_3_{}".format(i)]=aux2[payarr].mean(axis=1)
    
#billarr=['BILL_AMT2']
#for i in range(3,7):
#    payarr.append("BILL_AMT{}".format(i))    
#    aux2["maxbill_2_{}".format(i)]=aux2[billarr].max(axis=1)
#    aux2["minbill_2_{}".format(i)]=aux2[billarr].min(axis=1)


#aux2["mean_max_min_pay"]=(aux2["max_pay"]+aux2["min_pay"])/2
#aux2["mean_max_min_bill"]=(aux2["max_bill"]+aux2["min_bill"])/2

#aux2["mean_max_min_pay**2"]=aux2["mean_max_min_pay"]**2
#aux2["mean_max_min_bill**2"]=aux2["mean_max_min_bill"]**2

In [89]:
aux2.shape[0]/amt1_train.shape[0]

0.9190952380952381

In [93]:
tgt = "PAY_AMT2"
ls_pred = [x for x in aux2.columns if x not in [tgt]]

In [94]:
X_train, X_test, y_train, y_test = train_test_split(aux2[ls_pred], aux2[tgt], test_size=0.33, random_state=42)

In [95]:
linReg = LinearRegression()
ridgereg = Ridge()
lassreg = Lasso()
elasnet = ElasticNet()

In [96]:
linReg.fit(X_train,y_train)
ridgereg.fit(X_train, y_train)
lassreg.fit(X_train, y_train)
elasnet.fit(X_train, y_train)

  overwrite_a=True).T


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [97]:
reg = cross_val_score(estimator = linReg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
rid = cross_val_score(estimator = ridgereg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
las = cross_val_score(estimator = lassreg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
net = cross_val_score(estimator = elasnet, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")

In [98]:
print("reg",reg.mean())
print("rig",rid.mean())
print("las",las.mean())
print("net",net.mean())

reg 1.0
rig 1.0
las 0.9999979926449807
net 0.9999980467823524


In [99]:
print("reg",linReg.score(X_train,y_train))
print("rid",ridgereg.score(X_train,y_train))
print("las",lassreg.score(X_train,y_train))
print("net",elasnet.score(X_train,y_train))

reg 1.0
rid 1.0
las 0.9999980716813123
net 0.9999981024081912


In [100]:
amt3_train.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20990,20991,20992,20993,20994,20995,20996,20997,20998,20999
ID,27942.0,4342.0,17333.0,27601.0,23311.0,24791.0,12205.0,27136.0,29742.0,3407.0,...,14721.0,4989.0,19051.0,404.0,29246.0,5907.0,24224.0,21561.0,20874.0,25408.0
LIMIT_BAL,200000.0,50000.0,50000.0,100000.0,20000.0,320000.0,500000.0,500000.0,100000.0,170000.0,...,360000.0,130000.0,20000.0,30000.0,50000.0,50000.0,580000.0,50000.0,200000.0,150000.0
SEX,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,...,1.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0
EDUCATION,2.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,...,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,2.0
MARRIAGE,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,...,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0
AGE,27.0,32.0,29.0,28.0,46.0,35.0,43.0,47.0,32.0,26.0,...,29.0,32.0,30.0,48.0,25.0,46.0,32.0,23.0,33.0,24.0
PAY_0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,...,-1.0,1.0,3.0,0.0,0.0,0.0,-1.0,1.0,0.0,1.0
PAY_2,0.0,0.0,0.0,0.0,2.0,0.0,-1.0,-1.0,0.0,0.0,...,2.0,2.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
PAY_3,-1.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
PAY_4,-1.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,-1.0


In [102]:
amt3_train=amt3_train.drop(columns=["PAY_AMT1","PAY_AMT2"])

In [103]:
bill=['BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
pay=['PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
aux2=iqr(amt2_train,"BILL_AMT5")
aux2=iqr(aux2,"BILL_AMT6")

aux2=percentiles(aux2,"PAY_AMT2",[0.01,0.99])
aux2=percentiles(aux2,"PAY_AMT3",[0.01,0.99])
aux2=percentiles(aux2,"PAY_AMT4",[0.01,0.99])
#aux2=iqr(aux2,"BILL_AMT3")
#aux2=iqr(aux2,"BILL_AMT5")
#aux2=iqr(aux2,"BILL_AMT6")
for i in range(3,7):
    for j in range(i,7):
        if i==j:
            continue
        aux2["diff_BILL_AMT{}_{}".format(i,j)]=aux2["BILL_AMT{}".format(i)]-aux2["BILL_AMT{}".format(j)]
        aux2["diff_PAY_AMT{}_{}".format(i,j)]=aux2["PAY_AMT{}".format(i)]-aux2["PAY_AMT{}".format(j)]
        aux2["diff_PAY_BILL{}_{}".format(i,j)]=aux2["BILL_AMT{}".format(i)]-aux2["PAY_AMT{}".format(j)]
    aux2["LIMBAL*BILL_AMT_{}".format(i)]=aux2["BILL_AMT{}".format(i)]*aux2["LIMIT_BAL"]
    aux2["BILL_AMT_{}**2".format(i)]=aux2["BILL_AMT{}".format(i)]**2
    aux2[f"USE_{i}"] = aux2[f"BILL_AMT{i}"] / aux2["LIMIT_BAL"]


#aux2["mean_pay"]=aux2[pay].mean(axis=1)
#aux2["mean_bill"]=aux2[bill].mean(axis=1)
payarr=['PAY_AMT2']
for i in range(4,7):
    payarr.append("PAY_AMT{}".format(i))
    aux2["maxpay_3_{}".format(i)]=aux2[payarr].max(axis=1)
    aux2["minpay_3_{}".format(i)]=aux2[payarr].min(axis=1)
    aux2["meanpay_3_{}".format(i)]=aux2[payarr].mean(axis=1)
    
#billarr=['BILL_AMT2']
#for i in range(3,7):
#    payarr.append("BILL_AMT{}".format(i))    
#    aux2["maxbill_2_{}".format(i)]=aux2[billarr].max(axis=1)
#    aux2["minbill_2_{}".format(i)]=aux2[billarr].min(axis=1)


#aux2["mean_max_min_pay"]=(aux2["max_pay"]+aux2["min_pay"])/2
#aux2["mean_max_min_bill"]=(aux2["max_bill"]+aux2["min_bill"])/2

#aux2["mean_max_min_pay**2"]=aux2["mean_max_min_pay"]**2
#aux2["mean_max_min_bill**2"]=aux2["mean_max_min_bill"]**2

In [104]:
aux2.shape[0]/amt1_train.shape[0]

0.9283809523809524

In [105]:
tgt = "PAY_AMT2"
ls_pred = [x for x in aux2.columns if x not in [tgt]]

In [106]:
X_train, X_test, y_train, y_test = train_test_split(aux2[ls_pred], aux2[tgt], test_size=0.33, random_state=42)

In [107]:
linReg = LinearRegression()
ridgereg = Ridge()
lassreg = Lasso()
elasnet = ElasticNet()

In [108]:
linReg.fit(X_train,y_train)
ridgereg.fit(X_train, y_train)
lassreg.fit(X_train, y_train)
elasnet.fit(X_train, y_train)

  overwrite_a=True).T


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [109]:
reg = cross_val_score(estimator = linReg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
rid = cross_val_score(estimator = ridgereg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
las = cross_val_score(estimator = lassreg, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")
net = cross_val_score(estimator = elasnet, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="r2")

In [110]:
print("reg",reg.mean())
print("rig",rid.mean())
print("las",las.mean())
print("net",net.mean())

reg 1.0
rig 1.0
las 0.9999986035967797
net 0.9999986350426153


In [111]:
print("reg",linReg.score(X_train,y_train))
print("rid",ridgereg.score(X_train,y_train))
print("las",lassreg.score(X_train,y_train))
print("net",elasnet.score(X_train,y_train))

reg 1.0
rid 1.0
las 0.9999987818390627
net 0.9999987837296326
