#### Library imports

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from tpot import TPOTClassifier

from pathlib import Path
from sklearn.model_selection import train_test_split



In [2]:
# Show full cell contents (no column-width truncation) when rendering DataFrames.
pd.options.display.max_colwidth = None

#### Configuration

In [3]:
# Directory (relative to the working directory) that holds the raw CSV inputs.
raw_data_dir: Path = Path("data/raw/")

#### Load data

In [4]:
# Read each raw CSV from the configured data directory.
train_valid = pd.read_csv(raw_data_dir / "train.csv")
test = pd.read_csv(raw_data_dir / "test.csv")
metadata = pd.read_csv(raw_data_dir / "metadata.csv")
submission = pd.read_csv(raw_data_dir / "submission_sample.csv")

In [5]:
# Column roles: label column, row identifier, and the feature columns.
target = "y"
ids = "id"
# Feature columns are named var1 .. var68 (contiguous), so generate them.
variables = [f"var{number}" for number in range(1, 69)]

#### Variable types

In [6]:
# Give the metadata columns short English names.
meta = metadata.rename(columns={"Variavel cod": "variable", "Variavel tipo": "type"})

# One row per variable type; the `variable` column is then overwritten with the
# list of variable codes that belong to that type.
meta_table = (
    meta
    .groupby(by="type", as_index=False)
    .count()
    .assign(
        variable=lambda grouped: [
            meta.loc[meta["type"] == kind, "variable"].tolist()
            for kind in grouped["type"]
        ]
    )
)
meta_table

Unnamed: 0,type,variable
0,Qualitativo nominal,"[id, var1, var2, var3, var4, var5, var6, var7, var8, var9, var10, var11, var12, var13, var14, var15, var16, var17, var18, var19, var20, var21, var22, var23, var28, var29, var30, var31, var33, var34, var35, var36, var37, var38, var39, var41]"
1,Qualitativo ordinal,"[var26, var32, var42, var43]"
2,Quantitativo continua,"[var55, var56, var57, var58, var59, var60, var61, var62, var63, var64, var65, var66]"
3,Quantitativo discreto,"[var24, var25, var27, var40, var44, var45, var46, var47, var48, var49, var50, var51, var52, var53, var54, var67, var68, y]"


#### Train and Validation Split

In [7]:
# Hold out 25% of the labelled rows for validation; the seed is fixed so the
# split is repeatable. The label is kept as a one-column DataFrame.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_valid.loc[:, variables],
    train_valid.loc[:, [target]],
    test_size=0.25,
    random_state=4233,
)

In [8]:
# Re-attach the label column (first) to its features for each partition.
train = pd.concat((y_train, X_train), axis="columns")
valid = pd.concat((y_valid, X_valid), axis="columns")

In [None]:
# Share of each target class in the training split, the validation split and
# the full labelled set, to check that the split kept the class balance.
#
# value_counts(normalize=True) replaces the earlier groupby/count divided by
# int(<Series>): calling int() on a one-element Series is deprecated in pandas,
# and that division also scaled the `y` key column itself.
# sort_index() orders the rows by class label, and because each Series is
# indexed by the class label, concat aligns the columns on the class instead
# of on row position.
perc_train = y_train[target].value_counts(normalize=True).sort_index()
perc_valid = y_valid[target].value_counts(normalize=True).sort_index()
perc_total = train_valid[target].value_counts(normalize=True).sort_index()

pd.concat(
    [
        perc_train.rename("train"),
        perc_valid.rename("valid"),
        perc_total.rename("total"),
    ],
    axis=1,
)

#### Data Exploration

In [None]:
# List of variable codes that the metadata table marks as ordinal.
vars_ordinal = meta_table.loc[
    meta_table["type"] == "Qualitativo ordinal", "variable"
].iloc[0]
vars_ordinal

In [None]:
# Keep the target plus the ordinal columns, then reshape to long format:
# one row per (observation, ordinal variable) with columns y / variable / value.
train_ordinal = train.loc[:, ["y", *vars_ordinal]]
melt_orginal = train_ordinal.melt(id_vars=["y"])
melt_orginal.head()

In [None]:
# Number of rows for every (variable, value, y) combination. After
# reset_index() the only non-key column is `index`, so its count is the
# row count of each group.
count_ordinal = (
    melt_orginal
    .reset_index()
    .groupby(by=["variable", "value", "y"], as_index=False)
    .count()
)

In [None]:
# Percentage of each target class within every (variable, value) level.
#
# The denominator comes from transform("sum"), which returns totals aligned
# row-for-row with the counts. The earlier groupby(by=..., level=0).apply(...)
# form mixed `by` with `level`, called float() on a Series (deprecated), and on
# current pandas prepends the group keys to the index so the following
# reset_index() fails on duplicated level names.
ordinal_counts = count_ordinal.groupby(by=["variable", "value", "y"]).agg({"index": "sum"})
level_totals = ordinal_counts.groupby(level=["variable", "value"]).transform("sum")

# Result columns: variable, value, y, index (`index` now holds the percentage).
perct_ordinal = (100 * ordinal_counts / level_totals).reset_index()

perct_ordinal.head()

In [None]:
# Two stacked bar charts for one ordinal variable: counts on top, percentages below.
var = "var26"
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))

# Top panel: number of training rows per level of `var`, split by target class.
ax1 = sns.barplot(ax=axs[0], x="value", y="index", hue="y",
                  data=count_ordinal[count_ordinal["variable"] == var])
ax1.set(title=f"{var}: rows per level, by y", xlabel=var, ylabel="count")

# Bottom panel: the same breakdown using the percentage table. The plotted
# column is still called "index" in the data, so the axis is relabelled.
ax2 = sns.barplot(ax=axs[1], x="value", y="index", hue="y",
                  data=perct_ordinal[perct_ordinal["variable"] == var])
ax2.set(title=f"{var}: percentage per level, by y", xlabel=var, ylabel="percent (%)")

fig.tight_layout()

In [None]:
# Two stacked bar charts for one ordinal variable: counts on top, percentages below.
var = "var32"
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))

# Top panel: number of training rows per level of `var`, split by target class.
ax1 = sns.barplot(ax=axs[0], x="value", y="index", hue="y",
                  data=count_ordinal[count_ordinal["variable"] == var])
ax1.set(title=f"{var}: rows per level, by y", xlabel=var, ylabel="count")

# Bottom panel: the same breakdown using the percentage table. The plotted
# column is still called "index" in the data, so the axis is relabelled.
ax2 = sns.barplot(ax=axs[1], x="value", y="index", hue="y",
                  data=perct_ordinal[perct_ordinal["variable"] == var])
ax2.set(title=f"{var}: percentage per level, by y", xlabel=var, ylabel="percent (%)")

fig.tight_layout()

#### Variable 42 (`var42`)

In [None]:
# Two stacked bar charts for one ordinal variable: counts on top, percentages below.
var = "var42"
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))

# Top panel: number of training rows per level of `var`, split by target class.
ax1 = sns.barplot(ax=axs[0], x="value", y="index", hue="y",
                  data=count_ordinal[count_ordinal["variable"] == var])
ax1.set(title=f"{var}: rows per level, by y", xlabel=var, ylabel="count")

# Bottom panel: the same breakdown using the percentage table. The plotted
# column is still called "index" in the data, so the axis is relabelled.
ax2 = sns.barplot(ax=axs[1], x="value", y="index", hue="y",
                  data=perct_ordinal[perct_ordinal["variable"] == var])
ax2.set(title=f"{var}: percentage per level, by y", xlabel=var, ylabel="percent (%)")

fig.tight_layout()

In [None]:
# Two stacked bar charts for one ordinal variable: counts on top, percentages below.
var = "var43"
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))

# Top panel: number of training rows per level of `var`, split by target class.
ax1 = sns.barplot(ax=axs[0], x="value", y="index", hue="y",
                  data=count_ordinal[count_ordinal["variable"] == var])
ax1.set(title=f"{var}: rows per level, by y", xlabel=var, ylabel="count")

# Bottom panel: the same breakdown using the percentage table. The plotted
# column is still called "index" in the data, so the axis is relabelled.
ax2 = sns.barplot(ax=axs[1], x="value", y="index", hue="y",
                  data=perct_ordinal[perct_ordinal["variable"] == var])
ax2.set(title=f"{var}: percentage per level, by y", xlabel=var, ylabel="percent (%)")

fig.tight_layout()

#### Train Model

In [None]:
# Feature matrix and label vector taken from the training partition.
x, y = train[variables], train[target]

In [None]:
# Wrap the training features and labels in LightGBM's dataset container.
train_data = lgb.Dataset(data=x, label=y)

In [None]:
# LightGBM training parameters.
# 'f1' is not one of LightGBM's built-in metric names, so the metric is set to
# the binary-objective metric 'binary_logloss' instead. F1 has to be computed
# separately on thresholded predictions (or supplied through a custom `feval`).
param = {
    'objective': 'binary',
    'metric': 'binary_logloss',
}

In [None]:
# Fit the booster on the training dataset with the parameters defined above
# (no validation set is passed, and the number of rounds is left at the library default).
bst = lgb.train(params=param, train_set=train_data)

In [None]:
# Score the test features and round each score to the nearest integer.
# The objective is 'binary', so the scores are presumably probabilities and
# this acts as a 0.5 threshold — confirm.
a = np.rint(
    bst.predict(test.loc[:, variables])
).tolist()

In [None]:
# Store the rounded predictions as plain ints in the submission frame.
submission["predicted"] = [int(label) for label in a]

In [None]:
# Display the submission frame after the `predicted` column has been filled in.
submission

In [None]:
# Write the submission without the DataFrame index. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
submission.to_csv("secondsubmission.csv", index=False)

#### TPOT

In [30]:
from sklearn.metrics import f1_score, make_scorer

In [31]:
# Wrap f1_score as a scorer object so it can be handed to the `scoring` argument below.
scorer = make_scorer(score_func=f1_score)

In [9]:
# Pipeline search with TPOT, scored with the F1 scorer and a fixed seed.
# NOTE(review): the recorded output below this cell shows a 0/30 progress bar
# and TPOTClassifier(generations=5, population_size=5, ...) with no `scoring`,
# so it appears to come from a different configuration than this code.
# Re-run to confirm, and consider setting generations/population_size
# explicitly, since they are left at the library defaults here.
tpot = TPOTClassifier(
    scoring=scorer,
    random_state=42,
    verbosity=3,
)
tpot.fit(X_train, y_train.loc[:, "y"])

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]

_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False.

Generation 1 - Current Pareto front scores:

-1	0.8664081574994172	XGBClassifier(input_matrix, XGBClassifier__learning_rate=0.001, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=7, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.45, XGBClassifier__verbosity=0)

-2	0.8700898966631773	RandomForestClassifier(MinMaxScaler(input_matrix), RandomForestClassifier__bootstrap=True, RandomForestClassifier__criterion=gini, RandomForestClassifier__max_features=0.2, RandomForestClassifier__min_samples_leaf=8, RandomForestClassifier__min_samples_split=4, RandomForestClassifier__n_estimators=100)

Generation 2 - Current Pareto front scores:

-1	0.8699012175019754	RandomForestClassifier(input_matrix, RandomForestClassifier__bootstrap=True, RandomFo

TPOTClassifier(generations=5, population_size=5, random_state=42, verbosity=3)

In [11]:
# Score on the training split. The label is passed as a 1-D Series, matching
# the fit call. y_train itself is a one-column DataFrame, and passing that is
# the likely source of the warning fragment in the recorded output.
print(tpot.score(X_train, y_train["y"]))

0.9486404833836858


  return f(*args, **kwargs)


In [10]:
# Score on the held-out validation split. The label is passed as a 1-D Series,
# matching the fit call. y_valid itself is a one-column DataFrame, and passing
# that is the likely source of the warning fragment in the recorded output.
print(tpot.score(X_valid, y_valid["y"]))

0.8660436137071651


  return f(*args, **kwargs)


In [14]:
# Sanity check: (rows, feature columns) of the validation feature matrix.
X_valid.shape

(3531, 68)

In [19]:
# Display the submission frame as it stands at this point. The TPOT predictions
# are only written into `predicted` in a later cell, so in file order this
# still shows the values assigned earlier.
submission

Unnamed: 0,id,predicted
0,0,1
1,2,1
2,4,1
3,7,0
4,15,1
...,...,...
21178,35297,0
21179,35298,0
21180,35300,0
21181,35302,1


In [21]:
# TPOTClassifier.predict already returns class labels, so the np.rint call
# used for the LightGBM scores is dropped here as a no-op.
# The next cell still casts every element to int.
a = tpot.predict(test[variables]).tolist()

In [22]:
# Store the TPOT predictions as plain ints in the submission frame.
submission["predicted"] = [int(label) for label in a]

In [29]:
# Write the submission without the DataFrame index. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
submission.to_csv("thirdsubmission.csv", index=False)