In [80]:
import pandas as pd
import numpy as np

from importlib.machinery import SourceFileLoader
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Load the project's helper module directly from its source file.
# NOTE(review): Loader.load_module() is documented as deprecated in favour of
# exec_module(); it is left as-is here because swapping it changes how the
# module is registered - revisit when upgrading Python.
DI = SourceFileLoader("DI", "src/DI.py").load_module()

# One instance of each helper class exposed by DI. Each name deliberately
# mirrors its class name and is what the cells below call.
Handler = DI.Handler.Handler()
Graph = DI.Graph.Graph()
AlgoML = DI.AlgoML.AlgoML()
Refine = DI.Refine.Refine()
Metrics = DI.Metrics.Metrics()

# Project folder, relative to the current user's home directory instead of a
# hardcoded /home/agl/... path. The export cell at the end of the notebook
# already writes under "~/codenation/...", so inputs and output now share the
# same root and the notebook no longer depends on one specific user account.
# pandas expands a leading "~" in file paths.
DATA_DIR = "~/codenation/20200530_DS_codenationModule8"

# Raw competition files; later cells select columns from these frames without
# modifying them.
_data_train = pd.read_csv(DATA_DIR + "/1.Original/train.csv")
_data_test = pd.read_csv(DATA_DIR + "/1.Original/test.csv")

In [81]:
# Predictor columns, in the exact order they are selected from the raw frames.
# The list is assembled from small groups purely for readability; the
# resulting order is significant and must not be changed.
features = (
    # individual exam scores
    ['NU_NOTA_REDACAO', 'NU_NOTA_CN', 'NU_NOTA_LC', 'NU_NOTA_CH']
    # essay competence scores (first group)
    + ['NU_NOTA_COMP3', 'NU_NOTA_COMP1', 'NU_NOTA_COMP5']
    # essay status flag
    + ['TP_STATUS_REDACAO']
    # essay competence scores (second group)
    + ['NU_NOTA_COMP4', 'NU_NOTA_COMP2']
    # Added by hand after the analysis: it gave slightly better metric results.
    + ['TP_LINGUA']
)

# Column the model is trained to predict.
target = "NU_NOTA_MT"

In [82]:
# Keep the model features plus the registration id, then discard every row
# that has at least one missing value.
test = _data_test[[*features, "NU_INSCRICAO"]].dropna(axis=0, how="any")

# Registration ids of the rows that survived the cleaning; used later to
# attach predictions to the right candidates.
inscribeds_column_cleaned = test["NU_INSCRICAO"]

# The id is not a predictor, so it is removed before the frame reaches the model.
test = test.drop("NU_INSCRICAO", axis=1)

# Registration ids of every row in the original test file, including the rows
# dropped above; the final answer must contain all of them.
inscribeds_column_all = _data_test["NU_INSCRICAO"]

In [83]:
# Complete training rows only: features plus target, with any row containing a
# missing value removed.
train = _data_train[[*features, target]].dropna(axis=0, how="any")

# Separate the dependent variable from the predictors.
train_target = train[target]
train = train.drop(target, axis=1)

# Split into train / hold-out parts. The result is indexed below with the keys
# "x_train", "x_test" and "y_test".
# NOTE(review): 0.517 is presumably the share of rows held out for testing -
# confirm against Handler.create_test_from_train.
obj_train = Handler.create_test_from_train(train, train_target, 0.517)

In [84]:
# Build the scaler from the training predictors only.
# NOTE(review): Refine.standardize presumably returns an already-fitted
# scaler - confirm in DI.Refine.
scaler = Refine.standardize(obj_train["x_train"])

# Apply the same transformation to both predictor splits.
# Careful: this cell overwrites its own inputs (obj_train entries and `test`),
# so running it a second time without re-running the cells above would
# transform already-transformed data.
for split_key in ("x_train", "x_test"):
    obj_train[split_key] = scaler.transform(obj_train[split_key])

# The official test predictors go through the very same scaler.
test = scaler.transform(test)

In [85]:
# Random-forest hyper-parameters chosen during earlier experiments.
# min_samples_leaf is intentionally not overridden (an override to 2 was
# commented out in the original cell), so it keeps its existing value.
AlgoML.hp_randomForestRegressor.update(
    max_depth=8,
    max_features="sqrt",
    n_estimators=2000,
    bootstrap=True,
)

# Display the full configuration that will be used.
AlgoML.hp_randomForestRegressor

{'criterion': 'mse',
 'max_depth': 8,
 'max_leaf_nodes': None,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 2000,
 'n_jobs': -1,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False,
 'max_features': 'sqrt',
 'bootstrap': True}

In [86]:
# Build the regressor from the prepared splits.
# NOTE(review): presumably this fits a random forest with the hyper-parameters
# configured above - confirm in DI.AlgoML.
ml = AlgoML.randomForestRegressor(obj_data=obj_train)

# Score the model on the hold-out part of the training data; the metrics are
# displayed as the cell output.
holdout_pred = ml.predict(obj_train["x_test"])
Metrics.calculate_regression(obj_train["y_test"], holdout_pred)

{'mean_abs_err': 58.939470960780454,
 'mean_sqr_err': 5611.779419368704,
 'r_mean_sqr_err': 74.91181094706431}

In [87]:
# Predict the target for the official test rows. `test` here is the cleaned
# and scaler-transformed feature matrix, so there is one prediction per row
# that survived the earlier dropna().
y_test_pred = ml.predict(test)

In [88]:
# Pair each prediction (rounded to one decimal) with the registration id of the
# cleaned row it was computed from. The assignment is positional, which is
# valid because y_test_pred has one entry per cleaned row, in the same order.
column_predicted = pd.DataFrame(inscribeds_column_cleaned.copy())
column_predicted["NU_NOTA_MT"] = np.around(y_test_pred, 1)

# Join the predictions back onto the complete id list. Ids whose rows were
# dropped for missing features get NaN in NU_NOTA_MT.
#
# The previous call combined on="NU_INSCRICAO" with right_index=True. Mixing
# `on` with an index flag is rejected by current pandas (MergeError), and the
# index flag was never needed because both frames carry the id as a column.
# Every predicted id originates from the same test file as the full id list,
# so a left join keeps exactly the rows the outer join produced, in the
# original file order.
answer = pd.merge(
    pd.DataFrame(inscribeds_column_all),
    column_predicted,
    on="NU_INSCRICAO",
    how="left",
)

answer.head(6)

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,432.7
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,452.1
2,b38a03232f43b11c9d0788abaf060f7366053b6d,579.1
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,
4,715494628a50142ce8cb17191cfe6d0f3cae0934,509.6
5,e656d6bad65c93fb2880f1eba5037008c8e75774,465.4


In [89]:
# Sanity check: the number of missing predictions in the answer must equal the
# number of test rows that were dropped during cleaning.
expected_missing = len(inscribeds_column_all) - len(inscribeds_column_cleaned)

answer["NU_NOTA_MT"].isna().sum() == expected_missing

True

In [90]:
# Write the final answer (id + predicted score) without the DataFrame index.
submission_path = "~/codenation/20200530_DS_codenationModule8/3.Deployed/codenationModule8.csv"
answer.to_csv(submission_path, index=False)