In [23]:
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Load the project's helper module directly from its source file.
# Loader.load_module() is deprecated, so the spec / exec_module recipe is
# used instead.
_di_spec = importlib.util.spec_from_file_location("DI", "src/DI.py")
DI = importlib.util.module_from_spec(_di_spec)
# Register the module before running its body, as load_module() did, so it
# can be looked up in sys.modules while it executes.
sys.modules["DI"] = DI
_di_spec.loader.exec_module(DI)

# One instance of each helper class exposed by the DI module.
Handler = DI.Handler.Handler()
Graph = DI.Graph.Graph()
AlgoML = DI.AlgoML.AlgoML()
Refine = DI.Refine.Refine()
Metrics = DI.Metrics.Metrics()

# NOTE(review): machine-specific absolute location; change DATA_DIR to run
# the notebook on another machine.
DATA_DIR = "/home/agl/codenation/20200606_DS_codenationModule9/1.Original"

# Raw, untouched train and test tables.
_data_train = pd.read_csv(f"{DATA_DIR}/train.csv")
_data_test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [18]:
# Predictor columns selected from both the train and the test frame.
features = ["NU_IDADE", "TP_ST_CONCLUSAO", "TP_ANO_CONCLUIU", "TP_ESCOLA"]

# Name of the label column taken from the training frame.
target = "IN_TREINEIRO"

In [19]:
# Keep the NU_INSCRICAO value of every row in the raw test file, including
# rows that have missing features.
inscribeds_column_all = _data_test["NU_INSCRICAO"]

# Select the predictors together with the identifier, then discard every row
# that holds a missing value.
test = _data_test[[*features, "NU_INSCRICAO"]].dropna()

# Identifiers of the rows that survived the cleaning; these are the rows that
# will receive a prediction.
inscribeds_column_cleaned = test["NU_INSCRICAO"]

# The identifier is not a model input, so keep only the feature columns.
test = test[features]

In [20]:
# Restrict the training frame to predictors plus label and drop incomplete rows.
train = _data_train[[*features, target]].dropna()

# Separate the label from the predictors.
train_target = train[target]
train = train.drop(columns=target)

# Split the training data through the project helper; the result is indexed by
# "x_train", "x_test" and "y_test" in the following cells.
# NOTE(review): 0.517 and 80 are presumably the held-out fraction and the
# random seed -- confirm against DI.Handler.create_test_from_train.
obj_train = Handler.create_test_from_train(train, train_target, 0.517, 80)

In [21]:
# Build the scaler from the training split only.
# NOTE(review): presumably Refine.standardize fits and returns a scaler --
# confirm in DI.Refine.
scaler = Refine.standardize(obj_train["x_train"])

# Apply the same transformation to both splits of obj_train.
for _split in ("x_train", "x_test"):
    obj_train[_split] = scaler.transform(obj_train[_split])

# The official test features go through the very same scaler.
test = scaler.transform(test)

In [24]:
# Build the classifier from the prepared splits.
# NOTE(review): the helper name suggests a nearest-neighbours model -- confirm
# in DI.AlgoML.
ml = AlgoML.neighborsClassifier(obj_data=obj_train)

# Predict the held-out split and report the classification metrics for it.
holdout_pred = ml.predict(obj_train["x_test"])
Metrics.calculate_classification(obj_train["y_test"], holdout_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6147
           1       1.00      0.98      0.99       952

    accuracy                           1.00      7099
   macro avg       1.00      0.99      0.99      7099
weighted avg       1.00      1.00      1.00      7099


        ----------------------------------------------------------------------
          True Negatives = 6147             |    False Positives = 0
          False Negatives = 23            |    True Positives = 929
        ----------------------------------------------------------------------
            
roc_auc_score = 0.9879201680672269


In [15]:
# Predict the target for the cleaned, already scaled official test rows.
y_test_pred = ml.predict(test)

In [16]:
# Pair every cleaned identifier with its prediction. assign() returns a new
# frame, so the saved identifier Series is left untouched.
# NOTE(review): np.around(..., 1) is kept from the original; presumably a
# no-op when predict returns integer class labels -- confirm.
column_predicted = inscribeds_column_cleaned.to_frame().assign(
    **{target: np.around(y_test_pred, 1)}
)

# Left-join onto the complete identifier list: every original row is kept in
# its original order, and rows dropped during cleaning get NaN as target.
# Only `on` is passed: combining `on` with `right_index=True` mixes two
# mutually exclusive key specifications, which pandas rejects with MergeError.
answer = inscribeds_column_all.to_frame().merge(
    column_predicted, on="NU_INSCRICAO", how="left"
)

answer.head(6)

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
0,ba0cc30ba34e7a46764c09dfc38ed83d15828897,0
1,177f281c68fa032aedbd842a745da68490926cd2,0
2,6cf0d8b97597d7625cdedc7bdb6c0f052286c334,1
3,5c356d810fa57671402502cd0933e5601a2ebf1e,0
4,df47c07bd881c2db3f38c6048bf77c132ad0ceb3,0
5,3f28749fb79fb059caf5aed79625a5addfd7a91a,0


In [17]:
# Sanity check: rows removed during cleaning never received a prediction, so
# the number of empty targets in `answer` must equal the number of removed rows.
_s031k = len(inscribeds_column_all) - len(inscribeds_column_cleaned)

answer[target].isna().sum() == _s031k

True

In [20]:
# Write the final identifier/prediction table to disk without the index column.
submission_path = "/home/agl/codenation/20200606_DS_codenationModule9/3.Deploy/codenationModule9.csv"
answer.to_csv(submission_path, index=False)