In [126]:
import pandas as pd
import numpy as np

from importlib.machinery import SourceFileLoader
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Load the project's helper module straight from its source file.
# NOTE(review): SourceFileLoader.load_module() is documented as deprecated in
# favour of exec_module(); it is kept here so module loading behaves as before.
DI = SourceFileLoader("DI", "src/DI.py").load_module()

# One instance of each helper class exposed by DI, reused by the cells below.
Handler = DI.Handler.Handler()
Graph = DI.Graph.Graph()
AlgoML = DI.AlgoML.AlgoML()
Refine = DI.Refine.Refine()
Metrics = DI.Metrics.Metrics()

# Single place to repoint the notebook at another copy of the data
# (previously the same absolute directory was repeated in each read_csv call).
DATA_DIR = "/home/agl/codenation/20200606_DS_codenationModule9/1.Original"

_data_train = pd.read_csv(f"{DATA_DIR}/train.csv")
_data_test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [3]:
# All of the exploratory analysis was done in analyze.ipynb

#1 - Drop the necessary columns, but remember to store them in a variable because they will need to be concatenated back at the end
#2 - Remove the columns that have 60% null values if they also have a low coefficient
#3 - Create different datasets with different strategies to use in the model and see which one gives the better result

In [4]:
# Columns selected from the train and test data as model inputs.
features = ["NU_IDADE", "TP_ST_CONCLUSAO", "TP_ANO_CONCLUIU", "TP_ESCOLA"]

# Column the models are trained to predict.
target = "IN_TREINEIRO"

In [5]:
# Every registration id in the official test set, kept so the full list can be
# rebuilt after rows with missing values are dropped.
inscribeds_column_all = _data_test["NU_INSCRICAO"]

# Test rows with no missing value in the selected columns; the id column rides
# along so the surviving ids can be recovered.
_test_complete = _data_test[[*features, "NU_INSCRICAO"]].dropna()

# Ids of the rows that survived the null filter.
inscribeds_column_cleaned = _test_complete["NU_INSCRICAO"]

# Model input: the feature columns only, with the id column removed.
test = _test_complete.drop(columns="NU_INSCRICAO")

In [128]:
# Training rows that are complete for every feature and for the target.
_train_complete = _data_train[[*features, target]].dropna()

# Separate the label from the predictors.
train_target = _train_complete[target]
train = _train_complete.drop(columns=target)

# Hold-out split built by the project helper; later cells index the result with
# "x_train", "x_test" and "y_test".
# NOTE(review): 0.517 and 80 are presumably the test fraction and the random
# seed — confirm against Handler.create_test_from_train.
obj_train = Handler.create_test_from_train(train, train_target, 0.517, 80)

In [129]:
# Scaler obtained from the training predictors only.
# NOTE(review): presumably a fitted standard scaler — confirm in Refine.standardize.
scaler = Refine.standardize(obj_train["x_train"])

# Apply the same transform to both halves of the hold-out split. The entries are
# overwritten, so re-running this cell transforms already-transformed data.
for _split in ("x_train", "x_test"):
    obj_train[_split] = scaler.transform(obj_train[_split])

# The official test data goes through the same transform.
test = scaler.transform(test)

In [130]:
# Model built by the project's neighbours-classifier helper, then scored on the
# hold-out part of the split.
ml = AlgoML.neighborsClassifier(obj_data=obj_train)

_ml_holdout_pred = ml.predict(obj_train["x_test"])
Metrics.calculate_classification(obj_train["y_test"], _ml_holdout_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6147
           1       1.00      0.98      0.99       952

    accuracy                           1.00      7099
   macro avg       1.00      0.99      0.99      7099
weighted avg       1.00      1.00      1.00      7099


        ----------------------------------------------------------------------
          True Negatives = 6147             |    False Positives = 0
          False Negatives = 23            |    True Positives = 929
        ----------------------------------------------------------------------
            
roc_auc_score = 0.9879201680672269
avg_precision_score = 0.9790802290771218


In [144]:
# Model built by the project's random-forest helper, then scored on the
# hold-out part of the split.
forest = AlgoML.randomForestClassifier(obj_data=obj_train)

_forest_holdout_pred = forest.predict(obj_train["x_test"])
Metrics.calculate_classification(obj_train["y_test"], _forest_holdout_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6147
           1       1.00      0.98      0.99       952

    accuracy                           1.00      7099
   macro avg       1.00      0.99      0.99      7099
weighted avg       1.00      1.00      1.00      7099


        ----------------------------------------------------------------------
          True Negatives = 6147             |    False Positives = 0
          False Negatives = 23            |    True Positives = 929
        ----------------------------------------------------------------------
            
roc_auc_score = 0.9879201680672269
avg_precision_score = 0.9790802290771218


0.996760107057332

In [132]:
# Model built by the project's Gaussian naive Bayes helper, then scored on the
# hold-out part of the split.
naive = AlgoML.guassianNB(obj_data=obj_train)

_naive_holdout_pred = naive.predict(obj_train["x_test"])
Metrics.calculate_classification(obj_train["y_test"], _naive_holdout_pred)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      6147
           1       0.81      1.00      0.89       952

    accuracy                           0.97      7099
   macro avg       0.90      0.98      0.94      7099
weighted avg       0.97      0.97      0.97      7099


        ----------------------------------------------------------------------
          True Negatives = 5921             |    False Positives = 226
          False Negatives = 0            |    True Positives = 952
        ----------------------------------------------------------------------
            
roc_auc_score = 0.9816170489669758
avg_precision_score = 0.8081494057724957


In [133]:
# Model built by the project's SVM helper, then scored on the hold-out part of
# the split.
svm = AlgoML.svm(obj_data=obj_train)

_svm_holdout_pred = svm.predict(obj_train["x_test"])
Metrics.calculate_classification(obj_train["y_test"], _svm_holdout_pred)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      6147
           1       0.90      1.00      0.95       952

    accuracy                           0.98      7099
   macro avg       0.95      0.99      0.97      7099
weighted avg       0.99      0.98      0.98      7099


        ----------------------------------------------------------------------
          True Negatives = 6037             |    False Positives = 110
          False Negatives = 0            |    True Positives = 952
        ----------------------------------------------------------------------
            
roc_auc_score = 0.9910525459573776
avg_precision_score = 0.896421845574388


In [134]:
# Model built by the project's logistic-regression helper, then scored on the
# hold-out part of the split.
logicReg = AlgoML.logisticRegression(obj_data=obj_train)

_logic_reg_holdout_pred = logicReg.predict(obj_train["x_test"])
Metrics.calculate_classification(obj_train["y_test"], _logic_reg_holdout_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6147
           1       0.98      0.98      0.98       952

    accuracy                           0.99      7099
   macro avg       0.99      0.99      0.99      7099
weighted avg       0.99      0.99      0.99      7099


        ----------------------------------------------------------------------
          True Negatives = 6129             |    False Positives = 18
          False Negatives = 23            |    True Positives = 929
        ----------------------------------------------------------------------
            
roc_auc_score = 0.9864560392238886
avg_precision_score = 0.9605320495096242


In [None]:
# Predict the target for the official test rows with `ml`, the model returned
# by AlgoML.neighborsClassifier earlier in the notebook.
y_test_pred = ml.predict(test)

In [None]:
# Attach the predictions to the ids that were actually scored, then bring back
# the ids whose rows were dropped for missing values (their target stays NaN).
column_predicted = pd.DataFrame(inscribeds_column_cleaned.copy())
column_predicted[target] = np.around(y_test_pred, 1)

# Join on the id column only. Do not combine `on` with right_index=True:
# pandas raises MergeError for that combination.
answer = pd.merge(
    pd.DataFrame(inscribeds_column_all),
    column_predicted,
    on="NU_INSCRICAO",
    how="outer",
)

answer.head(6)

In [None]:
# Sanity check: the number of missing predictions in `answer` must equal the
# number of test rows that were dropped for containing null values.
_s031k = inscribeds_column_all.shape[0] - inscribeds_column_cleaned.shape[0]

# Asserted rather than left as a bare expression: only a cell's last expression
# is displayed, so a mid-cell comparison is evaluated and silently discarded.
assert answer.isna().sum()[target] == _s031k, (
    "null predictions in `answer` do not match the number of dropped test rows"
)

# MERGE EXAMPLE
inscribeds_column_all = _data_test["NU_INSCRICAO"]

inscribes_column_withoutn_null = _data_test.dropna(subset=features)
pd.merge(inscribes_column_withoutn_null, pd.DataFrame(inscribeds_column_all), how="outer").shape