# <center>Give Me Credit: Kaggle competition</center>

## Première étape : Importation des librairies nécessaires

In [None]:
from bigml.api import BigML
from pandas import read_csv
import kaggle
import matplotlib.pyplot as plt
import pandas as pd
import random

*<center>Préciser l'id du projet BigML</center>*

In [None]:
# BigML API client configured with the project id to use.
# NOTE(review): no credentials are passed here — presumably they are read from
# the environment; confirm before running on a new machine.
api = BigML(project= 'project/5d94a407eba31d45c8000088')

## Deuxième étape: Importer les données de train

*<center>Créer le dataset de training: trainfull</center>*

In [None]:
# Load the raw Kaggle training file into a DataFrame.
df = read_csv('../handson-ml2/kaggle-give-me-credit-train.csv')

In [None]:
# Replace missing values with 0 and name the id column before deriving features.
df = df.fillna(0).rename(columns={"Unnamed: 0": "Id"})

# Income available per household member (the applicant counts as one person).
household_size = df['NumberOfDependents'] + 1
df['IncomePerPerson'] = df['MonthlyIncome'] / household_size

# Total number of late-payment events across the three lateness buckets.
df['NumberOfDaysLate'] = (
    df['NumberOfTimes90DaysLate']
    + df['NumberOfTime60-89DaysPastDueNotWorse']
    + df['NumberOfTime30-59DaysPastDueNotWorse']
)

# Open credit lines that are not real-estate loans.
df['NumberCreditLines'] = (
    df['NumberOfOpenCreditLinesAndLoans'] - df['NumberRealEstateLoansOrLines']
)

# Monthly debt derived from the debt ratio, and what is left of the income.
df['MonthlyDebt'] = df['MonthlyIncome'] * df['DebtRatio']
df['MonthlyBalance'] = df['MonthlyIncome'] - df['MonthlyDebt']

In [None]:
# Display the engineered training frame.
df

In [None]:
# Write the engineered training set to the exact path the upload cell reads
# ('../handson-ml2/trainfull.csv'); writing to the bare working directory only
# worked when the notebook happened to run from that folder.
# The index is still written (pandas default), so the file keeps its extra
# unnamed first column.
df.to_csv('../handson-ml2/trainfull.csv')

In [None]:
# Upload the engineered training CSV to BigML as a source.
source = api.create_source('../handson-ml2/trainfull.csv')

In [None]:
# Build the full training dataset (trainfull) from the uploaded source.
origin_dataset = api.create_dataset(source)

## Troisième étape: Séparation en train et val

*<center>Split 80/20 du trainfull en train et validation</center>*

In [None]:
# 80% sample of trainfull. The same sample_rate and seed are reused by the
# validation cell (with out_of_bag) so both datasets come from one split.
train_split_args = {"name": "Train80", "sample_rate": 0.8, "seed": "myseed"}
train_dataset = api.create_dataset(origin_dataset, train_split_args)

In [None]:
# Validation set: same sample_rate and seed as the training sample, but with
# out_of_bag=True.
# NOTE(review): presumably this selects the rows NOT drawn by the 80% sample
# (the remaining 20%) — confirm against the BigML sampling documentation.
val_split_args = {
    "name": "Val",
    "sample_rate": 0.8,
    "seed": "myseed",
    "out_of_bag": True,
}
val_dataset = api.create_dataset(origin_dataset, val_split_args)

## Quatrième étape: Modèle d'apprentissage

*<center>Lancement d'un Modèle ou d'un Deepnet ou autre...</center>*

In [None]:
# Train an ensemble on the 80% sample, predicting SeriousDlqin2yrs.
ensemble_args = {"objective_field": "SeriousDlqin2yrs"}
ensemble = api.create_ensemble(train_dataset, ensemble_args)
api.ok(ensemble)

*<center>Evaluation du modèle</center>*

In [None]:
# Evaluate the ensemble on the validation dataset and print the result.
evaluation = api.create_evaluation(ensemble, val_dataset)
api.status(evaluation)  # NOTE(review): the returned status is discarded here
api.ok(evaluation)  # presumably waits until the evaluation is finished — confirm
# Re-fetch the evaluation so that the 'result' section is available locally.
evaluation = api.get_evaluation(evaluation)
api.pprint(evaluation['object']['result'])

*<center>Lancer la prédiction à partir de notre modèle en demandant les probabilités</center>*

In [None]:
# Score the validation dataset, keeping every input field, the header row and
# the per-class probabilities in the output.
validation_prediction_args = {
    "all_fields": True,
    "header": True,
    "probabilities": True,
}
batch_prediction = api.create_batch_prediction(
    ensemble, val_dataset, validation_prediction_args
)
api.ok(batch_prediction)

*<center>Récupérer la batch prediction au format csv</center>*

In [None]:
# Save the validation batch prediction locally as a CSV file.
api.download_batch_prediction(batch_prediction, filename='../handson-ml2/my_predictionsvalidation.csv')

In [None]:
# Load the downloaded predictions. The second SeriousDlqin2yrs column (read by
# pandas as "SeriousDlqin2yrs.1") is renamed to MyPrediction, and the "field1"
# column is dropped.
predictions_path = '../handson-ml2/my_predictionsvalidation.csv'
df = (
    read_csv(predictions_path)
    .rename(columns={"SeriousDlqin2yrs.1": "MyPrediction"})
    .drop(columns=["field1"])
)
df

## Cinquième étape: vérifier les erreurs pour évaluer notre modèle

In [None]:
# Count the four confusion-matrix cells on the validation predictions.
# Boolean masks over the two columns give the same counts as walking the
# frame row by row, without the per-cell lookups on a transposed frame.
actual = df['SeriousDlqin2yrs']
predicted = df['MyPrediction']

TN = int(((actual == 0) & (predicted == 0)).sum())
TP = int(((actual == 1) & (predicted == 1)).sum())
FN = int(((actual == 1) & (predicted == 0)).sum())
FP = int(((actual == 0) & (predicted == 1)).sum())

print(f"TN : {TN}")
print(f"TP : {TP}")
print(f"FN : {FN}")
print(f"FP : {FP}")

*<center> Ajouter une colonne erreur contenant les TN,FN, TP, FP</center>*

In [None]:
def confusion(row):
    """Return the confusion-matrix label of one prediction row.

    Args:
        row: mapping (e.g. a DataFrame row) with the actual class under
            'SeriousDlqin2yrs' and the predicted class under 'MyPrediction',
            both expected to be 0 or 1.

    Returns:
        'TN', 'TP', 'FN' or 'FP'.

    Raises:
        ValueError: if the pair (actual, predicted) is not made of 0/1 values
            (for instance a missing prediction). Previously this case failed
            with an unrelated UnboundLocalError.
    """
    actual = row['SeriousDlqin2yrs']
    predicted = row['MyPrediction']

    if actual == 0 and predicted == 0:
        return 'TN'
    if actual == 1 and predicted == 1:
        return 'TP'
    if actual == 1 and predicted == 0:
        return 'FN'
    if actual == 0 and predicted == 1:
        return 'FP'

    raise ValueError(
        f"Cannot classify row: SeriousDlqin2yrs={actual!r}, "
        f"MyPrediction={predicted!r} (both must be 0 or 1)"
    )

# Label every validation row with its confusion-matrix cell, then save the
# labelled frame (without the index) for the following cells.
error_labels = df.apply(confusion, axis=1)
df['error'] = error_labels
confusion_csv_path = '../handson-ml2/kaggle-give-me-credit-train_confusion.csv'
df.to_csv(confusion_csv_path, index=False)

In [None]:
# Reload the labelled validation predictions from disk and display them.
df = read_csv('../handson-ml2/kaggle-give-me-credit-train_confusion.csv')
df

*<center> Mesurer l'accuracy, compter les différentes erreurs, et construire une matrice de confusion</center>*

In [None]:
# Accuracy = correctly classified rows / all rows, from the confusion counts.
correct_predictions = TN + TP
total_predictions = TP + TN + FP + FN
accuracy = correct_predictions / total_predictions
accuracy

In [None]:
# Number of rows per confusion label found in the 'error' column.
matrice = df['error'].value_counts()
matrice

In [None]:
# Build the confusion matrix (actual class in rows, predicted class in
# columns) from the saved predictions. Only the two columns needed are loaded,
# and the crosstab is computed on that same frame instead of on the unrelated
# notebook-level `df`.
data = read_csv(
    '../handson-ml2/kaggle-give-me-credit-train_confusion.csv',
    usecols=['SeriousDlqin2yrs', 'MyPrediction'],
)

confusion_matrix = pd.crosstab(
    data['SeriousDlqin2yrs'],
    data['MyPrediction'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)

*<center> Calculer le gain à partir d'un seuil </center>*

In [None]:
def gain_seuil(seuil, data=None):
    """Return the gain obtained when predicting class 1 above threshold *seuil*.

    A row is predicted positive when its '1 probability' is strictly greater
    than *seuil*, and negative otherwise (a probability equal to the threshold
    is therefore a negative prediction). Each row gets its confusion label
    ('TP', 'FP', 'TN' or 'FN') written to the 'error' column of the frame, and
    the gain is computed from the counts looked up by label:

        gain = 500 * TN - 2500 * FN - 500 * FP

    NOTE(review): the 500 / 2500 / 500 weights are taken from the previous
    formula, which picked the counts by frequency position in
    ``value_counts()``; mapping them to TN / FN / FP is the presumed intent —
    confirm with the business rule.

    Args:
        seuil: probability threshold.
        data: frame holding the '1 probability' and 'SeriousDlqin2yrs'
            columns. Defaults to the notebook-level ``df``.

    Returns:
        The gain as an int.
    """
    frame = df if data is None else data

    predicted_positive = frame['1 probability'] > seuil
    actual_positive = frame['SeriousDlqin2yrs'] == 1
    actual_negative = frame['SeriousDlqin2yrs'] == 0

    masks = {
        'TP': predicted_positive & actual_positive,
        'FP': predicted_positive & actual_negative,
        'TN': ~predicted_positive & actual_negative,
        'FN': ~predicted_positive & actual_positive,
    }

    # Rows whose actual class is neither 0 nor 1 stay unlabelled (missing).
    labels = pd.Series(index=frame.index, dtype=object)
    for label, mask in masks.items():
        labels[mask] = label
    frame['error'] = labels

    # Count by label, so a label that never occurs simply counts as 0.
    counts = {label: int(mask.sum()) for label, mask in masks.items()}
    return 500 * counts['TN'] - 2500 * counts['FN'] - 500 * counts['FP']

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Gain for every threshold from 0.01 to 0.99.
seuil = [(n/100) for n in range(1,100)]
table = [gain_seuil(s) for s in seuil]

# Threshold on the x axis, gain on the y axis (the labels used to be swapped
# and the curve was drawn against the list position instead of the threshold).
plt.plot(seuil, table)
plt.xlabel("Seuil")
plt.ylabel("Gain")
plt.title('Gain maximisé')

# Report the threshold itself, not its position in the list.
max_val = max(table)
max_threshold = seuil[table.index(max_val)]
print(f'Le gain maximum est {max_val} et il est obtenu avec un seuil de {max_threshold}')

*<center> Récupérer les 100 plus grosses erreurs </center>*

In [None]:
# Distance between the predicted probability of class 1 and the actual class.
signed_error = df['1 probability'] - df['SeriousDlqin2yrs']
df['absolute_error'] = signed_error.abs()

# Keep the 100 rows with the largest error and save them.
ranked_by_error = df.sort_values(by='absolute_error', ascending=False)
desc_order = ranked_by_error.head(100)
desc_order.to_csv('../handson-ml2/100error.csv', index=False)

*<center> Calculer l'AUC </center>*

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Reference AUC computed by scikit-learn from the actual classes and the
# predicted probabilities of class 1.
y_true = df['SeriousDlqin2yrs'].values
y_score = df['1 probability'].values
score = roc_auc_score(y_true, y_score)
print(score)

In [None]:
positif = df.loc[df['SeriousDlqin2yrs'] == 1]  # all actual positives
negatif = df.loc[df['SeriousDlqin2yrs'] == 0]  # all actual negatives

# AUC = share of (positive, negative) pairs in which the positive row gets the
# higher probability, a tie counting for half a pair. Sorting the negative
# scores once lets searchsorted count, for each positive score, the negatives
# strictly below it ('left') and those below or equal ('right'), instead of
# comparing every pair in a Python loop.
scores_neg = negatif['1 probability'].sort_values()
scores_pos = positif['1 probability'].values

strictly_below = scores_neg.searchsorted(scores_pos, side='left')
below_or_equal = scores_neg.searchsorted(scores_pos, side='right')

wins = int(strictly_below.sum())
ties = int((below_or_equal - strictly_below).sum())

x = wins + 0.5 * ties  # pairs won by the positive row (ties count half)
y = len(positif) * len(negatif)  # total number of pairs
if y == 0:
    raise ValueError("AUC is undefined: both classes need at least one row")

print(f"L'AUC est égale à {round((x/y)*100,2)}%")

In [None]:
# Same pairwise count as a nested loop over every (positive, negative) pair,
# obtained from one sort of the negative scores instead of per-cell lookups on
# a transposed frame.
classes = df['SeriousDlqin2yrs']
positive_scores = df.loc[classes == 1, '1 probability'].values
negative_scores = df.loc[classes == 0, '1 probability'].sort_values()

# For each positive score, searchsorted(side='left') gives the number of
# negative scores strictly below it, i.e. the pairs where the positive wins.
# NOTE: ties are not counted as wins here, so the ratio can be slightly below
# the value returned by roc_auc_score.
nombre_supérieur = int(negative_scores.searchsorted(positive_scores, side='left').sum())

# Total number of (positive, negative) pairs: the denominator of the ratio.
nombre_iteration = len(positive_scores) * len(negative_scores)

print(nombre_supérieur)
print(nombre_iteration)
print(f"{nombre_supérieur/nombre_iteration}")

## Sixième étape: Préparer les données de test

In [None]:
# Load the raw Kaggle test file; from here on `df` holds the test data.
df = read_csv('../handson-ml2/kaggle-give-me-credit-test.csv')

In [None]:
df = df.rename(columns={"Unnamed: 0": "Id"})

# Impute missing feature values with 0, as is done for the training set, so
# the derived features below are built the same way for both files. The
# objective column is left untouched.
feature_columns = df.columns.drop('SeriousDlqin2yrs', errors='ignore')
df[feature_columns] = df[feature_columns].fillna(0)

# Same derived features as for the training set.
df['IncomePerPerson'] = df['MonthlyIncome']/ (df['NumberOfDependents']+1)
df['NumberOfDaysLate'] = df['NumberOfTimes90DaysLate']+ df['NumberOfTime60-89DaysPastDueNotWorse']+ df['NumberOfTime30-59DaysPastDueNotWorse']
df['NumberCreditLines'] = df['NumberOfOpenCreditLinesAndLoans']- df['NumberRealEstateLoansOrLines']
df['MonthlyDebt'] = df['MonthlyIncome'] * df['DebtRatio']
df['MonthlyBalance'] = df['MonthlyIncome'] - df['MonthlyDebt']

In [None]:
# Display the engineered test frame.
df

In [None]:
# Write the engineered test set to the exact path the upload cell reads
# ('../handson-ml2/test.csv') rather than to the bare working directory.
# The index is still written (pandas default).
df.to_csv('../handson-ml2/test.csv')

*<center> Créer un dataset de test </center>*

In [None]:
# Upload the engineered test CSV to BigML as a source.
test_source = api.create_source('../handson-ml2/test.csv')
api.ok(test_source)  # presumably waits until the source is ready — confirm

In [None]:
# Build the test dataset from the uploaded test source.
test_dataset = api.create_dataset(test_source)

## Septième étape: Modèle avec le trainfull/test

In [None]:
# Final model: an ensemble trained on the whole trainfull dataset (no
# hold-out), predicting SeriousDlqin2yrs.
final_ensemble_args = {"objective_field": "SeriousDlqin2yrs"}
ensemble1 = api.create_ensemble(origin_dataset, final_ensemble_args)
api.ok(ensemble1)

In [None]:
# Score the test dataset with the final ensemble, asking for the Id field and
# the per-class probabilities in the output.
test_prediction_args = {"output_fields": ["Id"], "probabilities": True}
batch_prediction1 = api.create_batch_prediction(
    ensemble1, test_dataset, test_prediction_args
)
api.ok(batch_prediction1)

In [None]:
# Save the test-set batch prediction locally as a CSV file.
api.download_batch_prediction(batch_prediction1, filename='../handson-ml2/my_predictionsfinales.csv')

## Huitième étape: Soumission à Kaggle

In [None]:
# Load the downloaded test-set predictions to build the submission from them.
submit = read_csv('../handson-ml2/my_predictionsfinales.csv')

*<center> Mettre au bon format pour kaggle </center>*

In [None]:
# Rename the class-1 probability to "Probability" and drop the class-0
# probability and predicted-class columns.
submit = (
    submit
    .rename(columns={"1 probability": "Probability"})
    .drop(columns=["0 probability", "SeriousDlqin2yrs"])
)

In [None]:
# Display the formatted submission frame.
submit

In [None]:
# Write the submission file to the working directory, without the index.
submit.to_csv("submitkaggle.csv", index=False)

In [None]:
# Path of the file to submit (the same file name written by the previous cell).
submission_file = "submitkaggle.csv"

*<center> Soumission à kaggle </center>*

In [None]:
# Send the submission file to Kaggle.
# NOTE(review): the arguments look like (file, message, competition) — confirm
# the order against the installed kaggle package.
kaggle.api.competition_submit(submission_file, "BigML model", "GiveMeSomeCredit")