In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler    
from sklearn.ensemble import AdaBoostClassifier


# Remote locations of the cleaned test and train CSV files, fetched later with
# pd.read_csv. Both are raw.githubusercontent.com URLs that carry a `token`
# query parameter.
# NOTE(review): the `token=` values look like access credentials committed in
# plain text -- confirm whether they are still valid, and consider loading the
# URLs (or the token) from configuration/environment instead of the notebook.
testCSV = 'https://raw.githubusercontent.com/andybbruno/DataMining/master/new_test_cleaned.csv?token=AGWKQXZ3CGK7FSO5Y7QZFTS6DCHQK'
trainCSV = 'https://raw.githubusercontent.com/andybbruno/DataMining/master/new_train_cleaned.csv?token=AGWKQXZ5YDIRGB6HOYWHTPC6DCHQQ'



In [2]:
# Compare AdaBoost on the raw training set against randomly over- and
# under-sampled versions of it. Every run is scored on the same, untouched
# test set and printed as a classification report.

# Fixed seed shared by the samplers and the classifier so that
# "Restart Kernel -> Run All" reproduces the same reports.
SEED = 42


def load_cleaned(csv_url):
    """Read a CSV and return it without its first column and without `RefId`.

    Parameters
    ----------
    csv_url : str
        Path or URL accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame; nothing is modified in place.

    Raises
    ------
    KeyError
        If no `RefId` column remains after the first column is removed.
    """
    frame = pd.read_csv(csv_url)
    # The first column is dropped by position -- presumably a leftover index
    # written when the cleaned CSV was saved; verify against the data.
    return frame.drop(columns=frame.columns[0]).drop(columns=['RefId'])


# Load each file once instead of re-downloading it for every experiment.
train_df = load_cleaned(trainCSV)
# `df` stays bound to the cleaned test frame, matching what the previous
# version of this cell left behind for any later cells.
df = load_cleaned(testCSV)

y_train_full = train_df['IsBadBuy']
X_train_full = train_df.drop(columns=['IsBadBuy'])

y_test = df['IsBadBuy']
X_test = df.drop(columns=['IsBadBuy'])

# (banner, sampler) pairs; a sampler of None means "train on the data as-is".
experiments = [
    ("################## ADABOOST ##################", None),
    ("################## ADABOOST + OVERSAMPLING ##################",
     RandomOverSampler(random_state=SEED)),
    ("################## ADABOOST + UNDERSAMPLING ##################",
     RandomUnderSampler(random_state=SEED)),
]

for banner, sampler in experiments:
    print(banner)

    # Resampling is applied to the training split only; the test split keeps
    # its original class distribution.
    if sampler is None:
        X_train, y_train = X_train_full, y_train_full
    else:
        X_train, y_train = sampler.fit_resample(X_train_full, y_train_full)

    clf = AdaBoostClassifier(learning_rate=0.1, n_estimators=5, random_state=SEED)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

################## ADABOOST ##################
              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12829
           1       0.68      0.23      0.35      1768

    accuracy                           0.89     14597
   macro avg       0.79      0.61      0.64     14597
weighted avg       0.88      0.89      0.87     14597

################## ADABOOST + OVERSAMPLING ##################
              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12829
           1       0.68      0.23      0.35      1768

    accuracy                           0.89     14597
   macro avg       0.79      0.61      0.64     14597
weighted avg       0.88      0.89      0.87     14597

################## ADABOOST + UNDERSAMPLING ##################
              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12829
           1       0.68      0.23      0.35      1768

    accura