In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler 
from xgboost import XGBClassifier
import graphviz
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from typing import OrderedDict

In [2]:
def make_numeric(dataset):
    """Label-encode every object-dtype column of ``dataset`` in place.

    Each object column is overwritten with the integer codes produced by a
    scikit-learn ``LabelEncoder`` fitted on that column's own values.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to convert. It is mutated; nothing is returned.

    Notes
    -----
    The encoder is re-fitted for every column and is not kept, so the
    label -> code mapping depends only on the values present in the frame
    passed in. Encoding two frames with separate calls does not guarantee
    that the same label receives the same code in both.
    """
    encoder = preprocessing.LabelEncoder()
    # Snapshot the object-typed column labels up front; the loop then
    # rewrites those columns one by one.
    object_columns = list(dataset.select_dtypes(object).columns)
    for column_name in object_columns:
        raw_values = dataset[column_name].values
        dataset[column_name] = encoder.fit_transform(raw_values)
        
# Load the competition data: `train` carries the 'income>50K' label column,
# `test` carries an 'ID' column that is not a feature.
train = pd.read_csv('/kaggle/input/ilp2021f/train_final.csv')
test = pd.read_csv('/kaggle/input/ilp2021f/test_final.csv')

# Split features from the label; keep the label as a one-column uint8 frame.
trainX = train.drop(columns=['income>50K'])
trainY = train[['income>50K']].astype(np.uint8)
test.drop(columns=['ID'],inplace=True)

# Sanity check before stacking the two frames: concat would silently add
# all-NaN columns if the feature sets differed.
assert set(trainX.columns) == set(test.columns), \
    "train and test must expose the same feature columns"

# Encode train and test features TOGETHER. make_numeric fits a fresh
# LabelEncoder per column on whatever values it sees, so calling it on each
# frame separately can assign different integer codes to the same category
# whenever a category is present in only one of the frames. Stacking the
# frames first gives one shared label -> code mapping per column.
n_train = len(trainX)
combined = pd.concat([trainX, test], ignore_index=True)
make_numeric(combined)
trainX = combined.iloc[:n_train].copy()
test = combined.iloc[n_train:].reset_index(drop=True)

# Also encode the full training frame (features + label). The model cells
# that follow read only trainX, trainY and test.
make_numeric(train)

In [3]:
# Output Decision Tree Predictions
#
# Fits a depth-4 entropy tree on trainX / trainY and writes its predictions
# for `test` to DecisionTree.csv.

# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be broken
# differently from run to run. The target is passed as a 1-D column, matching
# the AdaBoost and RandomForest cells.
model = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=0)
model = model.fit(trainX, trainY['income>50K'])
predictions = model.predict(test)
# NOTE(review): IDs are regenerated as 1..n because the real 'ID' column was
# dropped from `test`; this assumes the test file is ordered by ID starting
# at 1 - TODO confirm.
submission = pd.DataFrame({'ID': range(1,test.shape[0]+1), 'Prediction': predictions})
submission.to_csv('DecisionTree.csv', index=False)

In [4]:
# Output AdaBoost Predictions
#
# Fits a 20-estimator AdaBoost ensemble (seeded) on trainX / trainY and
# writes its predictions for `test` to AdaBoost.csv.

model = AdaBoostClassifier(
    n_estimators=20,
    learning_rate=1,
    random_state=0,
).fit(trainX, trainY['income>50K'])  # fit() returns the fitted estimator

predictions = model.predict(test)

# NOTE(review): IDs are regenerated as 1..n, one per test row; this assumes
# the test file is ordered by ID starting at 1 - TODO confirm.
submission = pd.DataFrame({
    'ID': range(1, len(test) + 1),
    'Prediction': predictions,
})
submission.to_csv('AdaBoost.csv', index=False)

In [5]:
# Output RandomForest Predictions
#
# Fits a seeded forest of 20 depth-2 trees on trainX / trainY and writes its
# predictions for `test` to RandomForest.csv.

model = RandomForestClassifier(
    n_estimators=20,
    max_depth=2,
    random_state=0,
).fit(trainX, trainY['income>50K'])  # fit() returns the fitted estimator

predictions = model.predict(test)

# NOTE(review): IDs are regenerated as 1..n, one per test row; this assumes
# the test file is ordered by ID starting at 1 - TODO confirm.
submission = pd.DataFrame({
    'ID': range(1, len(test) + 1),
    'Prediction': predictions,
})
submission.to_csv('RandomForest.csv', index=False)

In [6]:
# Feature Tuning For XGBoost

def fix(dataset):
    """Label-encode the object-dtype columns of ``dataset`` in place.

    Every column whose dtype compares equal to ``'object'`` is replaced by
    the integer codes from a scikit-learn ``LabelEncoder`` fitted on that
    column's values; all other columns are left as they are.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to convert. It is mutated; nothing is returned.

    Notes
    -----
    The encoder is re-fitted per column and discarded, so the mapping
    depends only on the values present in the frame passed in.
    """
    encoder = preprocessing.LabelEncoder()
    # Decide which columns need encoding before touching any of them.
    object_cols = [name for name in dataset.columns
                   if dataset[name].dtypes == 'object']
    for name in object_cols:
        dataset[name] = encoder.fit_transform(dataset[name].values)
            
# Reload the raw data so this cell does not depend on earlier in-place edits.
train = pd.read_csv('/kaggle/input/ilp2021f/train_final.csv')
trainX = train.drop(columns=['income>50K'])
trainY = train[['income>50K']].astype(np.uint8)

test = pd.read_csv('/kaggle/input/ilp2021f/test_final.csv')
test.drop(columns=['ID'],inplace=True)

# Sanity check before stacking the two frames: concat would silently add
# all-NaN columns if the feature sets differed.
assert set(trainX.columns) == set(test.columns), \
    "train and test must expose the same feature columns"

# Encode train and test features in ONE pass. fix() fits a fresh LabelEncoder
# per column on whatever values it sees, so encoding the frames separately
# can give the same category different integer codes in train and test.
n_train = len(trainX)
combined = pd.concat([trainX, test], ignore_index=True)
fix(combined)
trainX = combined.iloc[:n_train].copy()
test = combined.iloc[n_train:].reset_index(drop=True)

# Oversample the training set only. fit_resample() performs the fit itself,
# so no separate fit() call is needed. random_state is pinned so the
# resampled rows (and everything trained on them) are reproducible.
rs = RandomOverSampler(random_state=0)
trainX,trainY = rs.fit_resample(trainX, trainY)

In [7]:
# Output XG Boost
#
# Fits an XGBoost classifier on trainX / trainY, prints its accuracy on that
# same training data, and writes its predictions for `test` to XGBoost.csv.

model = XGBClassifier(
    min_child_weight=5,
    subsample=0.9,
).fit(trainX, trainY['income>50K'])  # fit() returns the fitted estimator

# Accuracy on the data the model was fitted on (not a held-out estimate).
p = model.predict(trainX)
print((p == trainY['income>50K']).mean())

predictions = model.predict(test)
# NOTE(review): IDs are regenerated as 1..n, one per test row; this assumes
# the test file is ordered by ID starting at 1 - TODO confirm.
submission = pd.DataFrame({
    'ID': range(1, len(test) + 1),
    'Prediction': predictions,
})
submission.to_csv('XGBoost.csv', index=False)