In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostRegressor
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / future warnings raised by the
# libraries used below, so API breakage will only show up as hard errors.
warnings.simplefilter(action='ignore')


In [None]:
# Load the raw dataset. The Kaggle input path is tried first; when that file
# is not present (e.g. the notebook runs outside Kaggle) fall back to a copy
# of the CSV sitting next to the notebook. If neither exists, the
# FileNotFoundError from the second read propagates.
try:
    df = pd.read_csv('../input/adult-income-dataset/adult.csv')
except FileNotFoundError:
    df = pd.read_csv('adult.csv')

# Quick sanity check: (rows, columns) followed by the first five records.
print(df.shape)
df.head(5)

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# dtype of every column, to tell numeric features from string-typed ones.
df.dtypes

In [None]:
# Show the value distribution of every low-cardinality column, i.e. columns
# with fewer than 20 distinct (non-null) values.
for column_name in df.columns:
    counts = df[column_name].value_counts()
    if len(counts) >= 20:
        # Too many distinct values for a readable frequency table.
        continue
    print(f'{column_name}:-')
    print(counts)
    print('\n')

In [None]:
# Sub-frame holding only the numeric columns; displayed as the cell output.
numeric = df.select_dtypes(include=np.number)
numeric

In [None]:
# Sub-frame holding only the object-dtype columns; displayed as the cell output.
cat = df.select_dtypes(include=['object'])
cat

In [None]:
# For every numeric column draw two panels side by side:
#   left  - the overall distribution (histogram + KDE),
#   right - a box plot of the same column split by the `income` label.
for col in numeric.columns:
    fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(20, 5))
    # NOTE(review): distplot is deprecated in recent seaborn releases; it is
    # kept so the left-hand panel looks exactly as before.
    sns.distplot(df[col], ax=ax_dist, kde=True)
    # x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally and raises a TypeError otherwise.
    sns.boxplot(x=df['income'], y=df[col], ax=ax_box)
    plt.show()

In [None]:
# For every object-dtype column, plot how often each category occurs,
# with the bars split by the `income` label.
for col in cat.columns:
    print(f'{col}:-')
    # The column is passed explicitly as `x`: in seaborn >= 0.12 the first
    # positional parameter of countplot is `data`, not `x`.
    sns.countplot(x=df[col], hue=df["income"])
    # Category names are long; rotate the tick labels so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Convert every literal "?" cell into a real NaN so pandas treats it as missing.
# Done as one frame-wide replace that is assigned back:
#   * `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0;
#   * calling `.replace(..., inplace=True)` on `df[col]` operates on a temporary
#     under pandas Copy-on-Write and would leave `df` unchanged.
df = df.replace("?", np.nan)

In [None]:
# Impute the missing values of the three affected columns with a fixed label
# per column (presumably the most frequent category for `workclass` and
# `native-country`, and a dedicated placeholder for `occupation` - confirm
# against the value counts printed earlier).
# A single `fillna` with a column -> value mapping is assigned back to `df`;
# the previous per-column `replace(np.NaN, ..., inplace=True)` relied on the
# removed `np.NaN` alias and on chained in-place mutation, which is a no-op
# under pandas Copy-on-Write.
df = df.fillna({
    "workclass": "Private",
    "occupation": "unknowen",
    "native-country": "United-States",
})

In [None]:
# Number of missing values remaining in each column.
df.isnull().sum()

In [None]:
# Label-encode every non-numeric column: each distinct value is replaced by an
# integer code assigned in order of first appearance (0, 1, 2, ...), which is
# the same numbering the previous hand-built dict produced.
# The codes are assigned as a brand-new column (`df[col] = ...`) so the column
# really becomes integer-typed; `df.loc[:, col] = ...` writes into the existing
# object-dtype column on pandas >= 2 and leaves the dtype as object.
# NOTE: pd.factorize encodes NaN as -1; no NaN is expected here because the
# missing values were filled in the cells above.
for col in df.select_dtypes(exclude='number').columns:
    df[col] = pd.factorize(df[col])[0]

In [None]:
# Hold out 20% of the rows for testing, stratified on `income` so both splits
# keep the same class balance. `random_state` is fixed (same seed as the
# models below) so the split - and therefore every reported score - is
# reproducible across kernel restarts.
df_train, df_test = train_test_split(
    df, test_size=0.2, stratify=df['income'], random_state=0
)


In [None]:
# Features: every column except the target.
X_train = df_train.drop(columns=['income'])
X_test = df_test.drop(columns=['income'])

# Target: `income` is already label-encoded as 0 / 1 at this point, so it is
# used directly as a 1-D integer Series. The previous `pd.get_dummies(...,
# drop_first=True)` produced the same 0/1 indicator for a two-class target,
# but as a single-column DataFrame (a column-vector `y`) that was encoded
# independently for the train and the test split.
y_train = df_train['income'].astype(int)
y_test = df_test['income'].astype(int)

In [None]:
# Random forest baseline; `class_weight='balanced'` reweights the classes
# inversely to their frequency in the training labels.
rf = RandomForestClassifier(n_estimators=500, class_weight='balanced', random_state=0)
# np.ravel guarantees a 1-D target whether y_train is a Series or a
# single-column DataFrame.
rf.fit(X_train, np.ravel(y_train))

# Predict once per split and reuse the result; running the 500-tree forest
# twice over the test set only to feed two different metrics is wasted work.
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)

print(classification_report(y_test, rf_test_pred))
# F1 on train vs. test - a large gap indicates overfitting.
print('Train score : ', f1_score(y_train, rf_train_pred))
print('test score : ', f1_score(y_test, rf_test_pred))

In [None]:
# Gradient-boosted trees: 800 boosting stages of depth-3 trees.
gb = GradientBoostingClassifier(n_estimators=800, max_depth=3, random_state=0)
# np.ravel guarantees a 1-D target whether y_train is a Series or a
# single-column DataFrame.
gb.fit(X_train, np.ravel(y_train))

# Predict once per split and reuse the result for both the report and the
# F1 scores instead of re-running all 800 stages for each metric.
gb_train_pred = gb.predict(X_train)
gb_test_pred = gb.predict(X_test)

print(classification_report(y_test, gb_test_pred))
# F1 on train vs. test - a large gap indicates overfitting.
print('Train score : ', f1_score(y_train, gb_train_pred))
print('test score : ', f1_score(y_test, gb_test_pred))