In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import  LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

**Dataset Link:** https://www.kaggle.com/datasets/wenruliu/adult-income-dataset

# Understanding the Dataset

In [None]:
data=pd.read_csv("../input/adult-income-dataset/adult.csv")

In [None]:
data.info()

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.describe().T

In [None]:
data.describe(include='object')

In [None]:
data.income.unique()

In [None]:
# data.income=data.income.replace(['<=50K', '>50K'],[0,1])

In [None]:
# Class balance of the target column.
# The vector is passed as `x=` because seaborn >= 0.12 only accepts `data`
# positionally; a positional Series is no longer interpreted as the x variable.
sns.countplot(x=data.income)

# Feature Selection, EDA

In [None]:
data.age.unique()

In [None]:
sns.boxplot(data.income,data.age)


In [None]:
sns.boxplot(data.income,data['fnlwgt'])


In [None]:
sns.boxplot(data.income,data['educational-num'])


In [None]:
sns.boxplot(data.income,data['capital-gain'])


In [None]:
sns.boxplot(data.income,data['capital-loss'])


In [None]:
sns.boxplot(data.income,data['hours-per-week'])


In [None]:
data.income.unique()

In [None]:
data.isnull().sum()

In [None]:
data=data.dropna()

In [None]:
data.income=data.income.replace(['<=50K', '>50K'],[0,1])

In [None]:
# The previous statement in this cell called Series.replace() with no
# arguments, which specifies no replacement at all. Show the label counts
# instead, so the 0/1 encoding of the target can be checked at a glance.
data.income.value_counts()

In [None]:
# Pairwise correlation of the numeric columns.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0, and select_dtypes works on every version.
plt.figure(figsize=(7,6))
sns.heatmap(data.select_dtypes(include='number').corr(),annot=True,cmap='Blues')

In [None]:
data.workclass.unique()

In [None]:
plt.figure(figsize=(15,7))
ax=sns.countplot(data.workclass,hue=data.income)
plt.legend(['<=50K', '>50K'])

In [None]:
data.education.unique()

In [None]:
plt.figure(figsize=(15,7))
ax=sns.countplot(data.education,hue=data.income)
ax.set_xticklabels(ax.get_xticklabels(),rotation=60)
plt.legend(['<=50K', '>50K'])

In [None]:
data['marital-status'].unique()

In [None]:
plt.figure(figsize=(15,7))
ax=sns.countplot(data['marital-status'],hue=data.income)
plt.legend(['<=50K', '>50K'])

In [None]:
data.occupation.unique()

In [None]:
plt.figure(figsize=(15,7))
ax=sns.countplot(data.occupation,hue=data.income)
ax.set_xticklabels(ax.get_xticklabels(),rotation=60)
plt.legend(['<=50K', '>50K'])

In [None]:
data.relationship.unique()

In [None]:
plt.figure(figsize=(15,7))
ax=sns.countplot(data.relationship,hue=data.income)
plt.legend(['<=50K', '>50K'])

In [None]:
data.gender.unique()

In [None]:
plt.figure(figsize=(15,7))
ax=sns.countplot(data.gender,hue=data.income)
plt.legend(['<=50K', '>50K'])

In [None]:
data['native-country'].unique()

In [None]:
plt.figure(figsize=(15,7))
ax=sns.countplot(data['native-country'],hue=data.income)
ax.set_xticklabels(ax.get_xticklabels(),rotation=60)
plt.legend(['<=50K', '>50K'])
plt.show()

# Data Preprocessing

In [None]:
x=data.drop(['income'],axis=1)

In [None]:
x.head()

In [None]:
numericalcols=list(data.select_dtypes(exclude='object').columns)
numericalcols.pop()
numericalcols

In [None]:
x=pd.get_dummies(x)

In [None]:
x.head()

In [None]:
data=data.drop(['fnlwgt'],axis=1)

In [None]:
numericalcols

In [None]:
from sklearn.preprocessing import StandardScaler
M=StandardScaler()
x[numericalcols]=M.fit_transform(x[numericalcols])

In [None]:
x.head()

In [None]:
y=data.income

In [None]:
x_train,x_test,y_trian,y_test=train_test_split(x,y,random_state=89,test_size=0.3)

# LogisticRegression

In [None]:
ModelL=LogisticRegression()

In [None]:
ModelL.fit(x_train,y_trian)

In [None]:
ModelL.score(x_test,y_test)

In [None]:
y_predictL=ModelL.predict(x_test)

In [None]:
accuracy_score(y_test,y_predictL)

In [None]:
ax=sns.distplot(y_test,hist=False,label='Actual Values')
ax=sns.distplot(y_predictL,hist=False,label='Predicted Values')
ax.set_title('LogisticRegression')
plt.legend()

In [None]:
confusion_matrix(y_test,y_predictL)

In [None]:
sns.heatmap(confusion_matrix(y_test,y_predictL),annot=True,cmap='Blues')

# SupportVectorClassifier

In [None]:
ModelS=SVC()

In [None]:
ModelS.fit(x_train,y_trian)
ModelS.score(x_test,y_test)

In [None]:
y_predicS=ModelS.predict(x_test)

In [None]:
accuracy_score(y_test,y_predicS)

In [None]:
ax=sns.distplot(y_test,hist=False,label='Actual Values')
ax=sns.distplot(y_predicS,hist=False,label='Predicted Values')
ax.set_title('SupportVectorClassifier')
plt.legend()

In [None]:
confusion_matrix(y_test,y_predicS)

In [None]:
sns.heatmap(confusion_matrix(y_test,y_predicS),annot=True,cmap='Blues')

# KNeighborsClassifier

In [None]:
ModelK=KNeighborsClassifier(n_neighbors=13)

In [None]:
ModelK.fit(x_train,y_trian)
ModelK.score(x_test,y_test)

In [None]:
ax=sns.distplot(y_test,hist=False,label='Actual Values')
ax=sns.distplot(ModelK.predict(x_test),hist=False,label='Predicted Values')
ax.set_title('KNeighborsClassifier')
plt.legend()

In [None]:
confusion_matrix(y_test,ModelK.predict(x_test))

In [None]:
sns.heatmap(confusion_matrix(y_test,ModelK.predict(x_test)),annot=True,cmap='Blues')

# RandomForestClassifier

In [None]:
ModelR=RandomForestClassifier()
ModelR.fit(x_train,y_trian)
ModelR.score(x_test,y_test)

In [None]:
ax=sns.distplot(y_test,hist=False,label='Actual Values')
ax=sns.distplot(ModelR.predict(x_test),hist=False,label='Predicted Values')
ax.set_title('RandomForestClassifier')
plt.legend()