In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer

In [4]:
# Read the CSV stored at data/adult.csv into a DataFrame
dataset_path = 'data/adult.csv'
data = pd.read_csv(dataset_path)

In [20]:
# Reserve 20% of the rows as a hold-out frame; the fixed seed keeps the split repeatable.
# NOTE(review): this cell's execution counter is out of sequence with its neighbours,
# so confirm which state of `data` the split is meant to capture.
split_frames = train_test_split(data, random_state=42, test_size=0.2)
train_df, test_df = split_frames

In [24]:
# Persist both halves of the split as CSV files, without writing the index column
for frame, csv_path in ((train_df, "data/train_df.csv"), (test_df, "data/test_df.csv")):
    frame.to_csv(csv_path, index=False)

In [5]:
# Preview the first five rows as a quick sanity check of the load
data.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Remove fully duplicated rows. `data` is rebound to the de-duplicated frame
# instead of being mutated with inplace=True, which keeps the step chainable
# and leaves the originally loaded object untouched.
data = data.drop_duplicates()


In [7]:
# For every column, report how many distinct values it holds and list them
for column_name in data.columns:
    column_values = data[column_name]
    print(column_name,":",column_values.nunique(),"\n",column_values.unique())
    

age : 73 
 [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]
workclass : 9 
 [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
fnlwgt : 21648 
 [ 77516  83311 215646 ...  34066  84661 257302]
education : 16 
 [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
education-num : 16 
 [13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]
marital-status : 7 
 [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
occupation : 15 
 [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' '

In [8]:
# Shape of the subset of rows whose workclass is the ' ?' placeholder
data.loc[data["workclass"].eq(" ?")].shape


(1836, 15)

In [9]:
# Shape of the subset of rows whose occupation is the ' ?' placeholder
data.loc[data["occupation"].eq(" ?")].shape

(1843, 15)

In [10]:
# Shape of the subset of rows whose country is the ' ?' placeholder
data.loc[data["country"].eq(" ?")].shape

(582, 15)

In [11]:
# Treat the ' ?' placeholder in workclass as missing, then fill the gaps with
# the most frequent remaining value (computed over every row currently in `data`)
workclass_with_nan = data["workclass"].replace(' ?', np.nan)
workclass_mode = workclass_with_nan.mode()[0]
data["workclass"] = workclass_with_nan.fillna(workclass_mode)

In [12]:
# Treat the ' ?' placeholder in occupation as missing, then fill the gaps with
# the most frequent remaining value (computed over every row currently in `data`)
occupation_with_nan = data["occupation"].replace(' ?', np.nan)
occupation_mode = occupation_with_nan.mode()[0]
data["occupation"] = occupation_with_nan.fillna(occupation_mode)

In [13]:
# Treat the ' ?' placeholder in country as missing, then fill the gaps with
# the most frequent remaining value (computed over every row currently in `data`)
country_with_nan = data["country"].replace(' ?', np.nan)
country_mode = country_with_nan.mode()[0]
data["country"] = country_with_nan.fillna(country_mode)

In [14]:
# Create the binary target variable (1 for >50K and 0 for <=50K)
def map_salary(salary_value):
    """Encode a raw salary label as a binary target.

    Parameters
    ----------
    salary_value : str or int
        A raw label such as ' >50K' or ' <=50K' (surrounding whitespace is
        ignored), or a value that has already been encoded as 0 or 1.

    Returns
    -------
    int
        1 when the stripped label equals '>50K'; 0 for any other string.
        An already-encoded 0 or 1 is returned unchanged.
    """
    # Already-encoded values pass through, so applying this function to the
    # salary column a second time is a no-op instead of failing on `.strip()`.
    if not isinstance(salary_value, str) and salary_value in (0, 1):
        return int(salary_value)
    return 1 if salary_value.strip() == '>50K' else 0
    
# Replace the textual salary labels with their 0/1 encoding
salary_encoded = data['salary'].apply(map_salary)
data['salary'] = salary_encoded

In [15]:
# Discard the columns that are not used as model features. `data` is rebound to
# the reduced frame instead of being mutated with inplace=True, which keeps the
# step chainable and makes the lineage of `data` explicit.
data = data.drop(columns=['fnlwgt','relationship','capital-gain','capital-loss'])

In [16]:
# Separate the target column from the feature columns
y = data['salary']
x = data.drop(columns=["salary"])

In [17]:
    # Partition the feature columns by dtype: numeric columns vs. everything else.
    # NOTE(review): these lines carry a leading indent at cell top level; presumably
    # the notebook front end tolerates it, but an exported script may not — consider dedenting.
    numeric_features = list(x.select_dtypes(include='number').columns)
    categorical_features = list(x.select_dtypes(exclude='number').columns)



In [19]:
# Display the list of non-numeric feature column names
categorical_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'race',
 'sex',
 'country']

In [None]:
# Standardize the numeric columns and one-hot encode the categorical ones;
# the ColumnTransformer routes each column list to its transformer.
numeric_transformer = StandardScaler()
# handle_unknown="ignore" encodes a category that was not present during fit as
# an all-zero block instead of raising, so the fitted preprocessor can be
# reused on held-out or new rows.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Build the model matrix from the feature columns of `data` rather than from
# `x` itself: once this cell has run, `x` holds the transformed matrix, so a
# self-referential `x = f(x)` would break if the cell were executed again.
# NOTE(review): the scaler/encoder are fitted on every row before the
# train/test split that follows, so held-out rows influence the fitted
# statistics; consider fitting on the training portion only (e.g. via a Pipeline).
x = preprocessor.fit_transform(data.drop(columns=["salary"]))

In [None]:
# Split the transformed features and the target into training and testing sets
# (20% held out; the fixed seed keeps the split repeatable)
split_parts = train_test_split(x, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


In [24]:
# 10-fold cross-validated score of a random forest on the training split.
# The forest is stochastic, so it is seeded (same seed as the data splits) to
# make the reported mean reproducible across runs.
Random_Forest = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(Random_Forest, X_train, y_train, cv=10)
print(f'Mean Accuracy: {cv_scores.mean()}')


Mean Accuracy: 0.8137080345122978


In [25]:
# 10-fold cross-validated score of logistic regression on the training split;
# the solver is allowed up to 1000 iterations
logistic_regression = LogisticRegression(max_iter=1000)
cv_scores = cross_val_score(
    estimator=logistic_regression,
    X=X_train,
    y=y_train,
    cv=10,
)
print(f'Mean Accuracy: {cv_scores.mean()}')

Mean Accuracy: 0.8330709289198917


In [26]:
# 10-fold cross-validated score of a single decision tree on the training split.
# Tree construction involves random feature permutation, so the estimator is
# seeded (same seed as the data splits) to make the reported mean reproducible.
Decision_tree = DecisionTreeClassifier(random_state=42)
cv_scores = cross_val_score(Decision_tree, X_train, y_train, cv=10)
print(f'Mean Accuracy: {cv_scores.mean()}')

Mean Accuracy: 0.783664446775922
