### Import the libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

### Read the data

In [None]:
# Colab-only step: attach Google Drive to the runtime's filesystem so the
# dataset stored there can be read by path.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the Adult CSV inside the mounted Google Drive.
adult_csv_path = '/content/drive/My Drive/Projects_shai/Projects data/adult.csv'
df_adult = pd.read_csv(adult_csv_path)

In [None]:
# (rows, columns) of the frame; at this point no cleaning has been applied yet.
df_adult.shape

(48842, 15)

In [None]:
# Preview the first five rows; the Styler hides the index so only the
# data columns are rendered.
df_adult.head(5).style.hide(axis="index")

age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Remove fully duplicated rows. Rebinding the name (instead of mutating in
# place) keeps the cell safe to re-run; the original index labels are kept.
df_adult = df_adult.drop_duplicates()

### Number of rows and columns


In [None]:
# (rows, columns) after duplicate rows were dropped; compare with the shape
# shown right after loading to see how many rows were removed.
df_adult.shape

(48813, 15)

### Information about the data set

In [None]:
# Per-column dtype and non-null count; columns whose non-null count is below
# the row count contain missing values.
df_adult.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48813 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48813 non-null  int64 
 1   workclass       47850 non-null  object
 2   fnlwgt          48813 non-null  int64 
 3   education       48813 non-null  object
 4   education-num   48813 non-null  int64 
 5   marital-status  48813 non-null  object
 6   occupation      47847 non-null  object
 7   relationship    48813 non-null  object
 8   race            48813 non-null  object
 9   sex             48813 non-null  object
 10  capital-gain    48813 non-null  int64 
 11  capital-loss    48813 non-null  int64 
 12  hours-per-week  48813 non-null  int64 
 13  native-country  48539 non-null  object
 14  income          48813 non-null  object
dtypes: int64(6), object(9)
memory usage: 6.0+ MB


In [None]:
# Count how many rows carry each distinct label of the target column.
df_adult['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,24698
<=50K.,12430
>50K,7839
>50K.,3846


### Remove the trailing periods from the income labels

In [None]:
# Some income labels carry a trailing period; map each of them onto its
# period-free spelling so that only two classes remain.
income_label_fixes = {
    '<=50K.': '<=50K',
    '>50K.': '>50K',
}
df_adult['income'] = df_adult['income'].replace(income_label_fixes)

### Check after cleaning

In [None]:
# Re-count the target labels; only '<=50K' and '>50K' should remain once the
# trailing-period variants have been merged.
df_adult['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,37128
>50K,11685


### Check for missing values

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df_adult.isna().sum()

Unnamed: 0,0
age,0
workclass,963
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,966
relationship,0
race,0
sex,0


### Convert the categorical variables to numeric dummy variables

In [None]:
# One-hot encode every categorical column. drop_first removes one level per
# column, so the two-class target ends up as the single column "income_>50K".
#
# get_dummies ignores NaN by default, so a row with a missing workclass /
# occupation / native-country would be encoded as all zeros -- exactly the
# same encoding as the level removed by drop_first. Label the gaps explicitly
# first so that "missing" stays distinguishable from that baseline level.
# The target column is excluded from the fill so no extra income_* column
# can ever be created from it.
categorical_features = (
    df_adult.select_dtypes(exclude='number').columns.difference(['income'])
)
df_adult_filled = df_adult.copy()  # leave df_adult untouched so the cell is re-runnable
df_adult_filled[categorical_features] = (
    df_adult_filled[categorical_features].fillna('Unknown')
)
data_encoded = pd.get_dummies(df_adult_filled, drop_first=True)

### Split the data into features and target

In [None]:
# The encoded target column is the label; every other column is a feature.
target_column = "income_>50K"
y = data_encoded[target_column]
x = data_encoded.drop(columns=target_column)

### Split the data into training and test sets

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
# The target is imbalanced (roughly three "<=50K" rows for every ">50K" row),
# so stratify on y to keep the same class ratio in both partitions; otherwise
# the majority-class baseline of the test set drifts with the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

### Train Naive Bayes model

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so creation and fitting are chained).
nb_model = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out rows.
nb_predictions = nb_model.predict(X_test)

# Share of held-out rows whose predicted label matches the true label.
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_predictions)
print(f'Naive Bayes Accuracy : {nb_accuracy:.2f}')


Naive Bayes Accuracy : 0.79


### Train Decision Tree model

In [None]:
# Fit a decision tree on the training split; the fixed random_state makes the
# fitted tree reproducible between runs.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
dt_predictions = dt_model.predict(X_test)

# Share of held-out rows whose predicted label matches the true label.
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_predictions)
print(f'Decision Tree Accuracy : {dt_accuracy:.2f}')


Decision Tree Accuracy : 0.81
