# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
#To ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [4]:
pd.set_option('display.max_columns' , None)
pd.set_option('display.max_rows' , None)
pd.set_option('display.width' , None)

# Import Training Dataset

In [5]:
df = pd.read_csv(r'C:\Users\Hp\Desktop\data science\Python\Datasets\Processed_adult.csv' , index_col = 0)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


# EDA

In [6]:
print(df.dtypes) 

age               int64
workclass         int64
fnlwgt            int64
education         int64
education_num     int64
marital_status    int64
occupation        int64
relationship      int64
race              int64
sex               int64
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native_country    int64
income            int64
dtype: object


In [7]:
print(df.shape)

(32561, 15)


In [9]:
print(df.describe())

                age     workclass        fnlwgt     education  education_num  \
count  32561.000000  32561.000000  3.256100e+04  32561.000000   32561.000000   
mean      38.581647      3.094438  1.897784e+05     10.298210      10.080679   
std       13.640433      1.107194  1.055500e+05      3.870264       2.572720   
min       17.000000      0.000000  1.228500e+04      0.000000       1.000000   
25%       28.000000      3.000000  1.178270e+05      9.000000       9.000000   
50%       37.000000      3.000000  1.783560e+05     11.000000      10.000000   
75%       48.000000      3.000000  2.370510e+05     12.000000      12.000000   
max       90.000000      7.000000  1.484705e+06     15.000000      16.000000   

       marital_status    occupation  relationship          race           sex  \
count    32561.000000  32561.000000  32561.000000  32561.000000  32561.000000   
mean         2.611836      6.138755      1.446362      3.665858      0.669205   
std          1.506222      3.972708 

# Missing value check

In [10]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

# Seperate X and Y

In [14]:
X = df.values[: , :-1]
Y = df.values[: , -1]

# Train test split

In [15]:
from sklearn.model_selection import train_test_split



X_train ,X_test, Y_train, Y_test = train_test_split(X, Y , test_size =0.3 , random_state =10)


# Scaling

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train) 

X_test = scaler.transform(X_test) 
print(X)


[[    39      6  77516 ...      0     40     38]
 [    50      5  83311 ...      0     13     38]
 [    38      3 215646 ...      0     40     38]
 ...
 [    58      3 151910 ...      0     40     38]
 [    22      3 201490 ...      0     20     38]
 [    52      4 287927 ...      0     40     38]]


# Applying PCA

In [17]:

from sklearn.decomposition import PCA
pca = PCA(n_components= None)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
exp_variance = pca.explained_variance_ratio_
print(exp_variance)


[0.14822853 0.10132163 0.08087826 0.07791618 0.07439686 0.07293743
 0.07010901 0.06793779 0.06479797 0.06152477 0.06062523 0.04866873
 0.04298162 0.02767599]


# Tunning n_components parameter

In [18]:
#applying PCA

from sklearn.decomposition import PCA
pca = PCA(n_components= 0.75) #try to work around 0.75 - 0.95
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
exp_variance = pca.explained_variance_ratio_
print(exp_variance)


[0.14822853 0.10132163 0.08087826 0.07791618 0.07439686 0.07293743
 0.07010901 0.06793779 0.06479797]


In [19]:
pca.n_components_

9

# Model building and prediction

In [20]:
from sklearn.linear_model import LogisticRegression
#create a model object
classifier = LogisticRegression()
#train the model object
classifier.fit(X_train, Y_train)

Y_pred =classifier.predict(X_test)



# Accuracy

In [21]:
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report

cfm = confusion_matrix(Y_test , Y_pred)
print(cfm)
print("Classification report :")

print(classification_report(Y_test , Y_pred))

acc = accuracy_score(Y_test , Y_pred)
print("Accuracy of the model:" , acc)

[[7025  398]
 [1344 1002]]
Classification report :
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      7423
           1       0.72      0.43      0.53      2346

    accuracy                           0.82      9769
   macro avg       0.78      0.69      0.71      9769
weighted avg       0.81      0.82      0.80      9769

Accuracy of the model: 0.8216808271061521


# Tunning n_components parameter

In [22]:
#applying PCA
#n_components can take 2 type of value - <1 [% of variance] , >1 [no of components to be retained]
from sklearn.decomposition import PCA
pca = PCA(n_components= 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
exp_variance = pca.explained_variance_ratio_
print(exp_variance)


[0.19541715 0.13357741]


# Model building and prediction

In [23]:
from sklearn.linear_model import LogisticRegression
#create a model object
classifier = LogisticRegression()
#train the model object
classifier.fit(X_train, Y_train)

Y_pred =classifier.predict(X_test)



# Accuracy

In [24]:
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report

cfm = confusion_matrix(Y_test , Y_pred)
print(cfm)
print("Classification report :")

print(classification_report(Y_test , Y_pred))

acc = accuracy_score(Y_test , Y_pred)
print("Accuracy of the model:" , acc)

[[6991  432]
 [1522  824]]
Classification report :
              precision    recall  f1-score   support

           0       0.82      0.94      0.88      7423
           1       0.66      0.35      0.46      2346

    accuracy                           0.80      9769
   macro avg       0.74      0.65      0.67      9769
weighted avg       0.78      0.80      0.78      9769

Accuracy of the model: 0.7999795270754427
