# Census Prediction

### Installing Dependencies

In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import f1_score
from pandas_profiling import ProfileReport
from sklearn import tree

## **Test Train Split**

In [None]:
data = pd.read_csv("Problem statement 1_dataset/adult.csv")
df_train = train_test_split(data, shuffle=False)[0]
df_test = train_test_split(data, shuffle=False)[1]

In [None]:
df_train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,24420.0,24420.0,24420.0,24420.0,24420.0,24420.0
mean,38.61294,189827.5,10.083702,1082.549918,86.40774,40.405487
std,13.694262,105447.5,2.558916,7432.957501,401.162846,12.318458
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117963.0,9.0,0.0,0.0,40.0
50%,37.0,178354.5,10.0,0.0,0.0,40.0
75%,48.0,237052.5,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [94]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
24420,43,4,129853,0,6,4,7,1,2,1,0,0,40,38,0
24421,50,4,99925,11,9,2,1,0,4,1,0,0,32,38,0
24422,58,4,227800,3,2,5,5,1,2,1,0,0,50,38,0
24423,55,7,111130,7,12,0,1,3,1,1,0,0,40,38,0
24424,29,4,100764,9,13,2,4,0,4,1,0,0,45,38,1


### Cleaning the train data

In [None]:
df_train = df_train.replace(to_replace = '%?%', value = np.nan)
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
nan = df_train.columns[df_train.isnull().any()]
df_train[nan].isnull().sum()

Series([], dtype: float64)

In [None]:
df_test.dropna()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
24420,43,Private,129853,10th,6,Never-married,Machine-op-inspct,Not-in-family,Black,Male,0,0,40,United-States,<=50K
24421,50,Private,99925,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,32,United-States,<=50K
24422,58,Private,227800,1st-4th,2,Separated,Farming-fishing,Not-in-family,Black,Male,0,0,50,United-States,<=50K
24423,55,State-gov,111130,Assoc-acdm,12,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,0,0,40,United-States,<=50K
24424,29,Private,100764,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## ***No Null Values***

### **taking categorical and non categorical categories and label encoding the dataset**

In [None]:
features = list(df_train.columns)
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'country',
 'salary']

In [None]:
categorical_features = list(df_train.select_dtypes(include=['object']).columns)
numerical_features = list(df_train.select_dtypes(include=['int', 'float']).columns)

In [None]:
labelEncode = preprocessing.LabelEncoder()
for i in categorical_features:
    df_train[i] = labelEncode.fit_transform(df_train[i])
    df_test[i] = labelEncode.fit_transform(df_test[i])

In [None]:
df_train.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23,0
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39,1
8,31,4,45781,12,14,4,10,1,4,0,14084,0,50,39,1
9,42,4,159449,9,13,2,4,0,4,1,5178,0,40,39,1


In [None]:
df_test.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
24420,43,4,129853,0,6,4,7,1,2,1,0,0,40,38,0
24421,50,4,99925,11,9,2,1,0,4,1,0,0,32,38,0
24422,58,4,227800,3,2,5,5,1,2,1,0,0,50,38,0
24423,55,7,111130,7,12,0,1,3,1,1,0,0,40,38,0
24424,29,4,100764,9,13,2,4,0,4,1,0,0,45,38,1
24425,47,4,275095,15,10,0,7,1,4,0,0,0,40,38,0
24426,39,4,147500,11,9,2,10,5,2,0,0,0,40,38,0
24427,63,2,150079,11,9,2,1,5,4,0,0,0,35,38,1
24428,27,4,140863,11,9,4,14,1,4,1,0,0,60,38,0
24429,62,0,199198,1,7,0,0,1,2,0,0,0,40,38,0


# **Logistic Regression**

In [None]:
# train and test dataset without the salary
x_train = df_train.iloc[:,0:len(df_train.columns)-1]
x_test = df_test.iloc[:,0:len(df_test.columns)-1]

# train and test salary only
y_train = df_train.iloc[:,-1]
y_test = df_test.iloc[:,-1]

y_train

0        0
1        0
2        0
3        0
4        0
        ..
24415    1
24416    0
24417    0
24418    0
24419    1
Name: salary, Length: 24420, dtype: int64

In [None]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
lr.score(x_test, y_test)

0.7900749293698562

## Logistic Regression = 79% accuracy

> Indented block



## **KNN**

In [None]:
knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(x_train, y_train)


In [None]:
knn.score(x_test, y_test)

0.7957253408672154

## KNN = 80%

## **Decision Tree:**

In [None]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt.score(x_test, y_test)


0.8162387913032797

### Decision Tree = 81%

## **Random Forest**

In [None]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc.score(x_test, y_test)

0.8578798673381648

## Random Forest = 85%