# Predicting Income using U.S. Census Data and classification algorithms

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statistics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
def txt_to_csv(filename):
    """Load a comma-separated census text file into a DataFrame.

    Reads ``filename + ".txt"`` line by line, splits every line on ``","``
    and stores the pieces under the fifteen census column names. Fields are
    kept exactly as they appear in the file: surrounding spaces and the
    newline at the end of the last field are NOT stripped.

    Parameters
    ----------
    filename : str
        Path of the input file without the ``.txt`` extension.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, every value stored as a string.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly 15 fields.
    """
    columns = ["age", "workclass", "fnlgt", "education", "education-num", "marital-status",
               "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
               "hours-per-week", "native-country", "target"]

    # Collect rows in a plain list and build the frame once: growing a
    # DataFrame with df.loc[i] = ... re-allocates on every row and makes the
    # conversion quadratic in the number of lines.
    rows = []
    with open(filename + ".txt", "r") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Blank (e.g. trailing) lines carry no record.
                continue
            fields = line.split(",")
            if len(fields) != len(columns):
                raise ValueError(
                    "line %d of %s.txt has %d fields, expected %d"
                    % (line_number, filename, len(fields), len(columns))
                )
            rows.append(fields)

    return pd.DataFrame(rows, columns=columns)

In [None]:
#transform data to csv (takes ~40mins)
#filename = "adult_data"
#df = txt_to_csv(filename)
#df.to_csv("train.csv")
#filename = "adult_test"
#df = txt_to_csv(filename)
#df.to_csv("test.csv")

In [None]:
# Load the training CSV and discard the "Unnamed: 0" column (presumably the
# index that DataFrame.to_csv stored alongside the data), then preview it.
df = pd.read_csv("train.csv").drop(columns=["Unnamed: 0"])
df.head()

# FIRST LOOK AT DATA

In [None]:
# Frequency tables for the columns explored below. value_counts() returns a
# Series indexed by the distinct values, ordered from most to least frequent.
age = df["age"].value_counts()
sex = df["sex"].value_counts()
target = df["target"].value_counts()
workclass = df["workclass"].value_counts()
education = df["education"].value_counts()
education_num = df["education-num"].value_counts()
race = df["race"].value_counts()
hours_per_week = df['hours-per-week'].value_counts()

In [None]:
# Horizontal bar chart: number of rows per education-num value.
plt.figure(figsize=(11, 6))
ticks = education_num.index.to_numpy()
plt.barh(range(len(ticks)), education_num.to_numpy(), align='center', color='#3f8dba')
plt.yticks(range(len(ticks)), ticks)
plt.title('Years In Education distribution', fontsize=25)
plt.xlabel('# of instances', fontsize=20)
plt.ylabel('education_num', fontsize=20, rotation=90)

In [None]:
# Horizontal bar chart: number of rows per education level.
plt.figure(figsize=(11, 6))
ticks = education.index.to_numpy()
plt.barh(range(len(ticks)), education.to_numpy(), align='center', color='#3f8dba')
plt.yticks(range(len(ticks)), ticks)
plt.title('Level of Education distribution', fontsize=25)
plt.xlabel('# of instances', fontsize=20)
plt.ylabel('education', fontsize=20, rotation=90)

In [None]:
# Horizontal bar chart: number of rows per race value.
plt.figure(figsize=(11, 6))
ticks = race.index.to_numpy()
plt.barh(range(len(ticks)), race.to_numpy(), align='center', color='#3f8dba')
plt.yticks(range(len(ticks)), ticks)
plt.title('Race distribution', fontsize=25)
plt.xlabel('# of instances', fontsize=20)
plt.ylabel('race', fontsize=20, rotation=90)

In [None]:
# Horizontal bar chart: number of rows per workclass value.
plt.figure(figsize=(11, 6))
ticks = workclass.index.to_numpy()
plt.barh(range(len(ticks)), workclass.to_numpy(), align='center', color='#3f8dba')
plt.yticks(range(len(ticks)), ticks)
plt.title('Workclass distribution', fontsize=25)
plt.xlabel('# of instances', fontsize=20)
plt.ylabel('workclass', fontsize=20, rotation=90)

In [None]:
# Bar chart of the two income groups.
# NOTE(review): the tick labels are hard-coded; they assume value_counts()
# (most frequent first) lists the under-50K group first — confirm on the data.
plt.figure(figsize=(11,6))
plt.bar(np.arange(2), target, align='center', color='#3f8dba')
plt.xlabel('Income Groups',fontsize = 20)
plt.ylabel('# of instances', fontsize = 20)
plt.xticks(np.arange(2), ('Under 50k','Over 50k' ))
plt.title('Income groups distribution', fontsize = 25)
plt.show()

In [None]:
# Scatter of how many rows exist for each age value (red dots).
plt.figure(figsize=(11,6))
plt.plot(age.index, age, 'ro')
plt.xlabel("Age",fontsize = 20)
plt.ylabel("# of instances",fontsize = 20)
plt.title("Age Distribution", fontsize = 25)

# plt.show must be called; the bare attribute reference was a no-op.
plt.show()

In [None]:
# Bar chart of the two sex categories.
# NOTE(review): the tick labels are hard-coded; they assume value_counts()
# (most frequent first) lists 'Male' first — confirm on the data.
plt.figure(figsize=(11,6))
p1 = plt.bar(np.arange(2), sex, align='center', color='#3f8dba')
plt.xlabel("Sex",fontsize = 20)
plt.ylabel("# of instances",fontsize = 20)
plt.xticks(np.arange(2), ('Male','Female'))
plt.title('Gender Distribution',fontsize = 25)
plt.show()

# PREPARING DATA FOR TRAINING

In [None]:
# Reload the training data and keep only the three numeric features used by
# every model in this notebook.
df = pd.read_csv("train.csv")
features = ['age', 'education-num', 'hours-per-week']
X_train = df.loc[:, features].values
# Labels as a 1-D vector of shape (n_samples,): scikit-learn estimators expect
# that shape and emit a DataConversionWarning for an (n_samples, 1) column.
y_train = df['target'].values

In [None]:
# Load the held-out test data with the same feature columns as the training set.
df = pd.read_csv("test.csv")
X_test = df.loc[:, features].values
# 1-D label vector (n_samples,), the shape scikit-learn metrics and
# estimators expect.
y_test = df['target'].values

In [None]:
# Display the raw (still string-valued) test labels.
y_test

In [None]:
# Preview the feature matrix; note that this rebinds `df` to a new frame.
df = pd.DataFrame(data = X_train)
df.head()

In [None]:
# Shuffle features and labels with one shared permutation so rows stay
# aligned. The permutation length comes from the data instead of a hard-coded
# row count, so the cell keeps working if train.csv changes size.
shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [None]:
# Preview the feature matrix again, now after shuffling.
df = pd.DataFrame(data = X_train)
df.head()

In [None]:
def _is_over_50k(labels):
    """Return a boolean array (same shape as *labels*): True where the label is '>50K'.

    Surrounding whitespace / line endings and a trailing '.' are ignored, so
    ' >50K\r\n', ' >50K\n' and ' >50K.' are all recognised. An exact match on
    ' >50K\r\n' silently produces all-False labels whenever the CSV was
    written with a different line ending or punctuation.
    NOTE(review): the trailing-period variant is assumed from the public UCI
    test split — confirm against test.csv.
    """
    cleaned = np.char.strip(np.asarray(labels).astype(str))
    cleaned = np.char.rstrip(cleaned, ".")
    return cleaned == ">50K"


# Binary targets: True means income above 50K.
y_train = _is_over_50k(y_train)
y_test = _is_over_50k(y_test)

In [None]:
# Display the boolean training labels.
y_train

In [None]:
# Display the boolean test labels.
y_test

In [None]:
# One training row (age, education-num, hours-per-week) reused below as a
# sample input for single predictions.
some_citizen = X_train[2] 

# SGD CLASSIFIER

In [None]:
# Standardized copy of the training features for the SGD classifier.
# The fitted scaler object itself is not kept.
X_train_sgd = StandardScaler().fit_transform(X_train)

In [None]:
# Linear classifier trained with SGD on the standardized features;
# random_state is fixed for reproducibility.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train_sgd, y_train)

In [None]:
# sgd_clf was trained on standardized features, so the sample has to be
# scaled with the training-set statistics before prediction; feeding the raw
# row would put it on a completely different scale than the model expects.
sgd_clf.predict(StandardScaler().fit(X_train).transform([some_citizen]))

In [None]:
# 2-fold cross-validated accuracy of the SGD classifier on the scaled training data.
cross_val_score(sgd_clf, X_train_sgd, y_train, cv = 2, scoring = 'accuracy')

In [None]:
# Apply the training-set standardization to the test features before
# predicting: the classifier was fitted on scaled inputs, so evaluating it on
# raw X_test would give a meaningless accuracy.
X_test_sgd = StandardScaler().fit(X_train).transform(X_test)
y_pred = sgd_clf.predict(X_test_sgd)
accuracy_score(y_test, y_pred)

# LOGISTIC REGRESSION

### Training on each of the features independently

#### AGE

In [None]:
# Age is column 0 of X_train; slicing with 0:1 keeps it as an
# (n_samples, 1) matrix, the 2-D shape scikit-learn expects.
X_age = X_train[:, 0:1]
X_age

In [None]:
# Logistic regression using age as the only feature.
log_reg = LogisticRegression()
log_reg.fit(X_age, y_train)

In [None]:
# Class probabilities for the sample citizen's age (feature index 0).
log_reg.predict_proba([[ some_citizen[0] ]])

In [None]:
# 60 evenly spaced ages spanning the observed range, as a column vector.
X_new = np.linspace(X_age.min(), X_age.max(), 60).reshape(-1, 1)

In [None]:
# Probability curves over the age grid. predict_proba orders its columns like
# log_reg.classes_ (sorted labels): with boolean targets column 0 is
# False (income <= 50K) and column 1 is True (income > 50K). The legend
# labels are attached accordingly (they were previously swapped).
y_proba = log_reg.predict_proba(X_new)
plt.figure(figsize=(11,6))
plt.plot(X_new, y_proba[:, 1], "g-", linewidth=2 , label = "over 50K")
plt.plot(X_new, y_proba[:, 0], "b", linewidth=2, label="under 50K")
plt.legend(loc="center left", fontsize=14)
plt.ylabel("Probability",fontsize = 20)
plt.xlabel("Age", fontsize = 20)
plt.title("Probability of belonging to income group by age", fontsize = 18)
plt.show()

#### YEARS IN EDUCATION

In [None]:
# education-num is column 1 of X_train; the 1:2 slice keeps it 2-D
# with shape (n_samples, 1).
X_edu = X_train[:, 1:2]
X_edu

In [None]:
# Logistic regression using years in education as the only feature.
log_reg = LogisticRegression()
log_reg.fit(X_edu, y_train)

In [None]:
# This model was fitted on education-num (feature index 1), so query it with
# the citizen's education value rather than the age at index 0.
log_reg.predict_proba([[ some_citizen[1] ]])

In [None]:
# 15 evenly spaced education-num values spanning the observed range, as a column vector.
X_new = np.linspace(X_edu.min(), X_edu.max(), 15).reshape(-1, 1)

In [None]:
# Probability curves over the education grid. predict_proba orders its columns
# like log_reg.classes_ (sorted labels): with boolean targets column 0 is
# False (income <= 50K) and column 1 is True (income > 50K). The legend
# labels are attached accordingly (they were previously swapped).
y_proba = log_reg.predict_proba(X_new)
plt.figure(figsize=(11,6))
plt.plot(X_new, y_proba[:, 1], "g-", linewidth=2 , label = "over 50K")
plt.plot(X_new, y_proba[:, 0], "b", linewidth=2, label="under 50K")
plt.legend(loc="center left", fontsize=14)
plt.ylabel("Probability",fontsize = 20)
plt.xlabel("Years in Education", fontsize = 20)
plt.title("Probability of belonging to income \n group by years in education", fontsize = 18)
plt.show()

#### HOURS PER WEEK

In [None]:
# hours-per-week is column 2 of X_train; the 2:3 slice keeps it 2-D
# with shape (n_samples, 1). The last line shows the largest value.
X_hpw = X_train[:, 2:3]
max(X_hpw)

In [None]:
# Logistic regression using hours worked per week as the only feature.
log_reg = LogisticRegression()
log_reg.fit(X_hpw, y_train)

In [None]:
# This model was fitted on hours-per-week (feature index 2), so query it with
# the citizen's weekly hours rather than the age at index 0.
log_reg.predict_proba([[ some_citizen[2] ]])

In [None]:
# 98 evenly spaced hours-per-week values spanning the observed range, as a column vector.
X_new = np.linspace(X_hpw.min(), X_hpw.max(), 98).reshape(-1, 1)

In [None]:
# Probability curves over the hours-per-week grid. predict_proba orders its
# columns like log_reg.classes_ (sorted labels): with boolean targets column 0
# is False (income <= 50K) and column 1 is True (income > 50K). The legend
# labels are attached accordingly (they were previously swapped).
y_proba = log_reg.predict_proba(X_new)
plt.figure(figsize=(11,6))

plt.plot(X_new, y_proba[:, 1], "g-", linewidth=2 , label = "over 50K")
plt.plot(X_new, y_proba[:, 0], "b", linewidth=2, label="under 50K")
plt.legend(loc="center left", fontsize=14)
plt.ylabel("Probability",fontsize = 20)
plt.xlabel("hours per week", fontsize = 20)
plt.title("Probability of belonging to income group \nby hours worked per week", fontsize = 18)
plt.show()

### Training on all 3 features

In [None]:
# Logistic regression on all three features (age, education-num, hours-per-week).
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
# Class probabilities for the sample citizen from the three-feature model.
y_proba = log_reg.predict_proba([some_citizen])
y_proba

In [None]:
# Per-fold accuracy from 2-fold cross-validation on the training set.
cross_val_score(log_reg, X_train, y_train, cv = 2, scoring = 'accuracy')

In [None]:
# Mean of the two cross-validation fold accuracies.
print (statistics.mean(cross_val_score(log_reg, X_train, y_train, cv=2, scoring = 'accuracy')))

In [None]:
# Accuracy of the three-feature logistic regression on the test set.
y_pred = log_reg.predict(X_test)
accuracy_score(y_test, y_pred)

# DECISION TREES


In [None]:
# Shallow decision tree (at most two levels of splits) on all three features.
tree_clf = DecisionTreeClassifier(max_depth = 2)
tree_clf.fit(X_train, y_train)

In [None]:
# Predicted income class for the sample citizen.
tree_clf.predict([some_citizen])

In [None]:
# Sweep max_depth from 2 to 10 and record the cross-validated accuracy of each
# tree. `ran` (depths) and `acc` (scores) are plotted in the next cell.
ran = list(range(2, 11))
acc = []
for depth in ran:
    candidate = DecisionTreeClassifier(max_depth=depth)
    # cross_val_score clones and fits the estimator itself, so no separate
    # fit is needed. Average over both folds instead of keeping only the
    # first one, which made the curve depend on a single split.
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=2, scoring='accuracy')
    acc.append(fold_scores.mean())

In [None]:

# Line plot of the accuracy recorded for each max_depth value.
plt.figure(figsize=(11,6))
#plt.axis([1.5,10.5,0.75,0.81])
plt.plot(ran, acc)
plt.xlabel("max depth", fontsize = 20)
plt.ylabel("accuracy",  fontsize = 20)
plt.title("Cross-validation accuracy in regards to depth of Decision Tree", fontsize = 25)

In [None]:
# Refit the decision tree with max_depth fixed at 8.
tree_clf = DecisionTreeClassifier(max_depth = 8)
tree_clf.fit(X_train, y_train)

In [None]:
# Per-fold accuracy of the depth-8 tree (2-fold cross-validation).
cross_val_score(tree_clf, X_train, y_train, cv = 2, scoring = 'accuracy')

In [None]:
# Mean of the two cross-validation fold accuracies for the depth-8 tree.
print (statistics.mean(cross_val_score(tree_clf, X_train, y_train, cv=2, scoring = 'accuracy')))

In [None]:
# Test-set predictions of the depth-8 tree.
y_pred = tree_clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted class matches the true label.
accuracy_score(y_test, y_pred)

# VOTING CLASSIFIERS

In [None]:
# Hard-voting ensemble: logistic regression, random forest and SVM each cast
# one vote and the majority class wins.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
)
voting_clf.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the voting ensemble.
y_pred = voting_clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Per-fold accuracy of the voting ensemble (2-fold cross-validation).
cross_val_score(voting_clf, X_train, y_train, cv = 2, scoring = 'accuracy')

# BAGGING

In [None]:
# Bagging: 500 decision trees, each trained on 100 rows drawn with
# replacement (bootstrap=True); n_jobs=-1 uses all available CPU cores.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
# Test-set predictions (computed again in a later cell before scoring).
y_pred = bag_clf.predict(X_test)

In [None]:
# Mean 2-fold cross-validation accuracy of the bagging ensemble.
print (statistics.mean(cross_val_score(bag_clf, X_train, y_train, cv=2, scoring = 'accuracy')))

In [None]:
# Test-set accuracy of the bagging ensemble.
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

# RANDOM FOREST

In [None]:
# Random forest with scikit-learn's default hyperparameters.
rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)

In [None]:
# Mean 2-fold cross-validation accuracy of the random forest.
print (statistics.mean(cross_val_score(rnd_clf, X_train, y_train, cv=2, scoring = 'accuracy')))

In [None]:
# Test-set accuracy of the random forest.
y_pred = rnd_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

# ADABOOST

In [None]:
# AdaBoost over 200 decision stumps (max_depth=1) with learning rate 0.5.
# NOTE(review): newer scikit-learn releases deprecate algorithm="SAMME.R" —
# confirm against the installed version.
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)

In [None]:
# Mean 2-fold cross-validation accuracy of the AdaBoost ensemble.
print (statistics.mean(cross_val_score(ada_clf, X_train, y_train, cv=2, scoring = 'accuracy')))

In [None]:
# Test-set accuracy of the AdaBoost ensemble.
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

# GRADIENT BOOSTING

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, y_train)

In [None]:
print (statistics.mean(cross_val_score(ada_clf, X_train, y_train, cv=2, scoring = 'accuracy')))

In [None]:
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))