In [47]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


In [3]:
data = pd.read_csv('data/adult.csv', header=None, skipinitialspace=True)

In [4]:
data.columns = ['age','workclass','fnlwgt','education','educationNum','marital',
           'occupation','relationship','race','sex','capGain','capLoss',
          'HoursWeek','NativeCountry','category']


In [5]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,marital,occupation,relationship,race,sex,capGain,capLoss,HoursWeek,NativeCountry,category
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
data.loc[data.category == '<=50K', 'category'] = 0
data.loc[data.category == '>50K', 'category'] = 1

In [7]:
data.category.head()

0    0
1    0
2    0
3    0
4    0
Name: category, dtype: int64

In [8]:
data.category.tail()

32556    0
32557    1
32558    0
32559    0
32560    1
Name: category, dtype: int64

In [9]:
# 7406609000 whatsapp of Ashok

In [10]:
data.category

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        1
8        1
9        1
10       1
11       1
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       1
20       1
21       0
22       0
23       0
24       0
25       1
26       0
27       1
28       0
29       0
        ..
32531    0
32532    1
32533    1
32534    0
32535    0
32536    1
32537    0
32538    1
32539    1
32540    0
32541    0
32542    0
32543    0
32544    0
32545    1
32546    0
32547    0
32548    0
32549    0
32550    0
32551    0
32552    0
32553    0
32554    1
32555    0
32556    0
32557    1
32558    0
32559    0
32560    1
Name: category, Length: 32561, dtype: int64

In [11]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'marital',
       'occupation', 'relationship', 'race', 'sex', 'capGain', 'capLoss',
       'HoursWeek', 'NativeCountry', 'category'],
      dtype='object')

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age              32561 non-null int64
workclass        32561 non-null object
fnlwgt           32561 non-null int64
education        32561 non-null object
educationNum     32561 non-null int64
marital          32561 non-null object
occupation       32561 non-null object
relationship     32561 non-null object
race             32561 non-null object
sex              32561 non-null object
capGain          32561 non-null int64
capLoss          32561 non-null int64
HoursWeek        32561 non-null int64
NativeCountry    32561 non-null object
category         32561 non-null int64
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [13]:
data.replace(['$', '?', '*', '-'], np.nan, inplace=True)

In [14]:
object_types = data.dtypes[data.dtypes == 'object']

In [15]:

for i in object_types.index:
    print(i)
    print(set(data[i]))
    # data[i] = labelencoder.fit_transform(data[i])

workclass
{'Self-emp-not-inc', nan, 'Local-gov', 'Never-worked', 'Self-emp-inc', 'Federal-gov', 'Without-pay', 'State-gov', 'Private'}
education
{'Doctorate', 'Bachelors', 'HS-grad', '1st-4th', 'Some-college', 'Prof-school', '10th', '9th', 'Preschool', '5th-6th', 'Assoc-acdm', 'Masters', '12th', 'Assoc-voc', '11th', '7th-8th'}
marital
{'Married-civ-spouse', 'Separated', 'Widowed', 'Never-married', 'Married-AF-spouse', 'Divorced', 'Married-spouse-absent'}
occupation
{nan, 'Adm-clerical', 'Exec-managerial', 'Handlers-cleaners', 'Machine-op-inspct', 'Protective-serv', 'Transport-moving', 'Farming-fishing', 'Craft-repair', 'Prof-specialty', 'Priv-house-serv', 'Armed-Forces', 'Other-service', 'Tech-support', 'Sales'}
relationship
{'Husband', 'Unmarried', 'Own-child', 'Not-in-family', 'Wife', 'Other-relative'}
race
{'Asian-Pac-Islander', 'Other', 'Black', 'White', 'Amer-Indian-Eskimo'}
sex
{'Female', 'Male'}
NativeCountry
{nan, 'Italy', 'Mexico', 'Poland', 'Thailand', 'Canada', 'China', 'Tri

In [16]:
all_nulls = data.isnull().sum()
all_nulls

age                 0
workclass        1836
fnlwgt              0
education           0
educationNum        0
marital             0
occupation       1843
relationship        0
race                0
sex                 0
capGain             0
capLoss             0
HoursWeek           0
NativeCountry     583
category            0
dtype: int64

In [17]:
all_not_nulls = data.notnull().sum()
all_not_nulls

age              32561
workclass        30725
fnlwgt           32561
education        32561
educationNum     32561
marital          32561
occupation       30718
relationship     32561
race             32561
sex              32561
capGain          32561
capLoss          32561
HoursWeek        32561
NativeCountry    31978
category         32561
dtype: int64

In [18]:
def get_null_percentage(data):
    all_nulls = data.isnull().sum()
    all_not_nulls = data.notnull().sum()
    return all_nulls / all_not_nulls * 100

all_nulls / all_not_nulls * 100

age              0.000000
workclass        5.975590
fnlwgt           0.000000
education        0.000000
educationNum     0.000000
marital          0.000000
occupation       5.999740
relationship     0.000000
race             0.000000
sex              0.000000
capGain          0.000000
capLoss          0.000000
HoursWeek        0.000000
NativeCountry    1.823128
category         0.000000
dtype: float64

In [19]:
from collections import Counter

Counter(data['workclass'])

Counter({'State-gov': 1298,
         'Self-emp-not-inc': 2541,
         'Private': 22696,
         'Federal-gov': 960,
         'Local-gov': 2093,
         nan: 1836,
         'Self-emp-inc': 1116,
         'Without-pay': 14,
         'Never-worked': 7})

In [97]:
work_class_df_one = data[['workclass', 'category']].groupby(['workclass']).sum()

#work_class_df_zero = data[data['category']==0]['workclass', 'category'].groupby(['workclass']).sum()

work_class_df_zero = data[data['category']==0][['workclass', 'category']]

work_class_df_zero['category_zero'] = work_class_df_zero['category'] + 1
# df_1 = pd.DataFrame({'b': ['x', 'y', 'z'], 'a': [1, 2, 3]})
# df_1['c'] = df_1['a'] + 1
# df_1
work_class_df_zero

work_class_df_zero = work_class_df_zero[['workclass', 'category_zero']].groupby('workclass').sum()

work_class_df_one


Unnamed: 0_level_0,category
workclass,Unnamed: 1_level_1
Federal-gov,365
Local-gov,609
Never-worked,0
Private,5063
Self-emp-inc,600
Self-emp-not-inc,714
State-gov,344
Without-pay,0


In [65]:
# work_class_df[work_class_df['category']==1]
work_class_df

Unnamed: 0_level_0,category
workclass,Unnamed: 1_level_1
Federal-gov,365
Local-gov,609
Never-worked,0
Private,5063
Self-emp-inc,600
Self-emp-not-inc,714
State-gov,344
Without-pay,0


In [100]:
work_class_df_zero

Unnamed: 0_level_0,category_zero
workclass,Unnamed: 1_level_1
Federal-gov,578
Local-gov,1458
Never-worked,7
Private,19032
Self-emp-inc,474
Self-emp-not-inc,1785
State-gov,935
Without-pay,14


In [108]:
work_class_df_zero.shape

(8, 1)

In [109]:
work_class_df_one.shape

(8, 1)

In [51]:
def plot_by_category(data, columns):
    
    labels = columns
    
    zero_count = []
    one_count = []
    for i in columns:
        column_df = data[[i, 'category']].groupby(i).count()
        
    labels = ['G1', 'G2', 'G3', 'G4', 'G5']
    zero_count = [20, 34, 30, 35, 27]
    one_count = [25, 32, 34, 20, 25]

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(x - width/2, zero_count, width, label='Men')
    rects2 = ax.bar(x + width/2, one_count, width, label='Women')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Scores')
    ax.set_title('Scores by group and gender')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()


    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')


    autolabel(rects1)
    autolabel(rects2)

    fig.tight_layout()

    plt.show()

Unnamed: 0_level_0,category
workclass,Unnamed: 1_level_1
Federal-gov,943
Local-gov,2067
Never-worked,7
Private,24095
Self-emp-inc,1074
Self-emp-not-inc,2499
State-gov,1279
Without-pay,14


In [21]:
# replace the workclass with maximum occuring value

data['workclass'].replace(np.nan, 'Private', inplace=True)

In [22]:
Counter(data['occupation']).most_common()

[('Prof-specialty', 4140),
 ('Craft-repair', 4099),
 ('Exec-managerial', 4066),
 ('Adm-clerical', 3770),
 ('Sales', 3650),
 ('Other-service', 3295),
 ('Machine-op-inspct', 2002),
 (nan, 1843),
 ('Transport-moving', 1597),
 ('Handlers-cleaners', 1370),
 ('Farming-fishing', 994),
 ('Tech-support', 928),
 ('Protective-serv', 649),
 ('Priv-house-serv', 149),
 ('Armed-Forces', 9)]

In [23]:
data['occupation'].replace(np.nan, 'Prof-speciality', inplace=True)

In [24]:
data.dropna(inplace=True)

In [25]:
get_null_percentage(data)

age              0.0
workclass        0.0
fnlwgt           0.0
education        0.0
educationNum     0.0
marital          0.0
occupation       0.0
relationship     0.0
race             0.0
sex              0.0
capGain          0.0
capLoss          0.0
HoursWeek        0.0
NativeCountry    0.0
category         0.0
dtype: float64

In [26]:
object_types.index

Index(['workclass', 'education', 'marital', 'occupation', 'relationship',
       'race', 'sex', 'NativeCountry'],
      dtype='object')

In [27]:
data1 = pd.get_dummies(data, columns=object_types.index)
data1.head()

Unnamed: 0,age,fnlwgt,educationNum,capGain,capLoss,HoursWeek,category,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,NativeCountry_Portugal,NativeCountry_Puerto-Rico,NativeCountry_Scotland,NativeCountry_South,NativeCountry_Taiwan,NativeCountry_Thailand,NativeCountry_Trinadad&Tobago,NativeCountry_United-States,NativeCountry_Vietnam,NativeCountry_Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
X = data1.iloc[:, :-1]
y = data1.category

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [30]:
X_train.shape

(22384, 106)

In [31]:
X_test.shape

(9594, 106)

In [32]:
y_train.shape

(22384,)

In [33]:
y_test.shape

(9594,)

In [34]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)



In [35]:
accuracy_score(y_test, y_predict)

0.795705649364186

In [36]:
y_test[-10:].values

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [37]:
y_predict[-10:]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

In [38]:
pd.crosstab(y_test, y_predict)

col_0,0,1
category,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7043,240
1,1720,591


In [39]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88      7283
           1       0.71      0.26      0.38      2311

   micro avg       0.80      0.80      0.80      9594
   macro avg       0.76      0.61      0.63      9594
weighted avg       0.78      0.80      0.76      9594

