In [22]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


In [23]:
data = pd.read_csv('data/adult.csv', header=None, skipinitialspace=True)

In [24]:
data.columns = ['age','workclass','fnlwgt','education','educationNum','marital',
           'occupation','relationship','race','sex','capGain','capLoss',
          'HoursWeek','NativeCountry','category']


In [25]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,marital,occupation,relationship,race,sex,capGain,capLoss,HoursWeek,NativeCountry,category
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [26]:
data.loc[data.category == '<=50K', 'category'] = 0
data.loc[data.category == '>50K', 'category'] = 1

In [27]:
data.category.head()

0    0
1    0
2    0
3    0
4    0
Name: category, dtype: int64

In [28]:
data.category.tail()

32556    0
32557    1
32558    0
32559    0
32560    1
Name: category, dtype: int64

In [29]:
# 7406609000 whatsapp of Ashok

In [30]:
data.category

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        1
8        1
9        1
10       1
11       1
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       1
20       1
21       0
22       0
23       0
24       0
25       1
26       0
27       1
28       0
29       0
        ..
32531    0
32532    1
32533    1
32534    0
32535    0
32536    1
32537    0
32538    1
32539    1
32540    0
32541    0
32542    0
32543    0
32544    0
32545    1
32546    0
32547    0
32548    0
32549    0
32550    0
32551    0
32552    0
32553    0
32554    1
32555    0
32556    0
32557    1
32558    0
32559    0
32560    1
Name: category, Length: 32561, dtype: int64

In [31]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'marital',
       'occupation', 'relationship', 'race', 'sex', 'capGain', 'capLoss',
       'HoursWeek', 'NativeCountry', 'category'],
      dtype='object')

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age              32561 non-null int64
workclass        32561 non-null object
fnlwgt           32561 non-null int64
education        32561 non-null object
educationNum     32561 non-null int64
marital          32561 non-null object
occupation       32561 non-null object
relationship     32561 non-null object
race             32561 non-null object
sex              32561 non-null object
capGain          32561 non-null int64
capLoss          32561 non-null int64
HoursWeek        32561 non-null int64
NativeCountry    32561 non-null object
category         32561 non-null int64
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [33]:
data.replace(['$', '?', '*', '-'], np.nan, inplace=True)

In [34]:
object_types = data.dtypes[data.dtypes == 'object']

In [35]:

for i in object_types.index:
    print(i)
    print(set(data[i]))
    # data[i] = labelencoder.fit_transform(data[i])

workclass
{nan, 'Never-worked', 'Federal-gov', 'State-gov', 'Private', 'Local-gov', 'Self-emp-inc', 'Without-pay', 'Self-emp-not-inc'}
education
{'Assoc-voc', '5th-6th', '1st-4th', 'HS-grad', 'Doctorate', 'Assoc-acdm', 'Masters', 'Preschool', '12th', 'Bachelors', 'Prof-school', '11th', 'Some-college', '10th', '7th-8th', '9th'}
marital
{'Married-civ-spouse', 'Divorced', 'Separated', 'Married-AF-spouse', 'Married-spouse-absent', 'Widowed', 'Never-married'}
occupation
{'Craft-repair', nan, 'Adm-clerical', 'Farming-fishing', 'Other-service', 'Armed-Forces', 'Handlers-cleaners', 'Exec-managerial', 'Priv-house-serv', 'Sales', 'Machine-op-inspct', 'Protective-serv', 'Tech-support', 'Prof-specialty', 'Transport-moving'}
relationship
{'Not-in-family', 'Own-child', 'Unmarried', 'Wife', 'Husband', 'Other-relative'}
race
{'Black', 'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'White', 'Other'}
sex
{'Female', 'Male'}
NativeCountry
{nan, 'Laos', 'Ireland', 'Scotland', 'Poland', 'Columbia', 'Honduras',

In [36]:
all_nulls = data.isnull().sum()
all_nulls

age                 0
workclass        1836
fnlwgt              0
education           0
educationNum        0
marital             0
occupation       1843
relationship        0
race                0
sex                 0
capGain             0
capLoss             0
HoursWeek           0
NativeCountry     583
category            0
dtype: int64

In [37]:
all_not_nulls = data.notnull().sum()
all_not_nulls

age              32561
workclass        30725
fnlwgt           32561
education        32561
educationNum     32561
marital          32561
occupation       30718
relationship     32561
race             32561
sex              32561
capGain          32561
capLoss          32561
HoursWeek        32561
NativeCountry    31978
category         32561
dtype: int64

In [38]:
def get_null_percentage(data):
    all_nulls = data.isnull().sum()
    all_not_nulls = data.notnull().sum()
    return all_nulls / all_not_nulls * 100

all_nulls / all_not_nulls * 100

age              0.000000
workclass        5.975590
fnlwgt           0.000000
education        0.000000
educationNum     0.000000
marital          0.000000
occupation       5.999740
relationship     0.000000
race             0.000000
sex              0.000000
capGain          0.000000
capLoss          0.000000
HoursWeek        0.000000
NativeCountry    1.823128
category         0.000000
dtype: float64

In [39]:
from collections import Counter

Counter(data['workclass'])

Counter({'State-gov': 1298,
         'Self-emp-not-inc': 2541,
         'Private': 22696,
         'Federal-gov': 960,
         'Local-gov': 2093,
         nan: 1836,
         'Self-emp-inc': 1116,
         'Without-pay': 14,
         'Never-worked': 7})

In [40]:
work_class_df_one = data[['workclass', 'category']].groupby(['workclass']).sum()

#work_class_df_zero = data[data['category']==0]['workclass', 'category'].groupby(['workclass']).sum()

work_class_df_zero = data[data['category']==0][['workclass', 'category']]

work_class_df_zero['category_zero'] = work_class_df_zero['category'] + 1
# df_1 = pd.DataFrame({'b': ['x', 'y', 'z'], 'a': [1, 2, 3]})
# df_1['c'] = df_1['a'] + 1
# df_1
work_class_df_zero

work_class_df_zero = work_class_df_zero[['workclass', 'category_zero']].groupby('workclass').sum()

work_class_df_one


Unnamed: 0_level_0,category
workclass,Unnamed: 1_level_1
Federal-gov,371
Local-gov,617
Never-worked,0
Private,4963
Self-emp-inc,622
Self-emp-not-inc,724
State-gov,353
Without-pay,0


In [41]:
work_class_df_zero

Unnamed: 0_level_0,category_zero
workclass,Unnamed: 1_level_1
Federal-gov,589
Local-gov,1476
Never-worked,7
Private,17733
Self-emp-inc,494
Self-emp-not-inc,1817
State-gov,945
Without-pay,14


In [42]:
work_class_df_zero.shape

(8, 1)

In [48]:
work_class_df_one

Unnamed: 0_level_0,category
workclass,Unnamed: 1_level_1
Federal-gov,371
Local-gov,617
Never-worked,0
Private,4963
Self-emp-inc,622
Self-emp-not-inc,724
State-gov,353
Without-pay,0


In [49]:
work_class_df = pd.DataFrame({
    'category_zero': work_class_df_zero['category_zero'], 
    'category_one': work_class_df_one['category']
}
)

work_class_df

Unnamed: 0_level_0,category_zero,category_one
workclass,Unnamed: 1_level_1,Unnamed: 2_level_1
Federal-gov,589,371
Local-gov,1476,617
Never-worked,7,0
Private,17733,4963
Self-emp-inc,494,622
Self-emp-not-inc,1817,724
State-gov,945,353
Without-pay,14,0


In [51]:
object_types

workclass        object
education        object
marital          object
occupation       object
relationship     object
race             object
sex              object
NativeCountry    object
dtype: object

In [57]:
def plot_by_category(labels, zero_count, one_count):
    
    labels = columns

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(x - width/2, zero_count, width, label='Men')
    rects2 = ax.bar(x + width/2, one_count, width, label='Women')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Scores')
    ax.set_title('Scores by group and gender')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()


    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')


    autolabel(rects1)
    autolabel(rects2)

    fig.tight_layout()

    plt.show()

In [60]:
def create_grouped_count(data, columns, category):
    
    for column in columns:
        column_df_one = data[[column, category]].groupby([column]).sum()

        column_df_zero = data[data[category]==0][[column, category]]

        column_df_zero['category_zero'] = column_df_zero[category] + 1
        column_df_zero = column_df_zero[[column, 'category_zero']].groupby(column).sum()

        column_df = pd.DataFrame(
            {
            'category_zero': column_df_zero['category_zero'], 
            'category_one': column_df_one[category]
            })
        print(column_df)
        labels = column_df.index
        zero_count = column_df.category_zero
        one_count = column_df.category_one
        plot_by_category(labels, zero_count, one_count)
        
create_grouped_count(data, object_types.index, 'category')



                  category_zero  category_one
workclass                                    
Federal-gov                 589           371
Local-gov                  1476           617
Never-worked                  7             0
Private                   17733          4963
Self-emp-inc                494           622
Self-emp-not-inc           1817           724
State-gov                   945           353
Without-pay                  14             0


NameError: name 'columns' is not defined

In [None]:
# replace the workclass with maximum occuring value

data['workclass'].replace(np.nan, 'Private', inplace=True)

In [None]:
Counter(data['occupation']).most_common()

In [None]:
data['occupation'].replace(np.nan, 'Prof-speciality', inplace=True)

In [None]:
data.dropna(inplace=True)

In [None]:
get_null_percentage(data)

In [None]:
object_types.index

In [None]:
data1 = pd.get_dummies(data, columns=object_types.index)
data1.head()

In [None]:
X = data1.iloc[:, :-1]
y = data1.category

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [None]:
accuracy_score(y_test, y_predict)

In [None]:
y_test[-10:].values

In [None]:
y_predict[-10:]

In [None]:
pd.crosstab(y_test, y_predict)

In [None]:
print(classification_report(y_test, y_predict))