In [1]:
import pandas as pd
import sklearn

In [2]:
df = pd.read_csv(r'data/adult.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
df.groupby('income').size()

income
<=50K    37155
>50K     11687
dtype: int64

In [7]:
pivot = pd.pivot_table(df, 
                      index = 'income', columns = 'gender', values = 'age', aggfunc = 'count', margins = True)
pivot

gender,Female,Male,All
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<=50K,14423,22732,37155
>50K,1769,9918,11687
All,16192,32650,48842


In [8]:
df_dummies = pd.get_dummies(df)

In [27]:
features = df_dummies.loc[:,'age':'native-country_Yugoslavia']

In [28]:
X = features.values

In [29]:
y = df_dummies['income_>50K'].values

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [32]:
model = LogisticRegression()

In [33]:
model.fit(X_train, y_train)

In [64]:
coeff = pd.DataFrame(model.coef_, columns = features.columns)
coeff = coeff.T
coeff.columns =['coefficient']
coeff.sort_values(by = 'coefficient')

Unnamed: 0,coefficient
marital-status_Never-married,-0.666808
gender_Female,-0.485806
relationship_Not-in-family,-0.391733
relationship_Own-child,-0.362643
workclass_Private,-0.327301
...,...
education_Bachelors,0.181551
gender_Male,0.187419
occupation_Exec-managerial,0.212946
relationship_Husband,0.628560


In [35]:
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [36]:
y_pred = model.predict(X)
probs = model.predict_proba(X)
probs 

array([[0.93408577, 0.06591423],
       [0.62980115, 0.37019885],
       [0.59468682, 0.40531318],
       ...,
       [0.95688592, 0.04311408],
       [0.96206855, 0.03793145],
       [0.03306175, 0.96693825]])

In [37]:
sklearn.metrics.accuracy_score(y, y_pred)

0.7959338274435936

In [65]:
conf_matrix = pd.DataFrame(
    sklearn.metrics.confusion_matrix(y, y_pred), 
    columns=['Forecast <=50K', 'Forecast >=50K'],
    index=['Actual <=50K', 'Actual >=50K'])
conf_matrix

Unnamed: 0,Forecast <=50K,Forecast >=50K
Actual <=50K,34928,2227
Actual >=50K,7740,3947


In [58]:
# correct % of all = accuracy_score
(conf_matrix.iloc[0, 0] + conf_matrix.iloc[1, 1] ) / conf_matrix.values.sum()

0.7959338274435936

In [61]:
# % of all
conf_matrix.values / conf_matrix.values.sum()

array([[0.71512223, 0.045596  ],
       [0.15847017, 0.0808116 ]])

In [62]:
# actual income
conf_matrix.sum(axis=1)

Actual <=50K    37155
Actual >=50K    11687
dtype: int64

In [63]:
# predicted income
conf_matrix.sum(axis=0)

Forecast <=50K    42668
Forecast >=50K     6174
dtype: int64

In [42]:
model.score(X_test, y_test)

0.7949389894357547

In [43]:
model.score(X_train, y_train)

0.7962654582184489