In [1]:
import pandas as pd

In [2]:
# Load the adult census dataset from its CSV file (path is relative).
adult_census = pd.read_csv("data/adult-census.csv")

In [3]:
# Separate the column to predict from the remaining feature columns.
target_name = "class"
data = adult_census.drop(columns=[target_name])
target = adult_census[target_name]

In [4]:
# Keep only the numerical feature columns listed below.
numerical_columns = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
data_numeric = data[numerical_columns]

In [5]:
# Preview the first five rows of the numerical features.
data_numeric.head(n=5)

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,25,0,0,40
1,38,0,0,50
2,28,0,0,40
3,44,7688,0,40
4,18,0,0,30


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Split features and target into train and test parts; the fixed
# random_state keeps the split reproducible across runs.
data_train, data_test, target_train, target_test = train_test_split(
    data_numeric,
    target,
    random_state=42,
)

In [8]:
from sklearn.dummy import DummyClassifier

In [9]:
# Baseline that always answers with the high-revenue class, whatever the input.
class_to_predict = " >50K"
high_revenue_clf = DummyClassifier(
    strategy="constant", constant=class_to_predict
).fit(data_train, target_train)

# Evaluate the baseline on the held-out test set.
score = high_revenue_clf.score(data_test, target_test)
print(f"Accuracy of a model predicting only high revenue: {score:.3f}")

Accuracy of a model predicting only high revenue: 0.234


In [10]:
# Baseline that always answers with the low-revenue class, whatever the input.
class_to_predict = " <=50K"
low_revenue_clf = DummyClassifier(
    strategy="constant", constant=class_to_predict
).fit(data_train, target_train)

# Evaluate the baseline on the held-out test set.
score = low_revenue_clf.score(data_test, target_test)
print(f"Accuracy of a model predicting only low revenue: {score:.3f}")

Accuracy of a model predicting only low revenue: 0.766


We observe that this model has an accuracy higher than 0.5. This is because about 3/4 of the samples in the target belong to the low-revenue class.

Therefore, any predictive model giving results below this dummy classifier will not be helpful.

In [11]:
# Count how many samples fall in each class of the target column.
target.value_counts()

 <=50K    37155
 >50K     11687
Name: class, dtype: int64

In [12]:
# Fraction of all samples whose target is the low-revenue class.
target.eq(" <=50K").mean()

0.7607182343065395

In practice, we could use the strategy "most_frequent" to predict the class that appears most often in the training target.

In [13]:
# Baseline using the "most_frequent" strategy, fitted on the training target.
most_freq_revenue_clf = DummyClassifier(strategy="most_frequent").fit(
    data_train, target_train
)

# Evaluate the baseline on the held-out test set.
score = most_freq_revenue_clf.score(data_test, target_test)
print(f"Accuracy of a model predicting the most frequent class: {score:.3f}")

Accuracy of a model predicting the most frequent class: 0.766


So the LogisticRegression accuracy (roughly 81%) seems better than the DummyClassifier accuracy (roughly 76%). This is somewhat reassuring: using a machine learning model gives better performance than always predicting the majority class, i.e. the low-income class " <=50K".