In [26]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report



In [15]:
columns = ['age', 'workclass', 'fnlwgt', 'education', 
          'education-num', 'marital-status', 'occupation',
          'relationship', 'race', 'sex', 'capital-gain', 
          'capital-loss', 'hours-per-week', 'native-country', 'salary']

df = pd.read_csv('adult.data', header=None, names=columns, na_values='?')


df = df.drop('education', axis = 1)
df['salary'] = df['salary'].map({' <=50K':0, ' >50K':1})
df = df.dropna()

test = pd.read_csv('adult.data', header=None, names=columns, na_values='?', skiprows=True)
test = test.drop('education', axis = 1)
test['salary'] = test['salary'].map({' <=50K':0, ' >50K':1})
test = test.dropna()


In [16]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [17]:
df['salary'].value_counts(normalize=True)

0    0.75919
1    0.24081
Name: salary, dtype: float64

In [18]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,37,Private,284582,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [21]:
# разбили выборку на обучающую и тестовую

X_train = pd.get_dummies(df).drop('salary', axis = 1)
y_train = df['salary']

X_test = pd.get_dummies(test).drop('salary', axis = 1)
y_test = test['salary']

In [23]:
print(len (X_train.columns))
print (len (X_test.columns))

# приводим множество названий колонок к типу set, находим разность двух множеств

print(set (X_train.columns) - set (X_test.columns))
print(set (X_test.columns) - set (X_train.columns))




92
92
set()
set()


In [24]:
columns = set(X_train.columns) | set(X_test.columns)
X_train = X_train.reindex(columns=columns).fillna(0)
X_test = X_test.reindex(columns = columns).fillna(0)




In [25]:
# проверим корректность совпадения колонок и тд

all(X_train.columns == X_test.columns)

True

In [27]:
# перейдем к обучению модели

model = GradientBoostingClassifier(random_state=42,
                                  # число деревьев, участвующих в приближении регрессии
                                  n_estimators=100,
                                  # макс глубина каждого дерева
                                  max_depth=3,
                                  # параметр уменьшающий переобучение
                                  # лучше ставить небольшое число
                                  learning_rate=0.1)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.89      0.95      0.92     24719
           1       0.80      0.61      0.69      7841

    accuracy                           0.87     32560
   macro avg       0.84      0.78      0.80     32560
weighted avg       0.86      0.87      0.86     32560

