# Задание
Имеются данные adult.csv (см. в материалах к занятию).
Целевой переменной является уровень дохода income (крайний правый столбец).
Описание признаков можно найти по ссылке http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html
Вам необходимо построить модели логистической регрессии и SVM, которые предсказывают уровень дохода человека.
Вывести качество полученных моделей на тестовой выборке, используя функцию score у модели.
Готовый ноутбук выложить на гитхаб и прислать ссылку.

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

## 1. Чтение и преобразование данных

In [5]:
# Load adult.csv into a DataFrame; the relative path means the file is
# expected in the current working directory.
adults = pd.read_csv('adult.csv')

In [6]:
# Print column names, non-null counts and dtypes to check for missing values
# and to see which columns are non-numeric (object).
adults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [7]:
# Preview the first five rows of the raw data.
adults.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [8]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance. It is re-fitted on every use further down
# (each categorical feature column, then the target), so only the classes of
# the most recent fit stay available in le.classes_.
le = LabelEncoder()

In [9]:
# Categorical (object-dtype) feature columns that must be converted to
# numbers before modelling. The target column 'income' is handled separately.
columns_to_encode = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'native-country',
    'gender',
    'education',
]

In [10]:
# Replace each categorical column with integer codes, re-fitting the shared
# encoder on one column at a time.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# and the linear/SVM models below treat them as numeric magnitudes; one-hot
# encoding is the usual alternative for such features.
for column_name in columns_to_encode:
    adults[column_name] = le.fit_transform(adults[column_name])

In [11]:
# Display the frame after encoding: the columns listed in columns_to_encode
# now hold integer codes, while 'income' is still a string label.
adults

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,<=50K
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,<=50K
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,>50K
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,>50K
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,<=50K
48838,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,>50K
48839,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,<=50K
48840,22,4,201490,11,9,4,1,3,4,1,0,0,20,39,<=50K


In [12]:
# Feature matrix: every column except the target 'income'. drop() returns a
# new frame, so `adults` itself keeps the target column.
adults_changed = adults.drop(columns='income')

In [13]:
# Display the feature matrix (all columns except the target 'income').
adults_changed

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,4,257302,7,12,2,13,5,4,0,0,0,38,39
48838,40,4,154374,11,9,2,7,0,4,1,0,0,40,39
48839,58,4,151910,11,9,6,1,4,4,0,0,0,40,39
48840,22,4,201490,11,9,4,1,3,4,1,0,0,20,39


adults_changed

In [14]:
# Encode the target labels as integers. LabelEncoder assigns codes in sorted
# label order, so with the two labels visible in the preview '<=50K' becomes 0
# and '>50K' becomes 1. The result is wrapped in a Series with a default
# integer index.
income_codes = le.fit_transform(adults['income'])
Y = pd.Series(income_codes)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 30% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the scores reported below, reproducible.
# NOTE(review): the split is not stratified on Y, so class proportions in the
# two parts may differ slightly.
X_train, X_test, Y_train, Y_test = train_test_split(
    adults_changed,
    Y,
    test_size=0.3,
    random_state=42,
)

## 2. Логистическая регрессия с масштабированием данных

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [18]:
# Pipeline: standardize every feature, then fit logistic regression.
# Because the scaler sits inside the pipeline, it is fitted only on the data
# passed to fit() and merely applied when predicting or scoring.
# max_iter=1000 raises the solver's iteration limit above the library default.
model_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)

In [19]:
# Train on the training split and predict class labels for the test split.
# fit() returns the fitted pipeline itself, so the calls can be chained.
predictions = model_lr.fit(X_train, Y_train).predict(X_test)

In [20]:
# Class-membership probabilities for the test split: one row per sample, one
# column per class, with columns ordered as in the classifier's classes_.
model_lr.predict_proba(X_test)

array([[0.88287499, 0.11712501],
       [0.92349402, 0.07650598],
       [0.01970196, 0.98029804],
       ...,
       [0.90430866, 0.09569134],
       [0.64371126, 0.35628874],
       [0.9432854 , 0.0567146 ]])

In [21]:
# Mean accuracy on the training split (score() of a classifier is accuracy);
# compared with the test score to check for overfitting.
model_lr.score(X_train, Y_train)

0.8230424990494019

In [22]:
# Mean accuracy on the held-out test split, the quality metric the assignment
# asks for.
model_lr.score(X_test, Y_test)

0.8295229645806319

## 3. Логистическая регрессия без масштабирования

In [23]:
# Baseline for comparison: the same logistic regression, but fitted on the raw
# (unscaled) features, without a StandardScaler step.
model_lr_1 = LogisticRegression(max_iter = 1000)

In [24]:
# Train the unscaled model and predict class labels for the test split.
# fit() returns the estimator itself, so the calls can be chained.
predictions_lr_1 = model_lr_1.fit(X_train, Y_train).predict(X_test)

In [25]:
# Mean accuracy of the unscaled logistic regression on the training split.
model_lr_1.score(X_train, Y_train)

0.785662055046945

In [26]:
# Mean accuracy of the unscaled logistic regression on the test split.
model_lr_1.score(X_test, Y_test)

0.7938306148911486

## 4. SVM с масштабированием

In [27]:
from sklearn.svm import SVC

In [28]:
# Pipeline: standardize every feature, then fit a support vector classifier.
# gamma='auto' sets the kernel coefficient to 1 / n_features.
model_svc = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
)
# Kept as a separate last expression so the notebook displays the fitted
# pipeline.
model_svc.fit(X_train, Y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [29]:
# Predicted class labels of the scaled SVM for the test split.
predictions_svc = model_svc.predict(X_test)

In [30]:
# Mean accuracy of the scaled SVM on the held-out test split.
model_svc.score(X_test, Y_test)

0.8583225278100047

In [31]:
# Mean accuracy of the scaled SVM on the training split, for comparison with
# the test score.
model_svc.score(X_train, Y_train)

0.8535493872298108

## 5. SVM без масштабирования

In [32]:
# Baseline for comparison: the same SVC configuration, but without a
# StandardScaler step, so it is fitted on the raw feature values.
model_svm_1 = SVC(gamma = 'auto')

In [None]:
# Fit the unscaled SVM on the training split.
# NOTE(review): this cell has no execution count or output in the notebook, so
# it was presumably never run to completion; an SVC on unscaled features of
# very different magnitudes can take a very long time to fit. Confirm.
model_svm_1.fit(X_train, Y_train)

In [None]:
# Predicted class labels of the unscaled SVM for the test split.
# NOTE(review): no execution count or output is recorded for this cell.
predictions_svm_1 = model_svm_1.predict(X_test)

In [None]:
# Mean accuracy of the unscaled SVM on the training split.
# NOTE(review): no result is recorded for this cell in the notebook.
model_svm_1.score(X_train, Y_train)

In [None]:
# Mean accuracy of the unscaled SVM on the held-out test split.
# NOTE(review): no result is recorded for this cell in the notebook.
model_svm_1.score(X_test, Y_test)