# Импорт

In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from modules import mlpclsf
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Датасет Титаник

In [2]:
titanic = pd.read_csv('../data/titanic.csv', sep = ',')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
titanic.fillna(titanic.mean(), inplace=True)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [4]:
titanic = pd.get_dummies(data=titanic, prefix='c', columns=['Sex', 'Embarked'])
titanic.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,c_female,c_male,c_C,c_Q,c_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


Разделим на обучающую и тестовую выборки:

In [5]:
titanic_X = titanic.drop('Survived', 1).values
titanic_y = titanic['Survived'].values
titanic_X_train, titanic_X_test, titanic_y_train, titanic_y_test = train_test_split(titanic_X, titanic_y, test_size=0.2, stratify=titanic_y)

Инициализируем MLP:

In [6]:
mlp = mlpclsf.MLP('Sigmoid', 'SGD', layers=[titanic_X.shape[1], 100, 100, len(np.unique(titanic_y))])

Обучение модели:

In [7]:
mlp.fit(titanic_X_train, titanic_y_train)

epoch 0, loss: 0.36241620310250006
epoch 1, loss: 0.3152495614014481
epoch 2, loss: 0.29622521384986283
epoch 3, loss: 0.28196502540650975
epoch 4, loss: 0.27171785747619615
epoch 5, loss: 0.2648077451811009
epoch 6, loss: 0.2614326213110677
epoch 7, loss: 0.2576753804801728
epoch 8, loss: 0.25378871613382276
epoch 9, loss: 0.2512724917884267
epoch 10, loss: 0.24953635509957148
epoch 11, loss: 0.2484426754807534
epoch 12, loss: 0.2553288829557737
epoch 13, loss: 0.25047971478546116
epoch 14, loss: 0.2460288022341226
epoch 15, loss: 0.24503321566146952
epoch 16, loss: 0.24620300793361122
epoch 17, loss: 0.2431337347676787
epoch 18, loss: 0.24276320356658065
epoch 19, loss: 0.2427052426443284


Проверим точность полученных результатов:

In [8]:
print(sum(np.argmax(mlp.predict(titanic_X_test), axis=1) == titanic_y_test) / titanic_y_test.shape[0])

0.7486033519553073


## Датасет Ирис

In [9]:
iris = datasets.load_iris()

Разделим на обучающую и тестовую выборки:

In [10]:
iris_X = iris.data
iris_y = iris.target
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y, test_size=0.2, stratify=iris_y)

In [11]:
iris_X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

Нормализуем данные:

In [12]:
scaler = StandardScaler()
iris_X_train = scaler.fit_transform(iris_X_train)
iris_X_test = scaler.transform(iris_X_test)

Инициализируем MLP:

In [13]:
mlp = mlpclsf.MLP('Sigmoid', 'SGD', layers=[iris_X.shape[1], 100, 100, len(np.unique(iris_y))])

Обучение модели:

In [14]:
mlp.fit(iris_X_train, iris_y_train)

epoch 0, loss: 1.0456318944904823
epoch 1, loss: 0.9745056772179638
epoch 2, loss: 0.8821797715202145
epoch 3, loss: 0.770257304108116
epoch 4, loss: 0.6452788596895024
epoch 5, loss: 0.5187481905288281
epoch 6, loss: 0.402825201829993
epoch 7, loss: 0.3093738727035441
epoch 8, loss: 0.24217128864274767
epoch 9, loss: 0.19507832152956933
epoch 10, loss: 0.16358538129287448
epoch 11, loss: 0.14235244030209226
epoch 12, loss: 0.1279114878670845
epoch 13, loss: 0.11800215537100019
epoch 14, loss: 0.11097149365454982
epoch 15, loss: 0.10598019969560918
epoch 16, loss: 0.10223702483331013
epoch 17, loss: 0.09955648774130577
epoch 18, loss: 0.097416495967266
epoch 19, loss: 0.09557180002724207


Проверим точность полученных результатов:

In [15]:
print(sum(np.argmax(mlp.predict(iris_X_test), axis=1) == iris_y_test) / iris_y_test.shape[0])

0.9666666666666667


## Task 2. Датасет Adult

In [16]:
adult = pd.read_csv('../data/adult_preprocessed.csv')
adult.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary,workclass_Federal-gov,workclass_Local-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,0,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
adult_X = adult.drop('salary', 1).values
adult_y = adult['salary'].values

Разделим на обучающую и тестовую выборки:

In [18]:
adult_X_train, adult_X_test, adult_y_train, adult_y_test = train_test_split(adult_X, adult_y, test_size=0.20, stratify=adult_y)

Нормализуем данные:

In [19]:
scaler = StandardScaler()
scaler.fit(adult_X_train)
X_train = scaler.transform(adult_X_train)
X_test = scaler.transform(adult_X_test)

Инициализируем MLP:

In [20]:
mlp = mlpclsf.MLP('Sigmoid', 'SGD', layers=[adult_X.shape[1], 100, 100, len(np.unique(adult_y))])

Обучение модели:

In [21]:
mlp.fit(adult_X_train, adult_y_train)

epoch 0, loss: 0.2820847194257147
epoch 1, loss: 0.2771988573097236
epoch 2, loss: 0.27509415307170587
epoch 3, loss: 0.2734145888895231
epoch 4, loss: 0.2746434216353496
epoch 5, loss: 0.2744265500294284
epoch 6, loss: 0.27638853974469507
epoch 7, loss: 0.27489973677952567
epoch 8, loss: 0.27519948435646624
epoch 9, loss: 0.27469366465878453
epoch 10, loss: 0.2767099945837172
epoch 11, loss: 0.2771415463067853
epoch 12, loss: 0.2776617822114463
epoch 13, loss: 0.27660850591496217
epoch 14, loss: 0.27674693706179326
epoch 15, loss: 0.27700271008690114
epoch 16, loss: 0.27771104281890296
epoch 17, loss: 0.2768710842739077
epoch 18, loss: 0.27805959174816525
epoch 19, loss: 0.279325054999771


Проверим точность полученных результатов:

In [22]:
print(sum(np.argmax(mlp.predict(adult_X_test), axis=1) == adult_y_test) / adult_y_test.shape[0])

0.7560086192607326
