## Лабораторная работа №2. Обработка признаков
**Задание:**

Выбрать набор данных (датасет), содержащий категориальные и числовые признаки и пропуски в данных. Для выполнения следующих пунктов можно использовать несколько различных наборов данных (один для обработки пропусков, другой для категориальных признаков и т.д.).
Для выбранного датасета (датасетов) на основе материалов лекций решить следующие задачи:
1. устранение пропусков в данных;
1. кодирование категориальных признаков;
1. нормализацию числовых признаков.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from category_encoders.count import CountEncoder as ce_CountEncoder

In [2]:
data_loaded = pd.read_csv('../dataset.csv', sep=',', decimal=',')
data = data_loaded.rename(columns={
    "Area (sq. mi.)": "Area",
    "Pop. Density (per sq. mi.)" : "Density",
    "Coastline (coast/area ratio)": "Coastline",
    "Infant mortality (per 1000 births)": "Infant mortality",
    "GDP ($ per capita)": "GDP",
    "Literacy (%)": "Literacy",
    "Phones (per 1000)": "Phones"
})
data.describe()

Unnamed: 0,Population,Area,Density,Coastline,Net migration,Infant mortality,GDP,Literacy,Phones,Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
count,227.0,227.0,227.0,227.0,224.0,224.0,226.0,209.0,223.0,225.0,225.0,225.0,205.0,224.0,223.0,212.0,211.0,212.0
mean,28740280.0,598227.0,379.047137,21.16533,0.038125,35.506964,9689.823009,82.838278,236.061435,13.797111,4.564222,81.638311,2.139024,22.114732,9.241345,0.150844,0.282711,0.565283
std,117891300.0,1790282.0,1660.185825,72.286863,4.889269,35.389899,10049.138513,19.722173,227.991829,13.040402,8.36147,16.140835,0.699397,11.176716,4.990026,0.146798,0.138272,0.165841
min,7026.0,2.0,0.0,0.0,-20.99,2.29,500.0,17.6,0.2,0.0,0.0,33.33,1.0,7.29,2.29,0.0,0.02,0.062
25%,437624.0,4647.5,29.15,0.1,-0.9275,8.15,1900.0,70.6,37.8,3.22,0.19,71.65,2.0,12.6725,5.91,0.03775,0.193,0.42925
50%,4786994.0,86600.0,78.8,0.73,0.0,21.0,5550.0,92.5,176.2,10.42,1.03,85.7,2.0,18.79,7.84,0.099,0.272,0.571
75%,17497770.0,441811.0,190.15,10.345,0.9975,55.705,15700.0,98.0,389.65,20.0,4.44,95.44,3.0,29.82,10.605,0.221,0.341,0.6785
max,1313974000.0,17075200.0,16271.5,870.66,23.06,191.19,55100.0,100.0,1035.6,62.11,50.68,100.0,4.0,50.73,29.74,0.769,0.906,0.954


#### Устранение пропусков данных

In [3]:
cols_with_na = [c for c in data.columns if data[c].isnull().sum() > 0]
[(c, data[c].isnull().sum(), "%.3f" % data[c].isnull().mean()) for c in cols_with_na]

[('Net migration', 3, '0.013'),
 ('Infant mortality', 3, '0.013'),
 ('GDP', 1, '0.004'),
 ('Literacy', 18, '0.079'),
 ('Phones', 4, '0.018'),
 ('Arable (%)', 2, '0.009'),
 ('Crops (%)', 2, '0.009'),
 ('Other (%)', 2, '0.009'),
 ('Climate', 22, '0.097'),
 ('Birthrate', 3, '0.013'),
 ('Deathrate', 4, '0.018'),
 ('Agriculture', 15, '0.066'),
 ('Industry', 16, '0.070'),
 ('Service', 15, '0.066')]

In [4]:
cols_to_delete = ['Climate', 'Agriculture', 'Industry', 'Service']
data_droped = data.drop(cols_to_delete, axis=1)
data_droped

Unnamed: 0,Country,Region,Population,Area,Density,Coastline,Net migration,Infant mortality,GDP,Literacy,Phones,Arable (%),Crops (%),Other (%),Birthrate,Deathrate
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.00,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,46.60,20.34
1,Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,15.11,5.22
2,Algeria,NORTHERN AFRICA,32930091,2381740,13.8,0.04,-0.39,31.00,6000.0,70.0,78.1,3.22,0.25,96.53,17.14,4.61
3,American Samoa,OCEANIA,57794,199,290.4,58.29,-20.71,9.27,8000.0,97.0,259.5,10.00,15.00,75.00,22.46,3.27
4,Andorra,WESTERN EUROPE,71201,468,152.1,0.00,6.60,4.05,19000.0,100.0,497.2,2.22,0.00,97.78,8.71,6.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,West Bank,NEAR EAST,2460492,5860,419.9,0.00,2.98,19.62,800.0,,145.2,16.90,18.97,64.13,31.67,3.92
223,Western Sahara,NORTHERN AFRICA,273008,266000,1.0,0.42,,,,,,0.02,0.00,99.98,,
224,Yemen,NEAR EAST,21456188,527970,40.6,0.36,0.00,61.50,800.0,50.2,37.2,2.78,0.24,96.98,42.89,8.30
225,Zambia,SUB-SAHARAN AFRICA,11502010,752614,15.3,0.00,0.00,88.29,800.0,80.6,8.2,7.08,0.03,92.90,41.00,19.93


In [5]:
data_to_impute = data_droped[[c for c in data_droped.columns if data_droped[c].isnull().sum() > 0]]
data_to_impute.isnull().sum()

Net migration        3
Infant mortality     3
GDP                  1
Literacy            18
Phones               4
Arable (%)           2
Crops (%)            2
Other (%)            2
Birthrate            3
Deathrate            4
dtype: int64

In [6]:
knnimputer = KNNImputer(
    n_neighbors=5, 
    weights='distance', 
    metric='nan_euclidean', 
    add_indicator=False, 
)
array_imputed = knnimputer.fit_transform(data_to_impute)
data_imputed = data_droped.merge(pd.DataFrame(array_imputed, columns=data_to_impute.columns))
data_imputed.isnull().sum()

Country             0
Region              0
Population          0
Area                0
Density             0
Coastline           0
Net migration       0
Infant mortality    0
GDP                 0
Literacy            0
Phones              0
Arable (%)          0
Crops (%)           0
Other (%)           0
Birthrate           0
Deathrate           0
dtype: int64

#### Кодирование категориальных признаков

In [7]:
print(data_imputed["Region"].unique(), '\n')

encoder_test = ce_CountEncoder()
col_encoded_test = encoder_test.fit_transform(data_imputed[['Region']])
print(col_encoded_test["Region"].unique(), '\n')

encoder = ce_CountEncoder(normalize=True)
col_encoded = encoder.fit_transform(data_imputed[['Region']])
print(col_encoded["Region"].unique())

['ASIA (EX. NEAR EAST)' 'EASTERN EUROPE' 'NORTHERN AFRICA' 'OCEANIA'
 'WESTERN EUROPE' 'SUB-SAHARAN AFRICA' 'LATIN AMER. & CARIB'
 'C.W. OF IND. STATES' 'NEAR EAST' 'NORTHERN AMERICA' 'BALTICS'] 

[27  8  5 15 23 50 43 12 13  4  3] 

[0.13300493 0.03940887 0.02463054 0.07389163 0.11330049 0.24630542
 0.21182266 0.0591133  0.06403941 0.01970443 0.01477833]


In [8]:
data_encoded = data_imputed.copy()
data_encoded['Region'] = col_encoded['Region']
data_encoded.head()

Unnamed: 0,Country,Region,Population,Area,Density,Coastline,Net migration,Infant mortality,GDP,Literacy,Phones,Arable (%),Crops (%),Other (%),Birthrate,Deathrate
0,Afghanistan,0.133005,31056997,647500,48.0,0.0,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,46.6,20.34
1,Albania,0.039409,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,15.11,5.22
2,Algeria,0.024631,32930091,2381740,13.8,0.04,-0.39,31.0,6000.0,70.0,78.1,3.22,0.25,96.53,17.14,4.61
3,American Samoa,0.073892,57794,199,290.4,58.29,-20.71,9.27,8000.0,97.0,259.5,10.0,15.0,75.0,22.46,3.27
4,Andorra,0.1133,71201,468,152.1,0.0,6.6,4.05,19000.0,100.0,497.2,2.22,0.0,97.78,8.71,6.25


#### Нормализация числовых признаков

In [9]:
cols_to_scale = ["Population", "Area", "Density", "Coastline", "Net migration", "Infant mortality", "GDP", "Literacy", "Phones", "Arable (%)", "Crops (%)", "Other (%)", "Birthrate", "Deathrate"]

MMScaler = MinMaxScaler()
data_scaled = data_encoded.copy()
for col in cols_to_scale:
    data_scaled[col] = MMScaler.fit_transform(data_encoded[[col]])
data_scaled.describe()

Unnamed: 0,Region,Population,Area,Density,Coastline,Net migration,Infant mortality,GDP,Literacy,Phones,Arable (%),Crops (%),Other (%),Birthrate,Deathrate
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,0.151884,0.024346,0.038395,0.022071,0.021416,0.476824,0.185661,0.169003,0.792075,0.21743,0.228257,0.088297,0.722507,0.347089,0.259185
std,0.078583,0.094592,0.110072,0.10548,0.085126,0.113691,0.192335,0.187052,0.239344,0.211985,0.2127,0.159548,0.240932,0.259375,0.188183
min,0.014778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.073892,0.000586,0.000761,0.001555,0.000109,0.453462,0.03298,0.025641,0.665049,0.030906,0.060538,0.004391,0.592395,0.127647,0.127687
50%,0.133005,0.004565,0.006495,0.004438,0.000724,0.476504,0.112335,0.093407,0.906553,0.148348,0.171792,0.020629,0.785511,0.267265,0.211658
75%,0.211823,0.01587,0.028214,0.011128,0.007718,0.498297,0.314267,0.260073,0.975728,0.360151,0.328128,0.091708,0.922154,0.519107,0.336066
max,0.246305,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Итоговые данные

In [10]:
data_scaled

Unnamed: 0,Country,Region,Population,Area,Density,Coastline,Net migration,Infant mortality,GDP,Literacy,Phones,Arable (%),Crops (%),Other (%),Birthrate,Deathrate
0,Afghanistan,0.133005,0.023631,0.037920,0.002840,0.000000,1.000000,0.851138,0.003663,0.223301,0.002897,0.195299,0.004493,0.814759,0.904926,0.657559
1,Albania,0.039409,0.002720,0.001683,0.007548,0.001447,0.364586,0.101800,0.073260,0.836165,0.068573,0.339559,0.090278,0.617369,0.180018,0.106740
2,Algeria,0.024631,0.025056,0.139485,0.000738,0.000046,0.467650,0.151985,0.100733,0.635922,0.075237,0.051844,0.005106,0.947953,0.226750,0.084517
3,American Samoa,0.073892,0.000039,0.000012,0.017738,0.066949,0.006356,0.036951,0.137363,0.963592,0.250435,0.161005,0.306373,0.625019,0.349217,0.035701
4,Andorra,0.113300,0.000049,0.000027,0.009238,0.000000,0.626334,0.009317,0.338828,1.000000,0.480008,0.035743,0.000000,0.966702,0.032689,0.144262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,Venezuela,0.211823,0.019577,0.053414,0.001623,0.000356,0.475596,0.105400,0.078755,0.919903,0.135117,0.047496,0.018791,0.941953,0.262891,0.095811
199,Vietnam,0.133005,0.064230,0.019300,0.015630,0.001206,0.466288,0.125251,0.036630,0.882282,0.181089,0.321526,0.121528,0.611219,0.220304,0.143169
200,Yemen,0.064039,0.016324,0.030920,0.002385,0.000413,0.476504,0.313446,0.005495,0.395631,0.035735,0.044759,0.004902,0.954702,0.819521,0.218944
201,Zambia,0.246305,0.008748,0.044076,0.000830,0.000000,0.476504,0.455267,0.005495,0.764563,0.007726,0.113991,0.000613,0.893505,0.776013,0.642623
