# Кучера Жанна 20 МАГ ИАД

In [33]:
import numpy as np              # Массивы (матрицы, векторы, линейная алгебра)
import matplotlib.pyplot as plt # Научная графика
%matplotlib inline 
    # Говорим jupyter'у, чтобы весь графический вывод был в браузере, а не в отдельном окне
import pandas as pd             # Таблицы и временные ряды (dataframe, series)
import seaborn as sns           # Еще больше красивой графики для визуализации данных
import sklearn                  # Алгоритмы машинного обучения
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Чтение данных

Увеличим размерность показываемой таблицы для удобства

In [34]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 30)

Читаем данные

In [35]:
data_raw = pd.read_csv('marketing_campaign.csv')

#### Приведем к соответствию типы признаков

Категориальные признаки:

In [36]:
data_raw['Education'] = data_raw['Education'].astype('category')
data_raw['Marital_Status'] = data_raw['Marital_Status'].astype('category')

Дата:

In [37]:
data_raw['Dt_Customer'] = pd.to_datetime(data_raw['Dt_Customer'])
data_raw['Year_Birth'] = pd.to_datetime(data_raw['Year_Birth'], format="%Y")

In [38]:
data_raw.dtypes

ID                              int64
Year_Birth             datetime64[ns]
Education                    category
Marital_Status               category
Income                        float64
Kidhome                         int64
Teenhome                        int64
Dt_Customer            datetime64[ns]
Recency                         int64
MntWines                        int64
MntFruits                       int64
MntMeatProducts                 int64
MntFishProducts                 int64
MntSweetProducts                int64
MntGoldProds                    int64
NumDealsPurchases               int64
NumWebPurchases                 int64
NumCatalogPurchases             int64
NumStorePurchases               int64
NumWebVisitsMonth               int64
AcceptedCmp3                    int64
AcceptedCmp4                    int64
AcceptedCmp5                    int64
AcceptedCmp1                    int64
AcceptedCmp2                    int64
Complain                        int64
Z_CostContac

Проверим пропущенные значения в датасете.

In [39]:
data_raw.isna().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64

Сколько в процентном соотношении отсутствующих значений

In [40]:
null_value = data_raw.isna().sum()
percent_of_nulls = null_value/data_raw.shape[0]*100

In [41]:
pd.DataFrame({'Missings':null_value, '%': percent_of_nulls})

Unnamed: 0,Missings,%
ID,0,0.0
Year_Birth,0,0.0
Education,0,0.0
Marital_Status,0,0.0
Income,24,1.071429
Kidhome,0,0.0
Teenhome,0,0.0
Dt_Customer,0,0.0
Recency,0,0.0
MntWines,0,0.0


Неизвестен доход одного процента людей из выборки. Можем избавиться от этих данных.

In [42]:
#data_raw = data_raw.dropna()
#data_raw.isna().sum()

Посмотрим, какие значения принимают категориальные данные

In [43]:
data_raw['Marital_Status'].value_counts()

Married     864
Together    580
Single      480
Divorced    232
Widow        77
Alone         3
YOLO          2
Absurd        2
Name: Marital_Status, dtype: int64

In [44]:
data_raw['Marital_Status'].replace(to_replace=["Alone", "YOLO", "Absurd"], value=["Single", "Single", "Single"], inplace=True)

Можно заметить, что есть два отдельных признака: Single и Alone, что в целом означает одно и то же. Заменим значения Alone на Single. Ответы "YOLO" и "Absurd" также заменим на "Single", так как их мало и они не несут нам почти никакой информации. Чисто интуитивно их можно отнести к "Single".

In [45]:
data_raw['Marital_Status'].value_counts()

Married     864
Together    580
Single      487
Divorced    232
Widow        77
Name: Marital_Status, dtype: int64

In [46]:
data_raw['Marital_Status'] = data_raw['Marital_Status'].astype('category')

Номер ID не нужен для дальнейшего анализа данных. Уберем признак.

In [47]:
data_raw = data_raw.drop(['ID'], axis=1)

In [48]:
def getEachTypeCols(data):
    bincol = [c for c in data.columns 
              if data[c].value_counts().index.isin([0,1]).all()]
    categorical = [c for c in data.columns 
                   if data[c].dtype.name == 'category']
    numerical = [c for c in data.columns
                 if ((data[c].dtype.name == 'int64' 
                      or data[c].dtype.name == 'float64'))
                 and c not in bincol]
    date = [c for c in data.columns 
                    if data[c].dtype.name == 'datetime64[ns]']
    ser = {'binary cols': bincol,
                'categorical cols': categorical,
                'numerical cols': numerical,
                'date cols': date}
    ret_ser = pd.Series(ser)
    return ret_ser

In [49]:
data_raw.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,1957-01-01,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,1954-01-01,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,1965-01-01,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,1984-01-01,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,1981-01-01,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


In [50]:
today = pd.Timestamp('now')
data_raw['age'] = (today - data_raw['Year_Birth']).astype('<m8[Y]').astype('int64')

In [51]:
data_raw.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,age
0,1957-01-01,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1,63
1,1954-01-01,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0,66
2,1965-01-01,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0,55
3,1984-01-01,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0,36
4,1981-01-01,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0,39


In [52]:
data_raw.dtypes

Year_Birth             datetime64[ns]
Education                    category
Marital_Status               category
Income                        float64
Kidhome                         int64
Teenhome                        int64
Dt_Customer            datetime64[ns]
Recency                         int64
MntWines                        int64
MntFruits                       int64
MntMeatProducts                 int64
MntFishProducts                 int64
MntSweetProducts                int64
MntGoldProds                    int64
NumDealsPurchases               int64
NumWebPurchases                 int64
NumCatalogPurchases             int64
NumStorePurchases               int64
NumWebVisitsMonth               int64
AcceptedCmp3                    int64
AcceptedCmp4                    int64
AcceptedCmp5                    int64
AcceptedCmp1                    int64
AcceptedCmp2                    int64
Complain                        int64
Z_CostContact                   int64
Z_Revenue   

In [53]:
data_raw.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,age
0,1957-01-01,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1,63
1,1954-01-01,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0,66
2,1965-01-01,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0,55
3,1984-01-01,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0,36
4,1981-01-01,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0,39


In [54]:
data_raw['Days_customer'] = (today - data_raw['Dt_Customer']).astype('<m8[D]').astype('int64')

In [55]:
a = getEachTypeCols(data_raw)
a.get('date cols')

['Year_Birth', 'Dt_Customer']

In [56]:
data_raw.drop(a.get('date cols'), axis=1, inplace=True)

In [57]:
data_raw.dtypes

Education              category
Marital_Status         category
Income                  float64
Kidhome                   int64
Teenhome                  int64
Recency                   int64
MntWines                  int64
MntFruits                 int64
MntMeatProducts           int64
MntFishProducts           int64
MntSweetProducts          int64
MntGoldProds              int64
NumDealsPurchases         int64
NumWebPurchases           int64
NumCatalogPurchases       int64
NumStorePurchases         int64
NumWebVisitsMonth         int64
AcceptedCmp3              int64
AcceptedCmp4              int64
AcceptedCmp5              int64
AcceptedCmp1              int64
AcceptedCmp2              int64
Complain                  int64
Z_CostContact             int64
Z_Revenue                 int64
Response                  int64
age                       int64
Days_customer             int64
dtype: object

Мы можем восстановить пустые значения с помощью модели линейной регрессии.

In [58]:
# Preparing all datasets to be used in the Linear Regression Model
X = data_raw
y = X["Income"]
y = y[-y.isna()]

In [59]:
X["Marital_Status"] = pd.Categorical(X["Marital_Status"])
X["Marital_Status"] = X["Marital_Status"].cat.codes

X["Education"] = pd.Categorical(X["Education"])
X["Education"] = X["Education"].cat.codes

In [60]:
x_pred = X[X.Income.isna()]
x_pred = x_pred.drop(columns="Income")
x_pred

Unnamed: 0,Education,Marital_Status,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,age,Days_customer
10,2,1,1,0,11,5,5,6,0,2,1,1,1,0,2,7,0,0,0,0,0,0,3,11,0,37,2522
27,2,2,1,0,19,5,1,3,3,263,362,0,27,0,0,1,0,0,0,0,0,0,3,11,0,34,2790
43,4,2,0,0,80,81,11,50,3,2,39,1,1,3,4,2,0,0,0,0,0,0,3,11,0,61,2532
48,2,2,2,1,96,48,5,48,6,10,7,3,2,1,4,6,0,0,0,0,0,0,3,11,0,69,2475
58,2,2,1,0,57,11,3,22,2,2,6,2,2,0,3,6,0,0,0,0,0,0,3,11,0,38,2673
71,0,1,1,0,25,25,3,43,17,4,17,3,3,0,3,8,0,0,0,0,0,0,3,11,0,47,2949
90,4,1,2,1,4,230,42,192,49,37,53,12,7,2,8,9,0,0,0,0,0,0,3,11,0,63,2883
91,2,2,1,1,45,7,0,8,2,0,1,1,1,0,2,7,0,0,0,0,0,0,3,11,0,63,2329
92,3,3,0,0,87,445,37,359,98,28,18,1,2,4,8,1,0,0,0,0,0,0,3,11,0,47,2514
128,4,1,0,1,23,352,0,27,10,0,15,3,6,1,7,6,0,0,0,0,0,0,3,11,0,59,2649


In [61]:
X = X[-X.Income.isna()]
X = X.drop(columns="Income")
X

Unnamed: 0,Education,Marital_Status,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,age,Days_customer
0,2,2,0,0,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1,63,2959
1,2,2,1,1,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0,66,2409
2,2,3,0,0,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0,55,2608
3,2,3,1,0,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0,36,2435
4,4,1,1,0,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0,39,2457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,2,1,0,1,46,709,43,182,42,118,247,2,9,3,4,5,0,0,0,0,0,0,3,11,0,53,2677
2236,4,3,2,1,56,406,0,30,0,0,8,7,8,2,5,7,0,0,0,1,0,0,3,11,0,74,2315
2237,2,0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,0,1,0,0,0,0,3,11,0,39,2451
2238,3,3,0,1,8,428,30,214,80,30,61,2,6,5,10,3,0,0,0,0,0,0,3,11,0,64,2452


In [62]:
from sklearn.linear_model import LinearRegression
# Linear Regression Model
reg = LinearRegression().fit(X, y)

In [63]:
from sklearn.metrics import mean_squared_error, r2_score
# Predictions
y_pred = reg.predict(x_pred)

In [64]:
pum = data_raw.loc[data_raw.Income.isna(), :].index
pum_1 = data_raw.loc[data_raw.Income.isna(), :]
pum_1

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,age,Days_customer
10,2,1,,1,0,11,5,5,6,0,2,1,1,1,0,2,7,0,0,0,0,0,0,3,11,0,37,2522
27,2,2,,1,0,19,5,1,3,3,263,362,0,27,0,0,1,0,0,0,0,0,0,3,11,0,34,2790
43,4,2,,0,0,80,81,11,50,3,2,39,1,1,3,4,2,0,0,0,0,0,0,3,11,0,61,2532
48,2,2,,2,1,96,48,5,48,6,10,7,3,2,1,4,6,0,0,0,0,0,0,3,11,0,69,2475
58,2,2,,1,0,57,11,3,22,2,2,6,2,2,0,3,6,0,0,0,0,0,0,3,11,0,38,2673
71,0,1,,1,0,25,25,3,43,17,4,17,3,3,0,3,8,0,0,0,0,0,0,3,11,0,47,2949
90,4,1,,2,1,4,230,42,192,49,37,53,12,7,2,8,9,0,0,0,0,0,0,3,11,0,63,2883
91,2,2,,1,1,45,7,0,8,2,0,1,1,1,0,2,7,0,0,0,0,0,0,3,11,0,63,2329
92,3,3,,0,0,87,445,37,359,98,28,18,1,2,4,8,1,0,0,0,0,0,0,3,11,0,47,2514
128,4,1,,0,1,23,352,0,27,10,0,15,3,6,1,7,6,0,0,0,0,0,0,3,11,0,59,2649


In [66]:
data_raw.loc[data_raw.Income.isna(), "Income"] = np.round(y_pred, 2)

In [67]:
data_raw.loc[pum, :]

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,age,Days_customer
10,2,1,30349.43,1,0,11,5,5,6,0,2,1,1,1,0,2,7,0,0,0,0,0,0,3,11,0,37,2522
27,2,2,82071.32,1,0,19,5,1,3,3,263,362,0,27,0,0,1,0,0,0,0,0,0,3,11,0,34,2790
43,4,2,49520.99,0,0,80,81,11,50,3,2,39,1,1,3,4,2,0,0,0,0,0,0,3,11,0,61,2532
48,2,2,44468.74,2,1,96,48,5,48,6,10,7,3,2,1,4,6,0,0,0,0,0,0,3,11,0,69,2475
58,2,2,33809.88,1,0,57,11,3,22,2,2,6,2,2,0,3,6,0,0,0,0,0,0,3,11,0,38,2673
71,0,1,27767.55,1,0,25,25,3,43,17,4,17,3,3,0,3,8,0,0,0,0,0,0,3,11,0,47,2949
90,4,1,49622.48,2,1,4,230,42,192,49,37,53,12,7,2,8,9,0,0,0,0,0,0,3,11,0,63,2883
91,2,2,36404.87,1,1,45,7,0,8,2,0,1,1,1,0,2,7,0,0,0,0,0,0,3,11,0,63,2329
92,3,3,67399.89,0,0,87,445,37,359,98,28,18,1,2,4,8,1,0,0,0,0,0,0,3,11,0,47,2514
128,4,1,51007.72,0,1,23,352,0,27,10,0,15,3,6,1,7,6,0,0,0,0,0,0,3,11,0,59,2649


# Визуализация данных

In [None]:
data_raw.describe()

In [None]:
data_raw.describe().shape

При беглом осмотре можно заключить, что два признака (Z_CostContact и Z_Revenue) для всех строк имеют одно и то же значение: 3 и 11 соответственно. Избавимся от данных, стандартное отклонение которых равняется нулю.

In [None]:
std = data_raw.describe().iloc[2,:]
const_lab = [std[std==0.00].index[0], std[std==0.00].index[1]]
const_lab

In [None]:
data_raw.drop(labels=const_lab, axis=1, inplace=True)

In [None]:
data_raw.shape

In [None]:
data_raw.describe()

In [None]:
data_raw.describe(include=['category'])

Максимум в признаке Income слишком отличается от среднего. Возможно, стоит избавиться от слишком больших значений. Проверим значения кванителей.

In [None]:
data_raw['Income'].quantile([0.005,.01,.05,.1,.5,.9,.95,.99,.995])

In [None]:
rows_to_drop = data_raw[(data_raw['Income'] < data_raw['Income'].quantile(0.005)) | (data_raw['Income'] > data_raw['Income'].quantile(0.995))].index
data = data_raw.drop(rows_to_drop)
data.shape

In [None]:
data['age'].quantile([0.005,.01,.05,.1,.5,.9,.95,.99,.999])

In [None]:
a = data['age'].sort_values(ascending=False)

In [None]:
a.head()

In [None]:
rows_to_drop = data[(data['age'] > data['age'].quantile(0.999))].index
data = data.drop(rows_to_drop)

In [None]:
data.describe()

In [None]:
data.shape

In [None]:
data['age'].quantile([0.005,.01,.05,.1,.5,.9,.95,.99,.999])

In [None]:
corr_mat = data.corr()

In [None]:
corr_mat

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(abs(corr_mat))
pass

Корреляция между определенными признаками и принятием какого-либо коммерческого предложения небольшая, однако наблюдается корреляция между доходом и покупаемыми продуктами, между продуктами и способом покупки.

In [None]:
corr_mat > 0.5

In [None]:
corr_mat.where(np.triu(corr_mat > 0.5, k=1)).stack().sort_values(ascending=False)

Видна зависимость количества покупок от уровня дохода, а также количество купленных продуктов одной категории от количества купленных продуктов другой категории.


# Бинаризация номинальных признаков

В датасете присутствуют два категориальных признака: Marital_Status и Education

In [None]:
data[getEachTypeCols(data).get('categorical cols')]

In [None]:
Education_dummies = pd.get_dummies(data['Education'])

In [None]:
Education_dummies.head()

In [None]:
data_test = pd.concat((data, Education_dummies), axis=1)
data_test.drop(['Education'], axis=1, inplace=True)

In [None]:
data_test

In [None]:
Mstatus_dummies = pd.get_dummies(data_test['Marital_Status'])
Mstatus_dummies.head()

In [None]:
data_test = pd.concat((data_test, Mstatus_dummies), axis=1)
data_test.drop(['Marital_Status'], axis=1, inplace=True)
data_test.head()

# Нормализация количественных признаков

In [None]:
data_numerical = data_test[getEachTypeCols(data_test).get('numerical cols')]
data_numerical.describe()

In [None]:
binary_data = data_test[getEachTypeCols(data_test).get('binary cols')]
binary_data.describe()

In [None]:
data_numerical = (data_numerical - data_numerical.mean(axis = 0))/data_numerical.std(axis = 0)
frames = [data_numerical, binary_data]
result = pd.concat(frames, axis=1)

In [None]:
result.head()

# Отбор признаков

In [None]:
X = result.drop('Response', axis = 1) # выбрасываем столбец 'Response'
y = result['Response']

In [None]:
X = X.to_numpy()
y = np.array(y, dtype='int')

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                    random_state = 42)

N_train, _ = X_train.shape 
N_test,  _ = X_test.shape 

print(N_train, N_test)

# Метод KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, y_train)

In [None]:
y_test_predict = knn.predict(X_test)
y_train_predict = knn.predict(X_train)
err_test  = np.mean(y_test  != y_test_predict)
err_train = np.mean(y_train != y_train_predict)
print(err_train, err_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_test_predict))
print(classification_report(y_test, y_test_predict))

In [None]:
y_actu = pd.Series(y_test, name='Actual')
y_pred = pd.Series(y_test_predict, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)

In [None]:
df_confusion

Посмотрим, какое количество соседей наилучшее для метода KNN в нашем случае.

In [None]:
from sklearn.model_selection import GridSearchCV
nnb = [1, 3, 5, 10, 15, 20, 25, 35, 45, 55]
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid = {'n_neighbors': nnb}, cv=10)
grid.fit(X_train, y_train)

best_cv_err = 1 - grid.best_score_
best_n_neighbors = grid.best_estimator_.n_neighbors
print(best_cv_err, best_n_neighbors)

In [None]:
knn1 = KNeighborsClassifier(n_neighbors = 15)
knn1.fit(X_train, y_train)

y_test_predict_1 = knn1.predict(X_test)
y_train_predict_1 = knn1.predict(X_train)

err_train_1 = np.mean(y_train != knn1.predict(X_train))
err_test_1  = np.mean(y_test  != knn1.predict(X_test))

print(err_train_1, err_test_1)

In [None]:
print(confusion_matrix(y_test, y_test_predict_1))
print(classification_report(y_test, y_test_predict_1))

# Вывод

Метод KNN показал плохие результаты для предсказания положительного отклика на маркетинговую кампанию. Значение "True positive" очень маленькое по сравнению с "False Negative" (значение recall для 1).
Необходимо менять метод или подбор параметров.
Код в дальнейшем ещё будет дорабатываться.

# PCA

In [None]:
scaled_data = preprocessing.scale(data_test.T)

In [None]:
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)

In [None]:
per_var = np.round(pca.explained_variance_ratio_*100, decimals=1)

In [None]:
per_var

In [None]:
labels = data_test.columns.tolist()
labels

In [None]:
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show

In [None]:
pca_df = pd.DataFrame(pca_data, columns=labels)
plt.scatter(pca_df.Income, pca_df.Kidhome)
plt.title('PCA Graph')
for sample in pca_df.index:
    plt.annotate(sample, (pca_df.Income.loc[sample], pca_df.Kidhome.loc[sample]))
plt.show()

# Метод главных компонент

In [None]:
pca_model = PCA(n_components=2)
pca_model.fit(result)
print("Главные компоненты (по строкам):")
print(pca_model.components_)
print("Дисперсии по главным компонентам:")
print(pca_model.explained_variance_)

In [None]:
print("Среднеквадратические отклонения:")
print(np.sqrt(pca_model.explained_variance_))

In [None]:
np.dot(pca_model.components_, pca_model.components_.T)

In [None]:
X_mean = np.mean(result, axis = 0)

In [None]:
result.shape

In [None]:
plt.scatter(result[:, 0], result[:, 1], color = 'b', alpha = 0.65)
v0 = pca_model.components_[0]
v1 = pca_model.components_[1]
#v0 = np.sqrt(pca_model.explained_variance_[0])*pca_model.components_[0]
#v1 = np.sqrt(pca_model.explained_variance_[1])*pca_model.components_[1]
plt.arrow(X_mean[0], X_mean[1], v0[0], v0[1], color = 'r', width = .005, head_width = .5)
plt.arrow(X_mean[0], X_mean[1], v1[0], v1[1], color = 'r', width = .005, head_width = .5)
plt.axis('equal');

In [None]:
X = result.drop('Response', axis = 1) # выбрасываем столбец 'Response'
y = result['Response']
N, d = X.shape
pca = PCA(n_components=2)
pca.fit(X)
pca_data = pca.transform(X)
print(X.shape)
print(pca_data.shape)

In [None]:
plt.figure(figsize = (8, 6))
colors = ['b', 'r']
for k in range(2):
    plt.scatter(pca_data[y == k, 0], pca_data[y == k, 1], color = colors[k], alpha = 0.55, label = str(k))
plt.legend()