### Логистическая регрессия. Разреженные матрицы. Открытый курс машинного обучения ods.ai

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [10]:
train_df = pd.read_csv('.../train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('.../test_sessions.csv',
                      index_col='session_id')

#### Предобработка данных

In [11]:
# Посмотрим на данные
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,,,,,,,,...,,,,,,,,,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [12]:
# приведем колонки time1, ..., time10 к временному формату
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# отсортируем данные по времени
train_df = train_df.sort_values(by='time1')

In [13]:
# приведем колонки site1, ..., site10 к целочисленному формату и заменим пропуски нулями
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

# загрузим словарик сайтов
with open(r".../site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# датафрейм словарика сайтов
sites_dict_df = pd.DataFrame(list(site_dict.keys()), 
                          index=list(site_dict.values()), 
                          columns=['site'])
print('Всего сайтов:', sites_dict_df.shape[0])

Всего сайтов: 48371


In [16]:
sites_dict_df.head()

Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [14]:
# наша целевая переменная
y_train = train_df['target']

# объединенная таблица исходных данных
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

# индекс, по которому будем отделять обучающую выборку от тестовой
idx_split = train_df.shape[0]

In [17]:
# табличка с индексами посещенных сайтов в сессии
full_sites = full_df[sites]
full_sites.shape

(336358, 10)

In [18]:
# последовательность с индексами
sites_flatten = full_sites.values.flatten()

# получаем разреженную матрицу
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                sites_flatten,
                                range(0, sites_flatten.shape[0] + 10, 10)))[:, 1:]

In [19]:
X_train_sparse = full_sites_sparse[:idx_split]
X_test_sparse = full_sites_sparse[idx_split:]

Создаем функцию, которая разбивает исходную выборку на тренировочную и тестовую, а также делает предсказание при помощи логистической регрессии. Функция возвращает метрику AUC-ROC. 

In [21]:
def get_auc_lr_valid(X, y, C=1, ratio = 0.9, seed=17):
    '''
    X, y – выборка
    ratio – в каком отношении поделить выборку
    C, seed – коэф-т регуляризации и random_state 
              логистической регрессии
    '''
    
    train_len = int(ratio * X.shape[0])
    X_train = X[train_len:, :]
    X_valid = X[:train_len, :]
    
    y_train = y[train_len:]
    y_valid = y[:train_len]
    
    logit = LogisticRegression(C=C, n_jobs=-1, random_state=seed)
    logit.fit(X_train, y_train)
    
    valid_pred = logit.predict_proba(X_valid)[:, 1]
    
    return roc_auc_score(y_valid, valid_pred)

Посмотрим на результаты модели

In [22]:
%%time
get_auc_lr_valid(X_train_sparse, y_train)

Wall time: 2.98 s


0.8237843603411366

Добавим новые признаки нашим данным

In [23]:
new_feat_train = pd.DataFrame(index = train_df.index)
new_feat_test = pd.DataFrame(index = test_df.index)

In [24]:
def find_morning(time): #функция ищет утреннее время
    
    if time.hour < 11:
        time = 1
        return time
    else:
        time = 0
        return time

In [25]:
#добавить признак год и месяц
new_feat_train['year_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
new_feat_test['year_month'] = test_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)

#добавить признак время
new_feat_train['start_time'] = train_df['time1'].apply(lambda ts: ts.hour)
new_feat_test['start_time'] = test_df['time1'].apply(lambda ts: ts.hour)

#добавить признак morning
new_feat_train['morning'] = train_df['time1'].apply(lambda ts: find_morning(ts))
new_feat_test['morning'] = test_df['time1'].apply(lambda ts: find_morning(ts))


In [26]:
scaler_year = StandardScaler().fit(new_feat_train['year_month'].values.reshape(-1, 1))
#признак год и месяц
new_feat_train['year_month_scaled'] = scaler_year.transform(new_feat_train['year_month'].values.reshape(-1, 1))
new_feat_test['year_month_scaled'] = scaler_year.transform(new_feat_test['year_month'].values.reshape(-1, 1))

scaler_time = StandardScaler().fit(new_feat_train['start_time'].values.reshape(-1, 1))
#признак время
new_feat_train['start_time_scaled'] = scaler_time.transform(new_feat_train['start_time'].values.reshape(-1, 1))
new_feat_test['start_time_scaled'] = scaler_time.transform(new_feat_test['start_time'].values.reshape(-1, 1))

In [27]:
new_feat_test.head()

Unnamed: 0_level_0,year_month,start_time,morning,year_month_scaled,start_time_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,201410,11,0,0.822948,-0.407823
2,201407,11,0,0.752287,-0.407823
3,201412,15,0,0.870055,0.858234
4,201411,10,1,0.846501,-0.724338
5,201405,15,0,0.705179,0.858234


In [28]:
X_train_sparse_new = hstack([X_train_sparse, new_feat_train['year_month_scaled'].values.reshape(-1,1)])
X_train_sparse_new = hstack([X_train_sparse, new_feat_train['start_time_scaled'].values.reshape(-1,1)])
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, new_feat_train['morning'].values.reshape(-1,1)]))

In [29]:
X_test_sparse_new = hstack([X_test_sparse, new_feat_test['year_month_scaled'].values.reshape(-1,1)])
X_test_sparse_new = hstack([X_test_sparse, new_feat_test['start_time_scaled'].values.reshape(-1,1)])
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, new_feat_test['morning'].values.reshape(-1,1)]))

Обучаем модель с учетом новых признаков

In [31]:
%%time
get_auc_lr_valid(X_train_sparse_new, y_train)

Wall time: 2.63 s


0.8662502430401882

Делаем предсказание

In [33]:
logit = LogisticRegression(C=1, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y_train)
logit_pred = logit.predict(X_test_sparse_new)

In [34]:
# функция для записи прогнозов в файл
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [305]:
write_to_submission_file(logit_pred, 'PATH')