<center>

# Материалы к проекту по идентификации пользователей

</center>

<center>Исполнитель: Глазунов А.В.</center>

Необходимые и просто полезные библиотеки и установки

In [3]:
from __future__ import print_function
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max.columns', 50)
import itertools
import random
import os
import re
from glob import glob
from scipy.sparse import csr_matrix,hstack
from tqdm import tqdm_notebook
from time import time
import pickle

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import  accuracy_score,roc_auc_score,f1_score,classification_report,roc_curve, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from scipy import stats
from statsmodels.stats.proportion import proportion_confint


import seaborn as sns
sns.set_style("darkgrid")
from matplotlib import pyplot as plt
%matplotlib inline

from plotly.offline import iplot
import plotly.graph_objs as go


from ipywidgets import interactive
import ipywidgets as widgets

from sklearn.manifold import TSNE

In [4]:
%load_ext watermark
%watermark -v -m -p numpy,scipy,pandas,matplotlib,statsmodels,sklearn 

CPython 3.7.3
IPython 7.6.1

numpy 1.16.4
scipy 1.2.1
pandas 0.24.2
matplotlib 3.1.0
statsmodels 0.9.0
sklearn 0.22.2.post1

compiler   : MSC v.1915 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 142 Stepping 12, GenuineIntel
CPU cores  : 4
interpreter: 64bit


Неделя 1

In [None]:
def prepare_train_set(path_to_csv_files, session_length=10):
    """Build a table of fixed-length site-ID sessions from per-user CSV logs.

    Every file matching ``user*.csv`` in ``path_to_csv_files`` is read, and the
    first run of digits in its name is used as the user ID. Sites are numbered
    from 1 in order of decreasing total visit count. Each user's visits are cut
    into consecutive, non-overlapping sessions of ``session_length`` sites; a
    trailing incomplete session is kept and padded with zeros.

    Parameters
    ----------
    path_to_csv_files : str
        Directory with the ``user*.csv`` files; each file needs a ``site`` column.
    session_length : int, default 10
        Number of site slots in one session.

    Returns
    -------
    data : pandas.DataFrame
        Integer table with columns ``site1`` ... ``site<session_length>`` and
        ``user_ID``, one row per session.
    sites_dict_sorted : dict
        Maps a site name to ``[site_id, visit_count]``.
    """
    csv_paths = list(glob(os.path.join(path_to_csv_files, 'user*.csv')))

    user_frames = []
    user_IDs = []
    for csv_path in tqdm_notebook(csv_paths):
        file_name = os.path.split(csv_path)[1]
        user_IDs.append(int(re.findall("[0-9]+", file_name)[0]))
        user_frames.append(pd.read_csv(csv_path))

    # Total number of visits of every site over all users.
    visit_counts = {}
    for frame in tqdm_notebook(user_frames):
        for site in frame.site.values:
            visit_counts[site] = visit_counts.get(site, 0) + 1

    # Most visited site gets ID 1; the sort is stable, so ties keep first-seen order.
    ranked = sorted(visit_counts.items(), key=lambda item: item[1], reverse=True)
    sites_dict_sorted = {name: [rank, count]
                         for rank, (name, count) in enumerate(ranked, start=1)}

    rows = []
    for frame, user_id in zip(tqdm_notebook(user_frames), user_IDs):
        site_ids = [sites_dict_sorted[site][0] for site in frame.site.values]
        for start in range(0, len(site_ids), session_length):
            chunk = site_ids[start:start + session_length]
            row = np.zeros(session_length + 1)  # zeros double as padding
            row[:len(chunk)] = chunk
            row[session_length] = user_id
            rows.append(row)

    columns = ['site' + str(num) for num in range(1, session_length + 1)] + ['user_ID']
    data = pd.DataFrame(rows, dtype=int)
    data.columns = columns

    return data, sites_dict_sorted

In [None]:
def to_sparse_format_doc(sessions):
    """Turn an iterable of site-ID sessions into a sparse count matrix.

    Each session becomes one row; every ID in it adds 1 to the column with
    that index. Column 0 is dropped from the result, so column ``j`` of the
    returned matrix corresponds to ID ``j + 1``.

    Parameters
    ----------
    sessions : iterable of iterables of int
        Site IDs of each session.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix with one row per session.
    """
    column_ids = []
    row_ends = [0]
    for session in tqdm_notebook(sessions):
        column_ids.extend(session)
        row_ends.append(len(column_ids))

    ones = [1] * len(column_ids)
    return csr_matrix((ones, column_ids, row_ends), dtype=int)[:, 1:]

In [None]:
def to_sparse_format(X):
    """Convert a dense 2-D array of site IDs into a sparse count matrix.

    Row ``i`` of the result counts how many times each ID occurs in row ``i``
    of ``X``. Column 0 is dropped, so column ``j`` of the result corresponds to
    ID ``j + 1`` (elsewhere in this notebook 0 is the padding value of short
    sessions).

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of non-negative integer IDs.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix with ``X.shape[0]`` rows.
    """
    n_rows, row_width = X.shape[0], X.shape[1]
    # Every row contributes exactly `row_width` entries, so the CSR row
    # pointer is simply 0, row_width, 2 * row_width, ...
    row_bounds = np.arange(n_rows + 1) * row_width
    flat_ids = X.reshape(-1)
    ones = np.ones(X.size, dtype=int)
    counts = csr_matrix((ones, flat_ids, row_bounds), dtype=int)
    return counts[:, 1:]

Неделя 2

In [None]:
def prepare_sparse_train_set_window(path_to_csv_files, site_freq_path, 
                                    session_length=10, window_size=10):
    """Cut user logs into sliding-window sessions and return sparse site counts.

    Each ``user*.csv`` file in ``path_to_csv_files`` is one user's visit log
    (a ``site`` column is required); the user ID is the first run of digits in
    the file name. A session holds up to ``session_length`` site IDs and a new
    session starts every ``window_size`` visits. A session that reaches the end
    of the log before it is full is kept and zero-padded.

    Parameters
    ----------
    path_to_csv_files : str
        Directory with the ``user*.csv`` files.
    site_freq_path : str
        Pickle holding the ``{site: [site_id, count]}`` dictionary. It is read
        only when ``session_length == 10`` and ``window_size == 10``; for any
        other combination the dictionary is rebuilt from the logs and this
        argument is ignored.
    session_length : int, default 10
        Maximum number of sites in one session.
    window_size : int, default 10
        Distance, in visits, between the starts of consecutive sessions.

    Returns
    -------
    X_sparse : scipy.sparse.csr_matrix
        One row per session; column ``j`` counts the visits to site ID ``j + 1``.
    y : numpy.ndarray
        User ID of every session.
    """
    # Load the per-user logs.
    files_paths = list(glob(os.path.join(path_to_csv_files,'user*.csv')))
    files = []
    user_IDs = []
    for path in tqdm_notebook(files_paths):
        f_name = os.path.split(path)[1]
        ID = int(re.findall("[0-9]+",f_name)[0])
        user_IDs.append(ID)
        files.append(pd.read_csv(path))
    user_IDs = np.array(user_IDs)

    # Site dictionary: site name -> [site ID, visit count].
    if (session_length==10) and (window_size==10):
        # Default configuration: reuse the dictionary that was pickled earlier.
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; only point site_freq_path at pickles you created yourself.
        with open(site_freq_path,'rb') as f:
            sites_dict_sorted = pickle.load(f)
    else:
        # Any other configuration: rebuild the dictionary from the user logs.
        sites_dictionary = {}
        for user in tqdm_notebook(files):
            for site in user.site.values:
                sites_dictionary[site] = sites_dictionary.get(site, 0) + 1

        d = sorted(sites_dictionary.items(), key=lambda item: item[1],reverse = True)
        sites_dict_sorted = {}
        for ii,pair in enumerate(d):
            # IDs start at 1 and follow decreasing visit count.
            sites_dict_sorted[pair[0]] = [ii+1,pair[1]]

    # Fill the session table with the IDs of the visited sites.
    Session_list = []

    for ind,user in enumerate(tqdm_notebook(files)):
        session = np.zeros(session_length+1)
        session[session_length] = user_IDs[ind]  # last slot holds the user ID

        sites = user.site.values
        ufile_length = sites.shape[0]
        site_position = 0  # position inside the current session
        pos_in_ufile = 0   # position inside the user's log
        while pos_in_ufile < ufile_length:
            session[site_position] = sites_dict_sorted[sites[pos_in_ufile]][0]
            pos_in_ufile += 1
            site_position += 1

            if site_position == session_length:
                # Session is full: store it and rewind so that the next one
                # starts window_size visits after the start of this one.
                Session_list.append(session)
                session = np.zeros(session_length+1)
                session[session_length] = user_IDs[ind]
                site_position = 0
                pos_in_ufile -= session_length - window_size

            if (pos_in_ufile == ufile_length) and (site_position != 0):
                # The log ended in the middle of a session: keep the partial one.
                Session_list.append(session)
                if site_position > window_size:
                    # Another window still starts inside the log: rewind to it.
                    session = np.zeros(session_length+1)
                    session[session_length] = user_IDs[ind]
                    pos_in_ufile -= site_position - window_size
                    site_position = 0

    Sessions = np.array(Session_list,dtype = int)

    # Split the user-ID column off the site columns.
    X, y = Sessions[:,:-1],Sessions[:,-1]

    # Sparse count matrix. Every row has exactly X.shape[1] entries, so the
    # CSR row pointer is an arithmetic progression; column 0 (padding) is dropped.
    data = np.ones(X.size, dtype=int)
    indices = X.reshape(-1)
    indptr = np.arange(X.shape[0] + 1) * X.shape[1]
    X_sparse = csr_matrix((data, indices, indptr), dtype=int)[:,1:]

    return X_sparse, y

In [None]:
# Load the 10-user session table, using the 'session_id' column as the index.
# NOTE(review): PATH_TO_DATA is not defined in the visible part of this notebook;
# it has to be set before this cell runs.
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_data_10users.csv'), 
                       index_col='session_id')

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
# Number of sessions recorded for each user.
train_df['user_ID'].value_counts()

Посчитаем распределение числа уникальных сайтов в каждой сессии из 10 посещенных подряд сайтов.

In [None]:
# Distinct values per session row; the last column of train_df is left out.
num_unique_sites = [len(np.unique(session_row))
                    for session_row in train_df.values[:, :-1]]

In [None]:
pd.Series(num_unique_sites).hist()

Проверим с помощью Q-Q-графика и критерия Шапиро–Уилка, распределена ли эта величина нормально.

In [None]:
# Q-Q plot of the per-session unique-site counts (probplot compares against
# the normal distribution by default).
stats.probplot(num_unique_sites, plot=plt)
plt.show()
# stats.shapiro returns the pair (W statistic, p-value), which fills both %f slots.
print("Shapiro-Wilk normality test, W-statistic: %f, p-value: %f" % stats.shapiro(num_unique_sites))

Проверим гипотезу о том, что пользователь хотя бы раз зайдет на сайт, который он уже ранее посетил в сессии из 10 сайтов. Проверим с помощью биномиального критерия для доли, что доля случаев, когда пользователь повторно посетил какой-то сайт (то есть число уникальных сайтов в сессии < 10) велика: больше 95% 

In [None]:
# 1 for sessions with fewer than 10 distinct values (some site was repeated), else 0.
has_two_similar = (np.array(num_unique_sites) < 10).astype('int')
has_two_similar 

In [None]:
# One-sided binomial test: H0 "share of sessions with a repeat is 0.95"
# against H1 "the share is greater than 0.95".
# NOTE(review): stats.binom_test was removed in SciPy 1.12; on a newer SciPy use
# stats.binomtest(k, n, p=0.95, alternative='greater').pvalue instead.
pi_val = stats.binom_test(sum(has_two_similar),has_two_similar.shape[0], p=0.95,alternative = 'greater')
pi_val

 95% доверительный интервал Уилсона для доли случаев, когда пользователь повторно посетил какой-то сайт

In [None]:
# Wilson confidence interval for the share of sessions with a repeated site
# (proportion_confint's default alpha of 0.05 gives the 95% interval).
wilson_interval = proportion_confint(sum(has_two_similar), has_two_similar.shape[0], method = 'wilson')
wilson_interval

Распределение частоты посещения сайтов (сколько раз тот или иной сайт попадается в выборке) для сайтов, которые были посещены как минимум 1000 раз

In [None]:
# Load the pickled site dictionary for the 10-user data set.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles you created yourself.
with open(os.path.join(PATH_TO_DATA,'site_freq_10users.pkl'), 'rb') as f:
    freq_dict = pickle.load(f)
freq_dict

In [None]:
# One row per site after the transpose. Assuming freq_dict has the
# {site: [site_id, count]} layout built by prepare_train_set, column 0 is the
# site ID and column 1 the visit count — TODO confirm for the loaded pickle.
df_freqs = pd.DataFrame(freq_dict).T
# Histogram of column 1 restricted to rows where it is at least 1000.
df_freqs[df_freqs[1]>=1000][1].hist()

In [None]:
site_freqs = df_freqs[df_freqs[1]>=1000][1].values
site_freqs

In [None]:
len(freq_dict)

Каков 95% доверительный интервал для средней частоты появления сайта в выборке?

In [None]:
def get_bootstrap_samples(data, n_samples, random_seed=17):
    """Draw bootstrap resamples (with replacement) of a one-dimensional sample.

    Parameters
    ----------
    data : array-like
        Observations to resample. Lists and pandas Series are accepted as well
        as NumPy arrays; elements are always picked by position.
    n_samples : int
        Number of resamples to draw.
    random_seed : int, default 17
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random generator.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, len(data))``; every row is one resample.
    """
    # Fancy indexing with a 2-D index array only works on ndarrays, so coerce
    # lists / Series first (a no-op for ndarray input).
    data = np.asarray(data)
    np.random.seed(random_seed)
    indices = np.random.randint(0, len(data), (n_samples, len(data)))
    samples = data[indices]
    return samples

In [None]:
def stat_intervals(stat, alpha):
    """Return the percentile interval that covers ``1 - alpha`` of ``stat``.

    Parameters
    ----------
    stat : array-like
        Values of a statistic (for example bootstrap estimates).
    alpha : float
        Total share of values left outside the interval, split equally
        between the two tails.

    Returns
    -------
    numpy.ndarray
        Two values: the lower and the upper bound.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [None]:
# Bootstrap the values of column 1 of df_freqs. The number of resamples is set
# to the number of values, so the result is a square n x n array.
samples = get_bootstrap_samples(df_freqs[1].values,len(df_freqs[1].values))
samples

In [None]:
# Mean of every bootstrap resample (one value per row of `samples`).
means = [np.mean(resample) for resample in samples]
# Mean of the original values, shown for comparison.
df_freqs[1].values.mean()

In [None]:
# 95% percentile interval of the bootstrap means (alpha = 0.05).
left, right = stat_intervals(means,0.05)
print(round(left,3),round(right,3))

Неделя 3

In [None]:
def prepare_train_set_with_fe(path_to_csv_files, site_freq_path, feature_names,
                                    session_length=10, window_size=10):
    """Build sliding-window sessions with site IDs and basic time features.

    Each ``user*.csv`` file in ``path_to_csv_files`` is one user's log with
    ``site`` and ``timestamp`` columns; the user ID is the first run of digits
    in the file name. A session holds up to ``session_length`` visits and a new
    session starts every ``window_size`` visits; a session cut short by the end
    of the log is kept, with unused slots left at zero.

    Values are written into each row by position:

    * ``0 .. session_length - 1`` – site IDs;
    * ``session_length .. 2 * session_length - 2`` – seconds between
      consecutive visits of the session;
    * ``-5`` – seconds between the first and the last visit of the session;
    * ``-4`` – number of distinct sites in the session;
    * ``-3`` – hour of the first visit;
    * ``-2`` – weekday of the first visit (Monday is 0);
    * ``-1`` – user ID.

    Parameters
    ----------
    path_to_csv_files : str
        Directory with the ``user*.csv`` files.
    site_freq_path : str
        Pickle with the ``{site: [site_id, count]}`` dictionary. It is read
        only when ``session_length == 10`` and ``window_size == 10``; otherwise
        the dictionary is rebuilt from the logs and this argument is ignored.
    feature_names : list of str
        Column names of the result; its length fixes the row width and must
        accommodate the layout above.
    session_length : int, default 10
        Maximum number of visits in one session.
    window_size : int, default 10
        Distance, in visits, between the starts of consecutive sessions.

    Returns
    -------
    pandas.DataFrame
        One row per session with columns ``feature_names``, cast to int.
    """
    # Load the per-user logs.
    files_paths = list(glob(os.path.join(path_to_csv_files,'user*.csv')))
    files = []
    user_IDs = []
    for path in tqdm_notebook(files_paths):
        f_name = os.path.split(path)[1]
        ID = int(re.findall("[0-9]+",f_name)[0])
        user_IDs.append(ID)
        files.append(pd.read_csv(path))
    user_IDs = np.array(user_IDs)

    # Site dictionary: site name -> [site ID, visit count].
    if (session_length==10) and (window_size==10):
        # Default configuration: reuse the dictionary that was pickled earlier.
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; only point site_freq_path at pickles you created yourself.
        with open(site_freq_path,'rb') as f:
            sites_dict_sorted = pickle.load(f)
    else:
        # Any other configuration: rebuild the dictionary from the user logs.
        sites_dictionary = {}
        for user in tqdm_notebook(files):
            for site in user.site.values:
                sites_dictionary[site] = sites_dictionary.get(site, 0) + 1

        d = sorted(sites_dictionary.items(), key=lambda item: item[1],reverse = True)
        sites_dict_sorted = {}
        for ii,pair in enumerate(d):
            # IDs start at 1 and follow decreasing visit count.
            sites_dict_sorted[pair[0]] = [ii+1,pair[1]]

    # Fill the session table.
    Session_list = []

    for ind,user in enumerate(tqdm_notebook(files)):
        sites = user.site.values
        ufile_length = sites.shape[0]
        if ufile_length == 0:
            # An empty log yields no sessions and has no first timestamp to read.
            continue

        # Keep the parsed timestamps in a local variable: assigning a new
        # attribute on a DataFrame does not create a column and triggers a
        # pandas UserWarning.
        timestamps = pd.to_datetime(user.timestamp)
        weekdays = timestamps.dt.weekday.values
        hours = timestamps.dt.hour.values
        seconds = timestamps.apply(lambda x:x.timestamp()).values

        session = np.zeros(len(feature_names))
        session[-1] = user_IDs[ind]  # user ID
        session[-2] = weekdays[0]    # weekday of the session start
        session[-3] = hours[0]       # hour of the session start

        site_position = 0  # position inside the current session
        pos_in_ufile = 0   # position inside the user's log
        while pos_in_ufile < ufile_length:
            session[site_position] = sites_dict_sorted[sites[pos_in_ufile]][0]
            if site_position > 0:
                # Seconds elapsed since the previous visit of this session.
                session[site_position + session_length-1] = seconds[pos_in_ufile] -\
                                                            seconds[pos_in_ufile-1]
            pos_in_ufile += 1
            site_position += 1

            if site_position == session_length:
                # Session is full: finalise and store it.
                session[-4] = np.unique(session[:session_length]).shape[0]
                session[-5] = seconds[pos_in_ufile-1] -\
                                seconds[pos_in_ufile-session_length]
                Session_list.append(session)

                # Start the next session window_size visits after this one began.
                session = np.zeros(len(feature_names))
                session[-1] = user_IDs[ind]
                site_position = 0
                pos_in_ufile -= session_length - window_size
                # `<` rather than `!=`: with window_size > session_length the
                # position can jump past the end of the log.
                if pos_in_ufile < ufile_length:
                    session[-2] = weekdays[pos_in_ufile]
                    session[-3] = hours[pos_in_ufile]

            if (pos_in_ufile == ufile_length) and (site_position != 0):
                # The log ended in the middle of a session: keep the partial one.
                session[-4] = np.unique(session[:site_position]).shape[0]
                session[-5] = seconds[pos_in_ufile-1]-\
                                    seconds[pos_in_ufile-site_position]
                Session_list.append(session)

                if site_position > window_size:
                    # Another window still starts inside the log: rewind to it.
                    session = np.zeros(len(feature_names))
                    session[-1] = user_IDs[ind]
                    pos_in_ufile -= site_position - window_size
                    site_position = 0
                    session[-2] = weekdays[pos_in_ufile]
                    session[-3] = hours[pos_in_ufile]

    Sessions = pd.DataFrame(Session_list,columns=feature_names, dtype = int)

    return Sessions

In [None]:
# Readable names for the ten user IDs.
id_name_dict = {128: 'Mary-Kate', 39: 'Ashley', 207: 'Lindsey', 127: 'Naomi', 237: 'Avril',
               33: 'Bob', 50: 'Bill', 31: 'John', 100: 'Dick', 241: 'Ed'}
# `replace` leaves values that are not dictionary keys untouched, so running
# this cell a second time keeps the names; `map` would turn them all into NaN.
train_data_10users['target'] = train_data_10users['target'].replace(id_name_dict)

In [None]:
color_dic = {'Mary-Kate': 'pink', 'Ashley': 'darkviolet', 'Lindsey':'blueviolet', 
             'Naomi': 'hotpink', 'Avril': 'orchid', 
             'Bob': 'firebrick', 'Bill': 'gold', 'John': 'forestgreen', 
             'Dick': 'slategrey', 'Ed':'brown'}

In [None]:
train_data_10users.head()

In [None]:
# Distribution of 'session_timespan'; only values between 0 and 200 are binned.
plt.hist(train_data_10users['session_timespan'],color='darkviolet',range = (0,200))

plt.xlabel('Длина сессии в секундах,с')
plt.ylabel('Количество сессий')
plt.title('Гистограмма распределения длины сессии')

plt.grid(True)
plt.show()

In [None]:
# Distribution of the '#unique_sites' column over all sessions.
plt.hist(train_data_10users['#unique_sites'],color='aqua')  
plt.xlabel('Число уникальных сайтов в сессии')
plt.ylabel('Количество сессий')
plt.title('Гистограмма распределения числа уникальных сайтов в сессии')
plt.grid(True)
plt.show()

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))

# One panel per user (filled row by row): histogram of '#unique_sites'.
for idx,(user,sub_df) in enumerate(train_data_10users.groupby('target')):
    ax = axes[idx // 4,idx % 4]
    ax.hist(sub_df['#unique_sites'],color=color_dic[user])  
    ax.set(xlabel='Число уникальных сайтов в сессии', ylabel='Количество сессий')
    ax.legend([user])
    # Switch the grid on for this user's own axes; plt.grid would only act on
    # the current axes, not on `ax`.
    ax.grid(True)

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))

# One panel per user (filled row by row): histogram of the 'start_hour' column.
for idx, (user, sub_df) in  enumerate(train_data_10users.groupby('target')): 
    ax = axes[idx // 4,idx % 4]
    ax.hist(sub_df.start_hour,color = color_dic[user])
    ax.set(xlabel = 'Час начала сессии',ylabel = 'Количество сессий')
    ax.legend([user])
    ax.grid(True)

In [None]:
# One bin per weekday code 0..6, centred on the integer so that every bar sits
# exactly on its tick label. Using the number of distinct days as `bins`
# stretches the bins over [min, max] and shifts the bars away from the ticks.
plt.hist(train_data_10users['day_of_week'], color = 'sienna',bins=np.arange(8) - 0.5)
plt.xlabel('дни недели')
plt.ylabel('количество сессий')
plt.title('Гистограмма распредления дня недели начала сессии')
plt.xticks(np.linspace(0,6,7),['Пн', 'Вт', 'Ср', 'Чт', 'Пт', 'Сб', 'Вс'])
plt.show()

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))

# One panel per user (filled row by row): histogram of the session start weekday.
for idx, (user, sub_df) in  enumerate(train_data_10users.groupby('target')): 
    ax = axes[idx//4,idx%4]    
    # Fixed bins centred on the weekday codes 0..6. Taking the number of
    # distinct days as `bins` merges different weekdays into one bar whenever
    # a user has gaps (e.g. days 0, 1, 2, 4, 6 -> five bins of width 1.2).
    ax.hist(sub_df.day_of_week,color = color_dic[user],bins = np.arange(8) - 0.5)
    ax.set(xlabel='дни недели',ylabel='количество сессий')
    ax.set_xticks(range(7))
    ax.set_xticklabels(['Пн', 'Вт', 'Ср', 'Чт', 'Пт', 'Сб', 'Вс'] )
    ax.grid(True)
    ax.legend([user])

In [None]:
# One row per site with columns 'id' and 'freq'.
# NOTE(review): site_freq_10users is not defined in the visible part of the
# notebook; presumably it is the {site: [id, freq]} dictionary — confirm.
df_freq = pd.DataFrame(site_freq_10users).T
df_freq.columns = ['id','freq']
df_freq.head(10)

In [None]:
top10_freqs = list(df_freq[df_freq.id <= 10].freq)
top10_sites = list(df_freq[df_freq.id <= 10].index)

In [None]:
print(top10_sites)
print(top10_freqs)

In [None]:
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while keywords work in every seaborn version.
sns.barplot(x=top10_sites, y=top10_freqs)
plt.ylabel('Частота посещения')

plt.title('Частоты посещения топ-10 сайтов')
plt.xticks(rotation=90)
plt.grid(True)


plt.show()

In [None]:
def feature_engineering(path_to_csv_files, site_freq_path, feature_names,
                                    session_length=10, window_size=10):
    """Build sliding-window sessions with basic and additional features.

    Each ``user*.csv`` file in ``path_to_csv_files`` is one user's log with
    ``site`` and ``timestamp`` columns; the user ID is the first run of digits
    in the file name. A session holds up to ``session_length`` visits and a new
    session starts every ``window_size`` visits; a session cut short by the end
    of the log is kept, with unused slots left at zero.

    Values are written into each row by position. From the start:

    * ``0 .. session_length - 1`` – site IDs;
    * ``session_length .. 2 * session_length - 2`` – seconds between
      consecutive visits of the session.

    From the end (``k`` is a site ID, ``m = len(site dictionary) // 50``):

    * ``-1`` user ID, ``-2`` weekday of the first visit (Monday is 0),
      ``-3`` hour of the first visit, ``-4`` number of distinct sites,
      ``-5`` seconds between the first and the last visit;
    * ``-36 + k`` for ``k`` in 1..30 – accumulated seconds credited to site ``k``;
    * ``-66 + k`` for ``k`` in 1..30 – 1 if site ``k`` occurs in the session;
    * ``-76 + (k - (m - 10))`` for ``k`` in ``m - 9 .. m`` – visit count of site ``k``;
    * ``-86 + k`` for ``k`` in 1..10 – visit count of site ``k``.

    Parameters
    ----------
    path_to_csv_files : str
        Directory with the ``user*.csv`` files.
    site_freq_path : str
        Pickle with the ``{site: [site_id, count]}`` dictionary. It is read
        only when ``session_length == 10`` and ``window_size == 10``; otherwise
        the dictionary is rebuilt from the logs and this argument is ignored.
    feature_names : list of str
        Column names of the result; its length fixes the row width and must
        accommodate the layout above.
    session_length : int, default 10
        Maximum number of visits in one session.
    window_size : int, default 10
        Distance, in visits, between the starts of consecutive sessions.

    Returns
    -------
    pandas.DataFrame
        One row per session with columns ``feature_names``, cast to int.
    """
    # Load the per-user logs.
    files_paths = list(glob(os.path.join(path_to_csv_files,'user*.csv')))
    files = []
    user_IDs = []
    for path in tqdm_notebook(files_paths):
        f_name = os.path.split(path)[1]
        ID = int(re.findall("[0-9]+",f_name)[0])
        user_IDs.append(ID)
        files.append(pd.read_csv(path))
    user_IDs = np.array(user_IDs)

    # Site dictionary: site name -> [site ID, visit count].
    if (session_length==10) and (window_size==10):
        # Default configuration: reuse the dictionary that was pickled earlier.
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; only point site_freq_path at pickles you created yourself.
        with open(site_freq_path,'rb') as f:
            sites_dict_sorted = pickle.load(f)
    else:
        # Any other configuration: rebuild the dictionary from the user logs.
        sites_dictionary = {}
        for user in tqdm_notebook(files):
            for site in user.site.values:
                sites_dictionary[site] = sites_dictionary.get(site, 0) + 1

        d = sorted(sites_dictionary.items(), key=lambda item: item[1],reverse = True)
        sites_dict_sorted = {}
        for ii,pair in enumerate(d):
            # IDs start at 1 and follow decreasing visit count.
            sites_dict_sorted[pair[0]] = [ii+1,pair[1]]

    # Bounds of the ten "mid-popularity" site IDs: mid_lower < ID <= mid_upper.
    dict_len = len(sites_dict_sorted)
    mid_upper = dict_len//50
    mid_lower = mid_upper - 10

    # Fill the session table.
    Session_list = []

    for ind,user in enumerate(tqdm_notebook(files)):
        sites = user.site.values
        ufile_length = sites.shape[0]
        if ufile_length == 0:
            # An empty log yields no sessions and has no first timestamp to read.
            continue

        # Keep the parsed timestamps in a local variable: assigning a new
        # attribute on a DataFrame does not create a column and triggers a
        # pandas UserWarning.
        timestamps = pd.to_datetime(user.timestamp)
        weekdays = timestamps.dt.weekday.values
        hours = timestamps.dt.hour.values
        seconds = timestamps.apply(lambda x:x.timestamp()).values

        session = np.zeros(len(feature_names))
        session[-1] = user_IDs[ind]  # user ID
        session[-2] = weekdays[0]    # weekday of the session start
        session[-3] = hours[0]       # hour of the session start

        site_position = 0  # position inside the current session
        pos_in_ufile = 0   # position inside the user's log
        while pos_in_ufile < ufile_length:
            session[site_position] = sites_dict_sorted[sites[pos_in_ufile]][0]
            site_id = int(session[site_position])

            if site_position > 0:
                # Seconds elapsed since the previous visit of this session.
                gap = seconds[pos_in_ufile] - seconds[pos_in_ufile-1]
                session[site_position + session_length-1] = gap

                # Time feature for the top-30 sites.
                # NOTE(review): the gap is the time since the *previous* visit,
                # yet it is credited to the *current* site; confirm this is the
                # intended definition of viewing time.
                if site_id <= 30:
                    session[-5 - 31 + site_id] += gap

            # Indicator: a top-30 site occurs in the session.
            if site_id <= 30:
                session[-5 - 61 + site_id] = 1

            # Visit counts of ten less popular (more specific) sites.
            if mid_lower < site_id <= mid_upper:
                position10 = site_id - mid_lower  # 1..10 within the group
                session[-5 - 71 + position10] += 1

            # Visit counts of the ten most popular sites.
            if site_id <= 10:
                session[-5 - 81 + site_id] += 1

            pos_in_ufile += 1
            site_position += 1

            if site_position == session_length:
                # Session is full: finalise and store it.
                session[-4] = np.unique(session[:session_length]).shape[0]
                session[-5] = seconds[pos_in_ufile-1] -\
                                seconds[pos_in_ufile-session_length]
                Session_list.append(session)

                # Start the next session window_size visits after this one began.
                session = np.zeros(len(feature_names))
                session[-1] = user_IDs[ind]
                site_position = 0
                pos_in_ufile -= session_length - window_size
                # `<` rather than `!=`: with window_size > session_length the
                # position can jump past the end of the log.
                if pos_in_ufile < ufile_length:
                    session[-2] = weekdays[pos_in_ufile]
                    session[-3] = hours[pos_in_ufile]

            if (pos_in_ufile == ufile_length) and (site_position != 0):
                # The log ended in the middle of a session: keep the partial one.
                session[-4] = np.unique(session[:site_position]).shape[0]
                session[-5] = seconds[pos_in_ufile-1]-\
                                    seconds[pos_in_ufile-site_position]
                Session_list.append(session)

                if site_position > window_size:
                    # Another window still starts inside the log: rewind to it.
                    session = np.zeros(len(feature_names))
                    session[-1] = user_IDs[ind]
                    pos_in_ufile -= site_position - window_size
                    site_position = 0
                    session[-2] = weekdays[pos_in_ufile]
                    session[-3] = hours[pos_in_ufile]

    Sessions = pd.DataFrame(Session_list,columns=feature_names, dtype = int)

    return Sessions

In [None]:
# Give the users readable names (the colour dictionary is keyed by these names).
# `replace` leaves values that are not dictionary keys untouched, so running
# this cell a second time keeps the names; `map` would turn them all into NaN.
train10users_many_fe['target'] = train10users_many_fe['target'].replace(id_name_dict)

In [None]:
# Pairwise plots of time_diff1..time_diff4, coloured by user.
sub1 = ['time_diff' + str(i) for i in range(1,5)]
sns.pairplot(train10users_many_fe[sub1+['target']],hue='target',palette=color_dic, diag_kind="kde")

In [None]:
# Pairwise plots of freq__top10_1..freq__top10_4, coloured by user.
sub2 =  ['freq__top10_' + str(i) for i in range(1,5)]
sns.pairplot(train10users_many_fe[sub2+['target']],hue='target',palette=color_dic, diag_kind="kde")

In [None]:
# Pairwise plots of freq__mid10_1..freq__mid10_4, coloured by user.
# This cell used to repeat the freq__top10_ columns of the previous cell,
# which left the freq__mid10_ group without a pair plot.
sub3 =  ['freq__mid10_' + str(i) for i in range(1,5)]
sns.pairplot(train10users_many_fe[sub3+['target']],hue='target',palette=color_dic, diag_kind="kde")

In [None]:
# Pairwise plots of ind_top1..ind_top4, coloured by user.
sub4 = ['ind_top'+str(i) for i in range(1,5)]
sns.pairplot(train10users_many_fe[sub4+['target']],hue='target',palette=color_dic, diag_kind="kde")

In [None]:
# Pairwise plots of time_top1..time_top4, coloured by user.
sub5 = ['time_top'+str(i) for i in range(1,5)]
sns.pairplot(train10users_many_fe[sub5+['target']],hue='target',palette=color_dic, diag_kind="kde")

In [None]:
# Pairwise plots of the four session-level features, coloured by user.
sub6 = ['session_timespan', '#unique_sites', 'start_hour','day_of_week']
sns.pairplot(train10users_many_fe[sub6+['target']],hue='target',palette=color_dic, diag_kind="kde")

In [None]:
from plotly.offline import iplot
import plotly.graph_objs as go

# Number of sessions per user, in groupby (sorted by name) order.
# `users` is reused by the interactive plotting cells further down.
number = []
users = []
for (user, sub_df) in  train10users_many_fe.groupby('target'): 
    number.append(sub_df.shape[0]) 
    users.append(user)
    
# Interactive plotly bar chart of the session counts.
trace = go.Bar(
    x = users,
    y = number
)
layout = go.Layout(
    title='Количество сессий в зависимости от пользователя',
    
)

fig = go.Figure(data = [trace], layout = layout)
# White background with light grid lines and black axis lines.
fig.update_layout(plot_bgcolor='white')
fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='#e9e9e9')
fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='#e9e9e9')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

# Bar and hover-label colours.
fig.data[0].marker.line.color = "black"
fig.data[0].marker.color = "LightSeaGreen"
fig.data[0].hoverlabel.bgcolor = "white"



iplot(fig)
# Total number of sessions over all users.
print(sum(number))

In [None]:
from __future__ import print_function
from ipywidgets import interactive
import ipywidgets as widgets

def f(num):
    """Bar chart of each user's mean ``time_diff<num>`` value.

    ``num`` selects the column ``'time_diff' + str(num)`` of the global
    ``train10users_many_fe`` frame; bars are coloured through ``color_dic``.
    """
    plt.figure(figsize=(10,7))
    # Labels and heights come from the same groupby result, so they cannot get
    # out of step and the function does not depend on the global `users` list.
    means = train10users_many_fe.groupby('target')['time_diff'+str(num)].mean()
    # x / y by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=list(means.index), y=means.values, palette = color_dic)
    plt.ylabel('Среднее время за сессию')
    plt.title(f'Средние значения для времени просмотра {num} сайта из сессии')    
    plt.grid(True)
    plt.show()

# Slider over the nine time_diff columns.
interactive_plot = interactive(f, num=widgets.IntSlider(min=1, max=9, step=1, value=1))
interactive_plot

In [None]:
def f2(num):
    """Bar chart of each user's mean ``freq__top10_<num>`` value.

    ``num`` selects the column ``'freq__top10_' + str(num)`` of the global
    ``train10users_many_fe`` frame; bars are coloured through ``color_dic``.
    """
    plt.figure(figsize=(10,7))
    # Labels and heights come from the same groupby result, so they cannot get
    # out of step and the function does not depend on the global `users` list.
    means = train10users_many_fe.groupby('target')['freq__top10_'+str(num)].mean()
    # x / y by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=list(means.index), y=means.values, palette = color_dic)
    plt.ylabel('Средняя частота за сессию')
    plt.title(f'Средняя частота просмотра сайта {num} за сессию из топ 10 сайтов')    
    plt.grid(True)
    plt.show()

# Slider over the ten freq__top10_ columns.
interactive_plot = interactive(f2, num=widgets.IntSlider(min=1, max=10, step=1, value=1))
interactive_plot

In [None]:
def f3(num):
    """Bar chart of each user's mean ``freq__mid10_<num>`` value.

    ``num`` selects the column ``'freq__mid10_' + str(num)`` of the global
    ``train10users_many_fe`` frame; bars are coloured through ``color_dic``.
    """
    plt.figure(figsize=(10,7))
    # Labels and heights come from the same groupby result, so they cannot get
    # out of step and the function does not depend on the global `users` list.
    means = train10users_many_fe.groupby('target')['freq__mid10_'+str(num)].mean()
    # x / y by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=list(means.index), y=means.values, palette = color_dic)
    plt.ylabel('Средняя частота за сессию')
    plt.title(f'Средняя частота просмотра сайта {num} за сессию из 10 менее популярных сайтов')    
    plt.grid(True)
    plt.show()

# Slider over the ten freq__mid10_ columns.
interactive_plot = interactive(f3, num=widgets.IntSlider(min=1, max=10, step=1, value=1))
interactive_plot

In [None]:
def f4(num):
    """Bar chart of each user's mean ``time_top<num>`` value.

    ``num`` selects the column ``'time_top' + str(num)`` of the global
    ``train10users_many_fe`` frame; bars are coloured through ``color_dic``.
    """
    plt.figure(figsize=(10,7))
    # Labels and heights come from the same groupby result, so they cannot get
    # out of step and the function does not depend on the global `users` list.
    means = train10users_many_fe.groupby('target')['time_top'+str(num)].mean()
    # x / y by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=list(means.index), y=means.values, palette = color_dic)
    plt.ylabel('Среднее время за сессию')
    plt.title(f'Средние значения для времени просмотра сайта {num} из топ 30 сайтов')    
    plt.grid(True)
    plt.show()

# Slider over the thirty time_top columns.
interactive_plot = interactive(f4, num=widgets.IntSlider(min=1, max=30, step=1, value=1))
interactive_plot

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))


# One panel per user: time_diff1 values strictly between 100 and 1000.
for idx, (user, sub_df) in  enumerate(train10users_many_fe.groupby('target')): 
    ax = axes[idx//4,idx%4]    
    # A single combined mask: chaining two boolean filters applies the second
    # mask to an already filtered frame, so pandas has to reindex it and warns.
    in_range = (sub_df.time_diff1 > 100) & (sub_df.time_diff1 < 1000)
    ax.hist(sub_df[in_range].time_diff1,color = color_dic[user])
    ax.set(xlabel='time_diff1',ylabel='количество сессий')
    
    ax.grid(True)
    ax.legend([user])

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))


# One panel per user: positive values of freq__top10_1 only.
for idx, (user, sub_df) in  enumerate(train10users_many_fe.groupby('target')): 
    ax = axes[idx//4,idx%4]    
    ax.hist(sub_df[sub_df.freq__top10_1>0].freq__top10_1,color = color_dic[user])
    ax.set(xlabel='freq__top10_1',ylabel='количество сессий')
    
    ax.grid(True)
    ax.legend([user])

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))


# One panel per user: positive values of freq__mid10_1 only.
for idx, (user, sub_df) in  enumerate(train10users_many_fe.groupby('target')): 
    ax = axes[idx//4,idx%4]    
    ax.hist(sub_df[sub_df.freq__mid10_1>0].freq__mid10_1,color = color_dic[user])
    ax.set(xlabel='freq__mid10_1',ylabel='количество сессий')
    
    ax.grid(True)
    ax.legend([user])

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))


# One panel per user: distribution of the ind_top1 column.
for idx, (user, sub_df) in  enumerate(train10users_many_fe.groupby('target')): 
    ax = axes[idx//4,idx%4]    
    ax.hist(sub_df.ind_top1,color = color_dic[user])
    ax.set(xlabel='ind_top1',ylabel='количество сессий')
    
    ax.grid(True)
    ax.legend([user])

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))


# One panel per user: positive values of time_top1 only.
for idx, (user, sub_df) in  enumerate(train10users_many_fe.groupby('target')): 
    ax = axes[idx//4,idx%4]    
    ax.hist(sub_df[sub_df.time_top1>0].time_top1,color = color_dic[user])
    ax.set(xlabel='time_top1',ylabel='количество сессий')
    
    ax.grid(True)
    ax.legend([user])

In [None]:
from sklearn.manifold import TSNE
# 2-D t-SNE embedding of the feature table (default metric, fixed seed).
# NOTE(review): new_features_10users is not defined in the visible part of the notebook.
X_new_dim = TSNE(n_components=2,random_state=42).fit_transform(new_features_10users.values)
X_new_dim.shape

In [None]:
# Attach the user names to the embedding. concat with axis=1 aligns rows on the
# index, so both frames are assumed to share the same index — TODO confirm.
df_new_dim_feat = pd.DataFrame(X_new_dim,columns=['feat1','feat2'])
df_new_dim =pd.concat([df_new_dim_feat,train10users_many_fe.target],axis=1) 
df_new_dim.head()

In [None]:
plt.figure(figsize=(16,10))
   
# All users on one scatter plot of the embedding, one colour per user.
for (user,sub_df) in df_new_dim.groupby('target'):
    plt.scatter(sub_df.feat1,sub_df.feat2,color = color_dic[user],alpha=0.5,label=user)
plt.legend()
plt.show()

In [None]:
def funk(num):
    """Scatter the t-SNE points of one user.

    ``num`` is the 1-based position of the user in the global ``users`` list;
    the points are taken from the global ``df_new_dim`` frame.
    """
    plt.figure(figsize=(16,10))
    chosen_user = users[num-1]
    user_points = df_new_dim[df_new_dim.target == chosen_user]
    plt.scatter(user_points.feat1, user_points.feat2,
                color=color_dic[chosen_user], alpha=0.3)
    plt.legend([chosen_user])
    plt.grid(True)
    plt.show()

# Slider over the ten users.
interactive_plot = interactive(funk, num=widgets.IntSlider(min=1, max=10, step=1, value=1))
interactive_plot

In [None]:
# Same 2-D t-SNE embedding, this time with the cosine metric.
X_new_dim_cos = TSNE(n_components=2,random_state=42,metric ='cosine').fit_transform(new_features_10users.values)
X_new_dim_cos.shape

In [None]:
# Attach the user names to the cosine embedding. concat with axis=1 aligns rows
# on the index, so both frames are assumed to share the same index — TODO confirm.
df_new_dim_feat_cos = pd.DataFrame(X_new_dim_cos,columns=['feat1','feat2'])
df_new_dim_cos =pd.concat([df_new_dim_feat_cos,train10users_many_fe.target],axis=1) 
df_new_dim_cos.head()

In [None]:
plt.figure(figsize=(16,10))
   
# All users on one scatter plot of the cosine embedding, one colour per user.
for (user,sub_df) in df_new_dim_cos.groupby('target'):
    plt.scatter(sub_df.feat1,sub_df.feat2,color = color_dic[user],alpha=0.5,label=user)
plt.legend()
plt.show()

In [None]:
def funk2(num):
    """Scatter-plot one user's points from the cosine-metric embedding.

    `num` is the 1-based position of the user in the global `users`
    sequence. Points are taken from the global `df_new_dim_cos` frame
    (columns `feat1`, `feat2`, `target`) and colored via `color_dic`.
    """
    plt.figure(figsize=(16, 10))
    selected_user = users[num - 1]
    user_points = df_new_dim_cos.loc[df_new_dim_cos.target == selected_user]
    plt.scatter(user_points.feat1, user_points.feat2,
                color=color_dic[selected_user], alpha=0.3)
    plt.legend([selected_user])
    plt.grid(True)
    plt.show()

interactive_plot = interactive(funk2, num=widgets.IntSlider(min=1, max=10, step=1, value=1))
interactive_plot

Неделя 4

In [None]:
def plot_validation_curves(param_values, grid_cv_results_):
    """Draw mean train/test score curves with +/- one std bands.

    param_values      -- x-axis values (the hyper-parameter grid).
    grid_cv_results_  -- mapping that provides 'mean_train_score',
                         'std_train_score', 'mean_test_score' and
                         'std_test_score' (e.g. GridSearchCV.cv_results_
                         obtained with return_train_score=True).
    """
    # Read all four score arrays up front, train first, then test.
    score_keys = (('mean_train_score', 'std_train_score'),
                  ('mean_test_score', 'std_test_score'))
    stats_per_split = [(grid_cv_results_[mean_key], grid_cv_results_[std_key])
                       for mean_key, std_key in score_keys]

    # Both mean lines are drawn before either band.
    line_styles = (('train', 'green'), ('test', 'red'))
    mean_lines = [
        plt.plot(param_values, center, '-', label=label, color=color)[0]
        for (center, _), (label, color) in zip(stats_per_split, line_styles)
    ]

    # Each band reuses the color of its own mean line.
    for (center, spread), line in zip(stats_per_split, mean_lines):
        plt.fill_between(param_values, center - spread, center + spread,
                         edgecolor='none', facecolor=line.get_color(), alpha=0.2)
    plt.legend()

In [None]:
plot_validation_curves(svm_params1['C'], svm_grid_searcher1.cv_results_)

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(val_train, val_test, train_sizes,
                        xlabel='Training Set Size', ylabel='score'):
    """Plot train and validation learning curves with +/- one std bands.

    val_train, val_test -- 2-D score arrays; mean and std are taken along
                           axis 1, giving one point per row.
    train_sizes         -- x-axis values, one per row of the score arrays.
    xlabel, ylabel      -- axis captions.
    """
    for fold_scores, caption in ((val_train, 'train'), (val_test, 'valid')):
        center = fold_scores.mean(1)
        spread = fold_scores.std(1)
        curve = plt.plot(train_sizes, center, '-', label=caption)[0]
        # The band takes the color matplotlib picked for the mean line.
        plt.fill_between(train_sizes, center - spread, center + spread,
                         edgecolor='none', facecolor=curve.get_color(), alpha=0.2)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc='lower right')

In [None]:
plot_learning_curve(val_train, val_test, n_train, 
                    xlabel='train_size', ylabel='accuracy')

In [None]:
def prepare_sparse_train_set_time(path_to_csv_files, 
                                    session_time=5):
    """Cut user browsing logs into time-limited sessions and build a sparse matrix.

    Every file matching ``user*.csv`` inside ``path_to_csv_files`` is read with
    ``pd.read_csv``; the first run of digits in the file name becomes the user
    ID. The code reads the ``site`` and ``timestamp`` columns of each file.
    Sites get integer IDs by descending total visit count (the most visited
    site gets ID 1).

    Parameters
    ----------
    path_to_csv_files : directory that is searched for ``user*.csv`` files.
    session_time : session duration limit in minutes (gaps between consecutive
        timestamps are converted from seconds to minutes before comparison).

    Returns
    -------
    X_sparse : csr_matrix built from one ``1`` entry per site occurrence in a
        session, using the site ID as the column index; column 0 (the padding
        value 0) is sliced off.
    y : column 0 of ``Sessions`` (the user IDs).
    Sessions : dense 2-D array; column 0 is the user ID, the remaining columns
        are site IDs, with shorter sessions padded with 0.
    """
       
    # Load the per-user CSV files
    files_paths = list(glob(os.path.join(path_to_csv_files,'user*.csv')))
    files = []
    user_IDs = []
    for path in tqdm_notebook(files_paths):
        f_name = os.path.split(path)[1]
        ID = int(re.findall("[0-9]+",f_name)[0])
        user_IDs.append(ID)
        files.append(pd.read_csv(path))
    user_IDs = np.array(user_IDs)
            
    # Build the site dictionary with IDs and frequencies
   
    sites_dictionary = {}
    
    for user in tqdm_notebook(files):
        for site in user.site.values:
            if site in sites_dictionary:
               
                sites_dictionary[site] += 1                
                
            else:
                sites_dictionary[site] = 1
              
    d =  sorted(sites_dictionary.items(), key=lambda item: item[1],reverse = True)    
    sites_dict_sorted = {}
    
    for ii,pair in enumerate(d):
        
        sites_dict_sorted[pair[0]] = [ii+1,pair[1]]  # site -> [ID, frequency], taken from the sorted list d
    
    
    
            
    # Fill the session table with the IDs of the sites visited by each user, for the given session duration
    Session_list = [] 
    for ind,user in enumerate(tqdm_notebook(files)):
        session = []  # initialize the session
        session.append(user_IDs[ind])  # column 0 holds the user ID
        
        sites = user.site.values  # the user's site column
        
        
        
        # Date/time at which each site visit started.
        # NOTE(review): this is attribute assignment on the DataFrame object,
        # not `user['timestamps'] = ...` - confirm that is intended.
        user.timestamps = pd.to_datetime(user.timestamp)
        
        seconds = user.timestamps.apply(lambda x:x.timestamp()).values
        
        
       
        ufile_length = sites.shape[0]  # length of the user's file
        site_position = 0  # position within the session (never read below)
        pos_in_ufile = 0  # position in the file
        
        duration_time_session = 0
        duration_site_on_last_session = 0
        while pos_in_ufile < ufile_length:
            if pos_in_ufile > 0:
                # Time (minutes) spent on the previous site that counts toward the current session
                duration_time_site =(seconds[pos_in_ufile] - seconds[pos_in_ufile-1])/60. - duration_site_on_last_session 
                duration_site_on_last_session  = 0  # consumed; reset until the next overshoot
                
                duration_time_session += duration_time_site  # grow the session time
                
                
            if duration_time_session  > session_time:  # session time exceeded
                Session_list.append(session)  # the session is finished and stored in the list
                #print('new')
                session = []  # initialize a new session
                session.append(user_IDs[ind])  # column 0 holds the user ID
                site_to_start = sites[pos_in_ufile-1]  # the last site of the previous session carries over into the new one
                duration_time_site_over_edge = duration_time_session - session_time  # time by which the limit was overshot
                duration_site_on_last_session  = (seconds[pos_in_ufile] - seconds[pos_in_ufile-1])/60.\
                                                    - duration_time_site_over_edge  # part of the gap already spent in the previous session
                
                duration_time_session = 0
                session.append(sites_dict_sorted[site_to_start][0])
                # pos_in_ufile is not advanced in this branch, so the same
                # position is examined again on the next loop iteration.
                
            elif duration_time_session == session_time:
                Session_list.append(session)
                #print('new')
                session = []  # initialize a new session
                session.append(user_IDs[ind])  # column 0 holds the user ID
                site_to_start = sites[pos_in_ufile]
                duration_time_session = 0
                session.append(sites_dict_sorted[site_to_start][0])
                pos_in_ufile += 1                
            else:
                site_to_add =  sites[pos_in_ufile]  # site to add to the session
                session.append(sites_dict_sorted[site_to_add][0])  # add its ID
                pos_in_ufile += 1  # move on; the time on this site is checked on the next step
            
        # Handle the last session of this user. `session` always contains at
        # least the user ID at this point, so the condition always holds.
        if len(session) > 0:
            
                
            Session_list.append(session)  # append the last session to the list
                
   
    
    # Rows of different length are padded with 0 after the DataFrame conversion.
    Sessions = pd.DataFrame(Session_list).fillna(0).values
      
    
    # Separate the user IDs from the session table
    X, y = Sessions[:,1:],Sessions[:,0]
    
    # Build the sparse matrix of site occurrences per session
    indptr = [0]
    indices = []
    data = []
    vocabulary = {}
    for s in tqdm_notebook(X):
        for ID in s:
            # Every ID maps to itself, so the column index equals the site ID.
            # NOTE(review): IDs come from the padded array and may be floats - confirm csr_matrix accepts them.
            index = vocabulary.setdefault(ID, ID)
            indices.append(index)
            data.append(1)
        indptr.append(len(indices))

    # Column 0 corresponds to the padding value 0 and is dropped.
    X_sparse = csr_matrix((data, indices, indptr), dtype=int)[:,1:] 
    
    
    return X_sparse, y, Sessions

In [None]:
def model_assessment(estimator, path_to_X_pickle, path_to_y_pickle, cv, random_state=17, test_size=0.3):
    """Score an estimator by cross-validation and on a stratified hold-out split.

    The feature matrix and the labels are unpickled from the two given paths
    and split with `train_test_split` (stratified by label). The estimator is
    cross-validated on the training part, then fitted on it and scored with
    accuracy on the hold-out part.

    Returns a tuple (mean cross-validation score, hold-out accuracy).

    NOTE: `pickle.load` executes arbitrary code from the file; only pass
    paths to trusted pickles.
    """
    # Load X first, then y.
    unpickled = []
    for pickle_path in (path_to_X_pickle, path_to_y_pickle):
        with open(pickle_path, 'rb') as pickle_file:
            unpickled.append(pickle.load(pickle_file))
    features, labels = unpickled

    # Stratified train / validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, labels,
        random_state=random_state, test_size=test_size, stratify=labels,
    )

    # Mean cross-validation score on the training part.
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=cv)
    mean_cv_accuracy = fold_scores.mean()

    # Accuracy on the validation part after fitting on the whole training part.
    estimator.fit(X_train, y_train)
    valid_predictions = estimator.predict(X_valid)
    val_accuracy = accuracy_score(y_valid, valid_predictions)

    return mean_cv_accuracy, val_accuracy

Неделя 5

In [None]:
PATH_TO_DATA = 'capstone_user_identification'

In [None]:
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
                       index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
                      index_col='session_id')

In [None]:
train_df['target'].value_counts()

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
test_df.info()

In [None]:
train_df.info()

In [None]:
train_test_df = pd.concat([train_df, test_df]) 

In [None]:
train_test_df_sites = train_test_df[['site%d' % i for i in range(1, 11)]].fillna(0).astype('int')

In [None]:
train_test_df_sites.head()

In [None]:
train_test_sparse = spar(train_test_df_sites.values)

In [None]:
train_test_sparse

In [None]:
# train_test_sparse was built from pd.concat([train_df, test_df]), so the
# train/test boundary is the number of rows in train_df. Deriving it here
# (instead of hard-coding 253561) keeps X_train_sparse aligned with y when
# the input files change.
idx_split = train_df.shape[0]
X_train_sparse = train_test_sparse[:idx_split]
X_test_sparse = train_test_sparse[idx_split:]
y = train_df['target']

In [None]:
print(f'{X_train_sparse.shape[0]} {X_train_sparse.shape[1]} {X_test_sparse.shape[0]} {X_test_sparse.shape[1]}')

In [None]:
train_share = int(.7 * X_train_sparse.shape[0])
X_train, y_train = X_train_sparse[:train_share, :], y[:train_share]
X_valid, y_valid  = X_train_sparse[train_share:, :], y[train_share:]

In [None]:
sgd_logit = SGDClassifier(loss='log',random_state=17,n_jobs=-1)
sgd_logit.fit(X_train, y_train)

In [None]:
logit_valid_pred_proba = sgd_logit.predict_proba(X_valid)
logit_valid_pred_proba

In [None]:
ROC_AUC = roc_auc_score(y_valid,logit_valid_pred_proba[:,1])
print(round(ROC_AUC,3))

In [None]:
%%time
sgd_logit.fit(X_train_sparse,y)
logit_test_pred_proba = sgd_logit.predict_proba(X_test_sparse)
logit_test_pred_proba[:,1]

In [None]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Save predictions as a submission CSV indexed 1..n.

    predicted_labels -- array-like of predictions (ndarray, list or pandas
                        Series); it is converted with `np.asarray`, so a
                        Series is written by position and its own index is
                        ignored.
    out_file         -- path (or buffer) passed to `DataFrame.to_csv`.
    target           -- name of the prediction column.
    index_label      -- header of the index column.
    """
    # Normalizing to ndarray lets plain lists work (they have no `.shape`) and
    # prevents pandas from re-aligning a Series against the new 1..n index.
    predictions = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predictions,
                                index=np.arange(1, predictions.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [None]:
param_grid = {'alpha': [0.00001,0.0001,0.00015,0.0002,0.001,0.1,1],
             'l1_ratio':[0,0.1,0.15,0.5],
             'class_weight': ['balanced',None,{1:1000,0:0.001}]}
clf = SGDClassifier(loss='log',n_jobs=-1,random_state=17)

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)


sgd_grid_searcher = GridSearchCV(clf,param_grid,cv=skf,n_jobs=-1,return_train_score=True,scoring='roc_auc')
sgd_grid_searcher.fit(X_train, y_train)
sgd_grid_searcher.best_estimator_

In [None]:
sgd_grid_searcher.best_score_,sgd_grid_searcher.best_params_

In [None]:
ROC_AUC1 = roc_auc_score(y_valid,sgd_grid_searcher.best_estimator_.predict_proba(X_valid)[:,1])
print(round(ROC_AUC1,3))

In [None]:
param_grid_check = {'learning_rate': ['optimal','invscaling','adaptive']}
             
clf_check = SGDClassifier(loss='log',random_state=91,learning_rate = 'adaptive',eta0=100)

In [None]:
sgd_grid_searcher_ch = GridSearchCV(clf_check,param_grid_check,cv=skf,n_jobs=-1,return_train_score=True,scoring='roc_auc')
sgd_grid_searcher_ch.fit(X_train, y_train)
sgd_grid_searcher_ch.cv_results_

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.plot(range(3),sgd_grid_searcher_ch.cv_results_['mean_test_score'])
plt.grid()

In [None]:
%%time
sgd_grid2 = SGDClassifier(loss='log',alpha=0.00005,class_weight='balanced',random_state=17,\
                          learning_rate = 'adaptive',eta0=0.01)
sgd_grid2.fit(X_train,y_train) 
ROC_AUC2 = roc_auc_score(y_valid,sgd_grid2.predict_proba(X_valid)[:,1])
print(round(ROC_AUC2,3))

In [None]:
ROC_AUC2 = roc_auc_score(y_valid,sgd_grid2.predict_proba(X_valid)[:,1])
print(round(ROC_AUC2,3))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
train_test_sparse_tfidf = transformer.fit_transform(train_test_sparse)
train_test_sparse_tfidf

In [None]:
# Per-session indicator features: presence of the two most frequent sites
# (IDs 1 and 2) and the number of distinct values in the session row.
# All three arrays are sized from sessions_sites, the array that is iterated
# below (unique_sites was previously sized from an unrelated name, `sessions`).
ind_top1 = np.zeros(sessions_sites.shape[0],dtype=int)
ind_top2 = np.zeros(sessions_sites.shape[0],dtype=int)
unique_sites = np.zeros(sessions_sites.shape[0],dtype=int)
for i,session in enumerate(sessions_sites):
    # NOTE(review): np.unique also counts the padding value 0 when present.
    unique_sites[i] = np.unique(session).shape[0]
    if 1 in session:
        ind_top1[i] = 1
    if 2 in session:
        ind_top2[i] = 1
        
sum(ind_top1),sum(ind_top2)   

In [None]:
isWeekend = pd.to_datetime(train_test_df['time1']).apply(lambda x: 1 if x.weekday() in [5,6] else 0).values
isWeekend 

In [None]:
month = pd.to_datetime(train_test_df['time1']).apply(lambda x: x.month).values
month

In [None]:
year = pd.to_datetime(train_test_df['time1']).apply(lambda x: x.year).values
year

In [None]:
added_fe1 = np.vstack((ind_top1.T,ind_top2.T,isWeekend,month,year))[:3].T
added_fe1

In [None]:
added_fe_cat1 = pd.get_dummies(np.vstack((ind_top1.T,ind_top2.T,isWeekend,month,year))[-1].T)
added_fe_cat1.head()

In [None]:
added_fe_cat2 = pd.get_dummies(month)
added_fe_cat2.head()

In [None]:
start_hour =pd.to_datetime(train_test_df['time1']).apply(lambda x:x.time().hour).values

In [None]:
added_fe_cat3 = pd.get_dummies(start_hour)
added_fe_cat3.head()

In [None]:
added_fe_cat4 = pd.get_dummies(unique_sites)
added_fe_cat4.head()

In [None]:
day_of_week = pd.to_datetime(train_test_df['time1']).apply(lambda x:x.weekday()).values

In [None]:
added_fe_cat5 = pd.get_dummies(day_of_week)
added_fe_cat5.head()

In [None]:
TimeOfDay = pd.to_datetime(train_test_df['time1']).apply(lambda x:x.time().hour).apply(lambda x: 0 if x in range(7) else\
                                                                                      (1 if x in range(7,12) else \
                                                                                      (2 if x in range(12,18)\
                                                                                      else 3))).values
TimeOfDay

In [None]:
added_fe_cat6 = pd.get_dummies(TimeOfDay)
added_fe_cat6.head()

In [None]:
season = pd.to_datetime(train_test_df['time1']).apply(lambda x: x.month).apply(lambda x: 1 if x in [12,1,2] else\
                                                                                      (2 if x in [3,4,5] else \
                                                                                      (3 if x in [6,7,8]\
                                                                                      else 4))).values
season

In [None]:
added_fe_cat7 = pd.get_dummies(season)
added_fe_cat7.head()

In [None]:
added_fe_counts_top5 = train_test_sparse[:,:5].todense()
sum(added_fe_counts_top5)

In [None]:
# Counters are (re)created here so that the cell works on a fresh kernel and
# stays idempotent: num_sites_top10 is accumulated with `+=`, so re-running the
# loop on leftover arrays would double the counts.
# Assumes sessions_sites is a 2-D integer array of site IDs with 0 as padding
# (the `session>0` comparison below relies on array rows) - TODO confirm.
num_sites = np.zeros(sessions_sites.shape[0],dtype=int)
num_sites_top10 = np.zeros(sessions_sites.shape[0],dtype=int)
for i,session in enumerate(sessions_sites):
    num_sites[i] = sum(session>0)  # number of non-padding entries in the session
    for ID in range(1,11):  # site IDs 1..10
        if ID in session:
            num_sites_top10[i] += 1           
(np.unique(num_sites),np.bincount(num_sites)) , (np.unique(num_sites_top10),np.bincount(num_sites_top10))

In [None]:
added_fe_cat8 = pd.get_dummies(num_sites)
added_fe_cat8.head()

In [None]:
added_fe_cat9 = pd.get_dummies(num_sites_top10)
added_fe_cat9.head()

In [None]:
train_test_df_time = train_test_df[['time%d' % i for i in range(1, 11)]]

tdf = train_test_df_time.fillna(0)
for i,col in tqdm_notebook(enumerate(tdf.columns)):
    tdf[f'sec{i+1}'] = pd.to_datetime(tdf[col]).apply(lambda x: x.timestamp())

session_times = tdf[['sec%d' % i for i in range(1, 11)]]
session_times.head()    

In [None]:
%%time
num_sites_10sec = np.zeros(session_times.shape[0],dtype=int)

for i,session in tqdm_notebook(enumerate(session_times.values)):     
    num = 0
    j = 0
    duration = 0
    while (duration < 10) and (duration >= 0):
        num += 1
        j += 1
        if (j == session.shape[0]):
            break
        duration += session[j] - session[j-1]
    
    num_sites_10sec[i] = num   

num_sites_10sec

In [None]:
added_fe_cat10 = pd.get_dummies(num_sites_10sec)
added_fe_cat10.head()

In [None]:
for ind, col in enumerate(session_times.columns):
    if ind > 0:
        session_times[f'dur{ind}'] =  session_times[col] - session_times[session_times.columns[ind-1]]
session_times.head()
        

In [None]:
durations = session_times[[f'dur{i}' for i in range(1,10)]]
durations.head()

In [None]:
dur_array = np.array(durations,dtype = float)
dur_array 

In [None]:
for i,session in enumerate(dur_array):
    for j,dur in enumerate(session):
        if dur < 0:
            if j==0:
                dur_array[i,j]=30*60
            else:
                dur_array[i,j]=30*60 - sum(session[:j])
pd.DataFrame(dur_array).head()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(dur_array)
time_fee_to_add= scaler.transform(dur_array)
time_fee_to_add

In [None]:
# All feature blocks are row-aligned with train_test_df (train rows first,
# then test rows). They are listed once and split at the train/test boundary,
# which is derived from train_df instead of repeating the literal 253561.
idx_split = train_df.shape[0]
feature_blocks = [
    train_test_sparse_tfidf,   # tf-idf weighted site counts
    added_fe1,                 # ind_top1, ind_top2, isWeekend
    added_fe_cat1.values,      # year (one-hot)
    added_fe_cat2.values,      # month (one-hot)
    added_fe_cat3.values,      # start hour (one-hot)
    added_fe_cat4.values,      # number of unique sites (one-hot)
    added_fe_cat5.values,      # day of week (one-hot)
    added_fe_cat6.values,      # time of day (one-hot)
    added_fe_cat7.values,      # season (one-hot)
    added_fe_counts_top5,      # first five columns of the site-count matrix
    added_fe_cat8.values,      # number of sites in the session (one-hot)
    added_fe_cat9.values,      # number of top-10 sites in the session (one-hot)
    added_fe_cat10.values,     # number of sites within the first 10 seconds (one-hot)
    time_fee_to_add,           # standardized per-site durations
]
X_train_sparse7 = hstack([block[:idx_split] for block in feature_blocks])
X_test_sparse7 = hstack([block[idx_split:] for block in feature_blocks])

In [None]:
%%time
sgd_logit_grid55 = SGDClassifier(loss='log',alpha=0.00005,class_weight='balanced',random_state=17,\
                          learning_rate = 'adaptive',eta0=0.01)
sgd_logit_grid55.fit(X_train_sparse7,y)
logit_test_pred_proba55 = sgd_logit_grid55.predict_proba(X_test_sparse7)
logit_test_pred_proba55[:,1]

In [None]:
estimator_pick = xgb.XGBClassifier(random_state=17,n_estimators=10)
param_grid_boost = {'learning_rate':[0.01,0.1,0.2,0.5],
                    'max_depth':[3,5,7], 
                    'min_child_weight':[1,3,5]
                   }
gb_grid_searcher = GridSearchCV(estimator_pick,param_grid_boost,cv=skf,n_jobs=-1,return_train_score=True,scoring='roc_auc')
gb_grid_searcher.fit(X_train3, y_train3)
gb_grid_searcher.best_score_,gb_grid_searcher.best_params_

In [None]:
n_estimators = [10,20,50,100]
rocs = []
for n in n_estimators:
    gb_clf = xgb.XGBClassifier(learning_rate=0.5,max_depth=7, min_child_weight=1, random_state=17,n_estimators=n)
    gb_clf.fit(X_train4,y_train4)
    ROC_AUC_test_fe7 = roc_auc_score(y_valid4,gb_clf.predict_proba(X_valid4)[:,1])
    rocs.append(ROC_AUC_test_fe7)
    print(round(ROC_AUC_test_fe7,3))

In [None]:
%%time
gb_clf55 = xgb.XGBClassifier(learning_rate=0.5,max_depth=7, min_child_weight=1, random_state=17,n_estimators=100)
gb_clf55.fit(X_train_sparse7,y)
gb_test_pred_proba55 = gb_clf55.predict_proba(X_test_sparse7)
gb_test_pred_proba55[:,1]

In [None]:
forest_clf1 = RandomForestClassifier(n_estimators = 10,  random_state=17)
params_forest = {"class_weight": [None, "balanced", "balanced_subsample"],
                 "criterion": ["gini", "entropy"],
                 "min_samples_split":[7,5,3]
                }
forest_grid_searcher = GridSearchCV(forest_clf1,param_grid = params_forest,cv=skf,n_jobs=-1,scoring='roc_auc')
forest_grid_searcher.fit(X_train4, y_train4)
forest_grid_searcher.best_score_,forest_grid_searcher.best_params_

In [None]:
n_estimators = [10,20,50,100,120]
rocs = []
for n in n_estimators:
    forest_clf2 = RandomForestClassifier(n_estimators = n, class_weight='balanced',criterion='gini',min_samples_split= 7, random_state=17)
    forest_clf2.fit(X_train4,y_train4)
    ROC_AUC_test_fe9 = roc_auc_score(y_valid4,forest_clf2.predict_proba(X_valid4)[:,1])
    rocs.append(ROC_AUC_test_fe9)
    print(round(ROC_AUC_test_fe9,3))

In [None]:
%%time
forest_clf55 = RandomForestClassifier(n_estimators = 150, class_weight='balanced',criterion='gini',min_samples_split= 7, random_state=17)
forest_clf55.fit(X_train_sparse7,y)
forest_test_pred_proba55 = forest_clf55.predict_proba(X_test_sparse7)
forest_test_pred_proba55[:,1]

In [None]:
blended_test_pred_proba555 = (logit_test_pred_proba55[:,1]+gb_test_pred_proba55[:,1] + forest_test_pred_proba55[:,1])/3
blended_test_pred_proba555

In [None]:
write_to_submission_file(blended_test_pred_proba555,out_file='submission31_tfidf_fe_new_new_blend_l_b_f.csv')

Неделя 6

In [None]:
def arrays_to_vw_add_fe(Xs,Xc,Xn, y=None, train=True, out_file='palpatin.vw'):
    """Write sessions with extra features to a Vowpal Wabbit text file.

    Each output line has three namespaces:
      |session  the integer values of one row of Xs,
      |cat      the row of Xc rendered as Fc<k>=<value>,
      |num      the row of Xn rendered as Fn<k>:<value> (k starts at 1).

    Xs, Xc, Xn -- row-aligned 2-D array-likes (Xs is cast to int, Xn to float).
    y          -- labels, one per row; used only when train is True.
    train      -- when False every line gets the placeholder label '1'.
    out_file   -- path of the file to (over)write.
    """
    site_rows = np.array(Xs, dtype=int)
    cat_rows = np.array(Xc)
    num_rows = np.array(Xn, dtype=float)

    def render_line(label, sites, cats, nums):
        session_part = ' '.join(str(site) for site in sites)
        cat_part = ' '.join(f'Fc{pos}={val}' for pos, val in enumerate(cats, start=1))
        num_part = ' '.join(f'Fn{pos}:{val}' for pos, val in enumerate(nums, start=1))
        return (str(label) + ' |session ' + session_part
                + ' |cat ' + cat_part
                + ' |num ' + num_part + '\n')

    with open(out_file, 'w') as vw_file:
        # Test files carry a constant placeholder label on every line.
        labels = y if train else ['1'] * len(site_rows)
        for label, sites, cats, nums in zip(labels, site_rows, cat_rows, num_rows):
            vw_file.write(render_line(label, sites, cats, nums))

    print('Good, Anakin, good!')  

In [None]:
%%time
arrays_to_vw(train_df_part.fillna(0).values, y_train_part_for_vw,\
             out_file = os.path.join(PATH_TO_DATA2,'train_part.vw'))
arrays_to_vw(valid_df.fillna(0).values, y_valid_for_vw,\
             out_file = os.path.join(PATH_TO_DATA2,'valid.vw')) 
arrays_to_vw(train_df_400[sites].fillna(0).values, y_for_vw,\
             out_file = os.path.join(PATH_TO_DATA2,'train.vw'))
arrays_to_vw(test_df_400[sites].fillna(0).values,\
             train = False, out_file = os.path.join(PATH_TO_DATA2,'test.vw'))  

In [None]:
!head -4 $PATH_TO_DATA2/train_part.vw

In [None]:
train_part_vw = os.path.join(PATH_TO_DATA2, 'train_part.vw')
valid_vw = os.path.join(PATH_TO_DATA2, 'valid.vw')
train_vw = os.path.join(PATH_TO_DATA2, 'train.vw')
test_vw = os.path.join(PATH_TO_DATA2, 'test.vw')
model = os.path.join(PATH_TO_DATA2, 'vw_model.vw')
valid_pred = os.path.join(PATH_TO_DATA2,'vw_valid_pred.csv')
pred = os.path.join(PATH_TO_DATA2, 'vw_pred.csv')

In [None]:
for (loss, passes, bits) in itertools.product(['squared','logistic','hinge'],[2,3,5],[18,26,30]):
    
    !vw --oaa 400 --passes $passes -c -k -b $bits --loss_function $loss --random_seed 17 -d $train_part_vw -f $model --quiet
    
    !vw -i $model -t -d $valid_vw -p $valid_pred --quiet
    
    
    print(f'Params: {loss} {passes} {bits}, Accuracy: {accuracy_score(y_valid_for_vw,pd.read_csv(valid_pred,header=None))}')
    print('---------------')   

In [None]:
%%time
for reg in [1e-10,1e-15,1e-20]:
    
    !vw --oaa 400 --passes 5 -c -k -b 26 --loss_function logistic --l1 $reg --random_seed 17\
    -d $train_part_vw -f $model --quiet
    
    !vw -i $model -t -d $valid_vw -p $valid_pred --quiet
    
    
    print(f'Param reg L1: {reg} , Accuracy: {accuracy_score(y_valid_for_vw,pd.read_csv(valid_pred,header=None))}')
    print('---------------') 

In [None]:
%%time
for reg in [5e-8,1e-9,5e-9,1e-10,2e-10,3e-10,4e-10,5e-10,1e-11]:
    
    !vw --oaa 400 --passes 2 -c -k -b 18 --loss_function logistic --l2 $reg --random_seed 17\
    -d $train_part_vw -f $model --quiet
    
    !vw -i $model -t -d $valid_vw -p $valid_pred --quiet
    
    
    print(f'Param reg L2: {reg} , Accuracy: {accuracy_score(y_valid_for_vw,pd.read_csv(valid_pred,header=None))}')
    print('---------------') 

Дополнительно

In [None]:
!vw --help
!cp --help

with open(os.path.join(PATH_TO_DATA2, 'X_sparse_150users_s10_w10.pkl'), 'rb') as X_sparse_150users_pkl:
     X_sparse_150users = pickle.load(X_sparse_150users_pkl)
with open(os.path.join(PATH_TO_DATA2, 'y_150users_s10_w10.pkl'), 'rb') as y_150users_pkl:
    y_150users = pickle.load(y_150users_pkl)

class_distr = np.bincount(y_train_150.astype('int'))


train10users_many_fe[['time_top'+str(i) for i in range(1,31)]].head()

for num_users in [10,150]:
    for window_size,session_length in itertools.product([10, 7, 5], [15, 10, 7, 5]):
        if (window_size<=session_length) and ((session_length,window_size) != (10,10)):
            print(num_users,session_length,window_size)
            
assert X_sparse_10users.shape[1] == len(site_freq_10users)-1



df['education'].value_counts().plot.barh();



try:
    label_encoder.transform(df['education'].replace('high.school', 'high_school'))
except Exception as e:
    print('Error:', e)
    
    
    
def logistic_regression_accuracy_on(dataframe, labels):
    """Fit a logistic regression on a random split and report test metrics.

    dataframe -- feature table; its values are used as the feature matrix.
    labels    -- target values, row-aligned with `dataframe`.

    Returns the text produced by `classification_report` for the held-out
    part of a default `train_test_split`. The split is not seeded, so the
    report varies between runs.
    """
    # Imported locally: LogisticRegression is not in the notebook's import cell.
    from sklearn.linear_model import LogisticRegression

    # `.values` replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    features = dataframe.values
    train_features, test_features, train_labels, test_labels = \
        train_test_split(features, labels)

    logit = LogisticRegression()
    logit.fit(train_features, train_labels)
    return classification_report(test_labels, logit.predict(test_features))

print(logistic_regression_accuracy_on(df[categorical_columns], labels))



onehot_encoder = OneHotEncoder(sparse=False)
encoded_categorical_columns = pd.DataFrame(onehot_encoder.fit_transform(df[categorical_columns]))
encoded_categorical_columns.head()


%%bash
ls

!echo '1 1.0 |Subject WHAT car is this |Organization University of Maryland:0.5 College Park' | vw



def to_vw_format(document, label=None):
    """Render a text document as one Vowpal Wabbit line.

    The document is lower-cased and reduced to its word tokens of at least
    three characters, written into the `text` namespace. The label is
    prepended when it is truthy; `None` yields an unlabeled line.

    NOTE(review): a label of 0 is also treated as "no label" by the
    `label or ''` expression - confirm that 0 is never used as a class.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    tokens = re.findall(r'\w{3,}', document.lower())
    return str(label or '') + ' |text ' + ' '.join(tokens) + '\n'

to_vw_format(text, 1 if target == 'rec.autos' else -1)



with open('./20news_test_predictions.txt') as pred_file:
    test_prediction = [float(label) 
                             for label in pred_file.readlines()]

auc = roc_auc_score(test_labels, test_prediction)
roc_curve_gr = roc_curve(test_labels, test_prediction)

with plt.xkcd():
    plt.plot(roc_curve_gr[0], roc_curve_gr[1]);
    plt.plot([0,1], [0,1])
    plt.xlabel('FPR'); plt.ylabel('TPR'); plt.title('test AUC = %f' % (auc)); plt.axis([-0.05,1.05,-0.05,1.05]);
    
    
    
M = confusion_matrix(test_labels_mult, test_prediction_mult)
for i in np.where(M[0,:] > 0)[0][1:]:
    print(newsgroups['target_names'][i], M[0,i], )
    
    
reviews_test = load_files(os.path.join(path_to_movies, 'test'))
# Labels are taken from the test bunch itself; the earlier version read
# reviews_train.target, pairing test documents with training labels.
text_test, y_test = reviews_test.data, reviews_test.target
print("Number of documents in test data: %d" % len(text_test))
print(np.bincount(y_test))



!cat --help


!du -hs $PATH_TO_DATA/stackoverflow_10mln_*.vw