## Постановка задачи
У нас появился запрос из отдела продаж и маркетинга. Как вы знаете «МегаФон» предлагает обширный набор различных услуг своим абонентам. При этом разным пользователям интересны разные услуги. Поэтому необходимо построить  алгоритм, который **для каждой пары пользователь-услуга определит вероятность подключения услуги**.

### Исходные данные

#### наборы данных
- **features.csv** (22Г): id, buy_time, <feature_list>
нормализованный анонимизированный набор признаков, характеризующий профиль потребления абонента. Эти данные привязаны к определенному времени, поскольку профиль абонента может меняться с течением времени.
- **data_train.csv** (27М): id, vas_id, buy_time, target
информация об отклике абонентов на предложение подключения одной из услуг. Каждому пользователю может быть сделано несколько предложений в разное время, каждое из которых он может или принять, или отклонить.
- **data_test.csv**: id, vas_id, buy_time
тестовый набор

#### переменные
   - **target** - целевая переменная, где 1 означает подключение услуги, 0 - абонент не подключил услугу соответственно.
   - **buy_time** - время покупки, представлено в формате timestamp, для работы с этим столбцом понадобится функция datetime.fromtimestamp из модуля datetime.
   - **id** - идентификатор абонента
   - **vas_id** - подключаемая услуга




In [72]:
import os.path

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import datetime


Считаем исходные данные

In [73]:
data_train = pd.read_csv('data_train.csv')
data_train.head(3)

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,target
0,0,540968,8.0,1537131600,0.0
1,1,1454121,4.0,1531688400,0.0
2,2,2458816,1.0,1534107600,0.0


In [74]:
class FeaturesInspect:
    dataframe = None
    feat_nunique = None
    feats = dict()

    def __init__(self, dataframe, num_cat=10):
        self.dataframe = dataframe
        self.feat_nunique = self.dataframe.apply(lambda x: x.nunique(dropna=False))
        self.num_cat = num_cat

    def show_nunique(self):
        plt.title("Распределение уникальных значений признаков")
        self.feat_nunique.hist(bins=100, figsize=(10, 5))

    def collect(self):
        self.feats['all'] = set(self.feat_nunique.index.tolist())
        self._collect_const()
        self._collect_numeric()
        self.feats['other'] = self.feats['all'] - (self.feats['numeric'] | self.feats['const'])
        self._collect_binary()
        self._collect_categorical()
        self.feats['extra'] = self.feats['categorical']
        self.feats['ok'] = self.feats['binary'] | self.feats['categorical'] | self.feats['numeric']

    def print(self):
        print('Всего уникальных признаков :', len(self.feats['all']))
        print('...константные  признаки :', len(self.feats['const']))
        print('...вещественные признаки :', len(self.feats['numeric']))
        self.feats['other'] = self.feats['all'] - (self.feats['numeric'] | self.feats['const'])
        print('...другие признаки :', len(self.feats['other']))
        print('...бинарные признаки :', len(self.feats['binary']))
        print('...категориальные признаки :', len(self.feats['categorical']))
        self.feats['extra'] = self.feats['categorical']
        self.feats['ok'] = self.feats['binary'] | self.feats['categorical'] | self.feats['numeric']

    def _collect_const(self):
        self.feats['const'] = set(self.feat_nunique[self.feat_nunique == 1].index.tolist())
        return len(self.feats['const'])

    def _collect_numeric(self):
        f_numeric = (self.dataframe.fillna(0).astype(int).sum() - self.dataframe.fillna(0).sum()).abs()
        self.feats['numeric'] = set(f_numeric[f_numeric > 0].index.tolist())
        return len(self.feats['numeric'])

    def _collect_categorical(self):
        self.feats['categorical'] = set(
            self.feat_nunique.loc[self.feats['other']][
                self.feat_nunique.loc[self.feats['other']] <= self.num_cat].index.tolist())
        return len(self.feats['categorical'])

    def _collect_binary(self):
        f_other = self.feats['other']
        self.feats['binary'] = set(self.dataframe.loc[:, f_other].columns[(
                (self.dataframe.loc[:, f_other].max() == 1) &
                (self.dataframe.loc[:, f_other].min() == 0) &
                (self.dataframe.loc[:, f_other].isnull().sum() == 0))])
        return len(self.feats['binary'])


### Обработка файла большого объема features

#### 1. Чтение по частям и объединение

In [75]:
def data_id_filter(df_with_id, features_name='features.csv'):
    feat_by_parts = pd.read_csv('features.csv', chunksize=100000, iterator=True, sep="\t")
    iter = 1
    df_all = pd.DataFrame()
    for feat_part in feat_by_parts:
        # фильтрация по всем клиентам
        merge = feat_part['id'].isin(df_with_id.id)
        df_filtered = feat_part[merge]
        df_all = pd.concat([df_all, df_filtered])
        print(iter)
        iter += 1
    return df_all

In [76]:
%%time
feats_by_train = data_id_filter(data_train, features_name='features.csv')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
Wall time: 6min 12s


In [77]:
feats_by_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 834168 entries, 13 to 4512521
Columns: 256 entries, Unnamed: 0 to 252
dtypes: float64(253), int64(3)
memory usage: 1.6 GB


In [78]:
feats_by_train.head(3)

Unnamed: 0.1,Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252
13,13,2046132,1534712400,300.820029,1599.480888,286.879214,1585.013202,281.461754,1563.90821,-16.08618,654.013903,-6.78366,-30.006538,-2.736081,-4.007526,-2.558912,259.130873,320.10189,-0.000725,-0.016435,-0.107041,-5.41746,-3.178521,-13.940815,-10.744164,-0.094251,-0.001733,-0.009327,-2.082209,0.200138,-0.00909,-0.351862,-0.214366,-0.211608,-0.001884,-2.3e-05,-3e-05,-2.65939,-0.065583,-0.700765,-192.026959,-2697.843724,-942.169157,-1755.674564,-89.504287,-119.724355,276.621311,179.764146,96.857176,-7.896282,-6.994502,-10.717958,-27.612844,-11.130284,-15.088986,-12.171711,-5.331295,-3.958702,-7.745251,-1.671324,-0.001656,63.318354,20.117335,0.434766,-0.148162,0.098356,0.392424,0.074338,-0.028454,-0.044465,-0.301128,-0.554677,-0.036834,3.869969,-2.783592,-2.60662,-5.390212,-4.022547,0.0,-2.824022,-10.706438,-1.2015,-0.998268,-0.203232,0.0,-0.248755,-0.222852,-0.134088,0.0,-0.030537,-0.125866,-0.096986,-0.679774,-0.626985,-0.691912,-0.506613,-0.185299,-0.598716,-0.000115,-0.250188,-0.348913,-0.828382,-42.275915,-3.950157,-0.253037,-0.318148,-2.29064,-3.447583,-0.040043,-9.408469,-0.212137,61.364686,-1.019293,-2.473446,5.37916,-943.365043,-1757.811263,0.40201,0.426143,-2701.176307,-2298.725139,-0.343415,-0.08972,-0.278878,-0.433135,-0.024048,-89.211948,-119.674411,-208.886358,-0.058077,-0.060451,0.334871,4.381836,42.318359,1.426717,-0.468443,-0.217283,-190.670372,-1.143223,-2.722591,-0.825973,-1.935988,0.0,-1.276187,-0.020137,-0.042636,94.319654,540.562498,34.03637,-34.888325,-3.861461,0.182836,-0.007024,0.356731,-8.417671,-0.212646,-0.019562,-4.4e-05,-0.000379,-2.548856,-0.261309,0.463685,-0.061481,-0.152157,-0.002595,-3.678214,-0.014542,0.180492,-21.183166,-44.376426,-25.320085,-51.984826,38.038772,179.704434,-7.614497,6.444947,-0.028857,-0.063214,-0.019198,-0.033778,-0.003149,-0.005184,-0.001431,-0.00189,-1.257363,-2.793637,-1.932758,-5.008096,39.021879,96.81594,-2.354808,2.496296,-0.034569,-0.163184,3886.402802,-0.466683,4.070952,-0.623737,-0.228106,0.748041,-0.000567,-0.433736,-0.000708,-0.02921,-0.104665,-0.001358,0.0,-0.960792,0.665644,-0.008999,-11953.712824,-45175.257711,-0.622901,-30.716053,-61790.157098,0.756864,-36221.166127,-9239.707081,-2.10805,-8.3e-05,-0.622896,-4e-05,-0.620772,-0.012257,-0.107878,-572669500.0,-58.544078,1092670000.0,-120441800.0,-3.91885,-1.465191,-33.302382,-230.128986,-32.772492,-0.364694,-0.133771,-0.209468,-28.356505,-109.884564,-876.69102,-5.368281,-247.110707,-108.409742,-512.437331,-106.617978,-17.295406,-977.373846,-613.770792,-25.996269,-35.630448,-295.747724,-17.832889,-0.694428,-4.175933,-0.45614,0.0
16,16,2050810,1540760400,-86.209971,91.820888,-84.480786,110.333202,-89.898246,89.22821,-16.08618,-65.076097,-6.78366,-30.006538,-2.736081,-4.007526,-2.558912,-66.189127,-66.92811,-0.000725,-0.016435,-0.107041,-5.41746,-3.178521,1.729185,-10.744164,-0.094251,-0.001733,-0.009327,-2.082209,0.200138,-0.00909,-0.351862,-0.214366,-0.211608,-0.001884,-2.3e-05,-3e-05,-2.65939,-0.065583,-0.700765,-192.026959,-2655.593724,-991.295137,-1664.298584,-89.504287,-119.724355,-0.028689,2.730812,-2.759494,-7.896282,-7.909355,-10.717958,-28.571103,-10.130284,-15.088986,-41.171711,-8.331295,-4.958702,-8.745251,-1.671324,-0.001656,7.318354,0.117335,-0.265234,-0.168162,-0.001644,-0.237576,-0.255662,-0.028454,-0.044465,-0.301128,-0.554677,-0.036834,-0.130031,-2.783592,-2.60662,-5.390212,-4.022547,0.0,-2.824022,-10.706438,-1.2015,-0.998268,-0.203232,0.0,-0.248755,-0.222852,-0.134088,0.0,-0.030537,-0.125866,-0.096986,-0.679774,-0.626985,-0.691912,-0.506613,-0.185299,-0.598716,-0.000115,-0.250188,-0.348913,-0.828382,-42.275915,-3.950157,-0.253037,-0.318148,4.47936,-3.447583,-0.040043,7.951531,-0.212137,5.404686,-1.019293,3.526554,0.37916,-992.506623,-1666.435283,0.56201,0.286143,-2658.941887,-2011.894039,-0.343415,-0.08972,-0.278878,-0.433135,-0.024048,-89.211948,-119.674411,-208.886358,-0.058077,-0.060451,0.344871,4.381836,-0.681641,1.426717,-0.468443,-0.017283,2163.329628,-1.143223,-2.722591,-0.825973,-1.935988,0.0,-1.276187,-0.020137,-0.042636,4.886314,-60.820805,18.03637,1.111675,-3.861461,-0.317164,-0.007024,-0.143269,-7.417671,-0.212646,-0.019562,-4.4e-05,-0.000379,-2.548856,-0.261309,-0.536315,-0.061481,-0.152157,-0.002595,-4.678214,-0.014542,-0.169508,9.816834,12.95691,13.679915,10.315174,5.038772,2.6711,8.385503,0.111615,-0.028857,-0.063214,-0.019198,-0.033778,-0.003149,-0.005184,-0.001431,-0.00189,-1.257363,-2.793637,-1.932758,-5.008096,8.021879,-2.80072,13.645192,-7.687035,-0.034569,-0.163184,-109.036398,0.533317,3.070952,-0.623737,-0.228106,0.748041,-0.000567,-0.433736,-0.000708,0.97079,-0.104665,-0.001358,0.0,0.039208,-0.334356,-0.008999,-11953.712824,-45175.257711,-0.622901,-30.716053,-61790.157098,-0.243136,-42051.166127,-9239.707081,-2.10805,-8.3e-05,-0.622896,-4e-05,-0.620772,-0.012257,-0.107878,968263700.0,-57.274078,-440560400.0,-120441800.0,-6.900668,-1.465191,-33.302382,-244.128986,-38.772492,-0.364694,-0.133771,-0.209468,-34.356505,-109.884564,-876.69102,-5.368281,-247.110707,-108.409742,-512.437331,-106.617978,-17.295406,-977.373846,-613.770792,-23.996269,190.369552,-286.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
19,19,2070757,1540760400,-96.799971,-408.179112,-110.740786,-460.786798,-114.038246,-479.77179,-16.08618,-65.076097,-6.78366,-30.006538,-2.736081,-4.007526,-2.558912,-66.189127,-77.51811,-0.000725,-0.016435,-0.107041,-3.29746,-3.178521,-13.940815,-10.744164,-0.094251,-0.001733,-0.009327,-2.082209,0.200138,-0.00909,-0.351862,-0.214366,-0.211608,-0.001884,-2.3e-05,-3e-05,-2.65939,-0.065583,-0.700765,-192.026959,-2942.440404,-1186.765837,-1755.674564,-46.22108,839.465095,-119.662019,-54.602524,-65.059494,32.270382,-7.997875,-10.717958,-28.571103,3.869716,-0.088986,-26.171711,-15.331295,-3.958702,-8.745251,-1.671324,-0.001656,-32.681646,-4.882665,-0.265234,-0.408162,-0.091644,-0.237576,-0.295662,-0.028454,-0.044465,-0.301128,-0.554677,-0.036834,-0.130031,-2.783592,-2.60662,-5.390212,-4.022547,0.0,-2.824022,-10.706438,-1.2015,-0.998268,-0.203232,0.0,-0.248755,-0.222852,-0.134088,0.0,-0.030537,-0.125866,-0.096986,-0.679774,-0.626985,-0.691912,-0.506613,-0.185299,-0.598716,-0.000115,1.869812,-0.348913,-0.828382,-42.275915,-3.950157,-0.253037,-0.318148,-2.29064,-3.447583,-0.040043,-9.408469,-0.212137,-11.955314,-1.019293,-2.473446,-5.62084,-1187.961723,-1757.811263,0.09201,-0.263857,-2945.772987,-2298.725139,-0.343415,-0.08972,-0.278878,-0.433135,-0.024048,-45.928741,839.515039,793.586302,0.071923,-0.040451,-0.405129,-5.618164,-14.681641,-3.573283,-0.468443,-0.417283,-190.670372,-1.143223,-2.722591,-0.825973,-1.935988,0.0,-1.276187,-0.020137,-0.042636,-29.797016,-116.020802,-42.96363,-34.888325,-3.861461,-0.317164,-0.007024,-0.143269,-43.417671,-0.212646,-0.019562,-4.4e-05,-0.000379,47.451144,0.738691,-0.536315,-0.061481,-0.152157,-0.002595,-4.678214,-0.014542,0.090492,-21.183166,-44.376426,-25.320085,-51.984826,-25.961228,-54.662236,-13.614497,-30.821719,-0.028857,-0.063214,-0.019198,-0.033778,-0.003149,-0.005184,-0.001431,-0.00189,5.742637,23.506363,4.067242,8.85857,-30.978121,-65.10072,-16.354808,-35.303704,-0.034569,-0.163184,-109.036398,0.533317,2.070952,0.376263,0.771894,0.748041,-0.000567,0.566264,-0.000708,-0.02921,-0.104665,-0.001358,0.0,0.039208,0.665644,-0.008999,2262.866176,-45175.257711,-0.622901,-30.716053,-61790.157098,-0.243136,-42051.166127,-9239.707081,-2.10805,-8.3e-05,-0.622896,-4e-05,-0.620772,-0.012257,-0.107878,-572669500.0,-58.544078,-440560400.0,-120441800.0,2.065998,1.534809,-33.302382,430.871014,-31.772492,-0.364694,-0.133771,-0.209468,-33.356505,-109.884564,-833.69102,3.631719,-213.110707,-108.409742,-512.437331,-86.617978,-14.295406,-925.373846,-561.770792,-21.996269,-37.630448,-151.747724,-24.832889,0.305572,-12.175933,-0.45614,1.0


#### 2. Исключение лишних и константных признаков

In [79]:
feats_by_train.drop(columns=['Unnamed: 0'], inplace=True)

In [80]:
feat_inspector = FeaturesInspect(feats_by_train, num_cat=10)
feat_inspector.collect()
feat_inspector.print()

Всего уникальных признаков : 255
...константные  признаки : 5
...вещественные признаки : 247
...другие признаки : 3
...бинарные признаки : 0
...категориальные признаки : 0


In [81]:
feat_inspector.feats['const']

{'139', '203', '75', '81', '85'}

In [82]:
feats_by_train.drop(columns=list(feat_inspector.feats['const']), inplace=True)

#### 3. Преобразование признака времени

In [83]:
feats_by_train['buy_time'] = feats_by_train['buy_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))

In [84]:
feats_by_train.head(3)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,76,77,78,79,80,82,83,84,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252
13,2046132,2018-08-20,300.820029,1599.480888,286.879214,1585.013202,281.461754,1563.90821,-16.08618,654.013903,-6.78366,-30.006538,-2.736081,-4.007526,-2.558912,259.130873,320.10189,-0.000725,-0.016435,-0.107041,-5.41746,-3.178521,-13.940815,-10.744164,-0.094251,-0.001733,-0.009327,-2.082209,0.200138,-0.00909,-0.351862,-0.214366,-0.211608,-0.001884,-2.3e-05,-3e-05,-2.65939,-0.065583,-0.700765,-192.026959,-2697.843724,-942.169157,-1755.674564,-89.504287,-119.724355,276.621311,179.764146,96.857176,-7.896282,-6.994502,-10.717958,-27.612844,-11.130284,-15.088986,-12.171711,-5.331295,-3.958702,-7.745251,-1.671324,-0.001656,63.318354,20.117335,0.434766,-0.148162,0.098356,0.392424,0.074338,-0.028454,-0.044465,-0.301128,-0.554677,-0.036834,3.869969,-2.783592,-2.60662,-5.390212,-4.022547,-2.824022,-10.706438,-1.2015,-0.998268,-0.203232,-0.248755,-0.222852,-0.134088,-0.030537,-0.125866,-0.096986,-0.679774,-0.626985,-0.691912,-0.506613,-0.185299,-0.598716,-0.000115,-0.250188,-0.348913,-0.828382,-42.275915,-3.950157,-0.253037,-0.318148,-2.29064,-3.447583,-0.040043,-9.408469,-0.212137,61.364686,-1.019293,-2.473446,5.37916,-943.365043,-1757.811263,0.40201,0.426143,-2701.176307,-2298.725139,-0.343415,-0.08972,-0.278878,-0.433135,-0.024048,-89.211948,-119.674411,-208.886358,-0.058077,-0.060451,0.334871,4.381836,42.318359,1.426717,-0.468443,-0.217283,-190.670372,-1.143223,-2.722591,-0.825973,-1.935988,-1.276187,-0.020137,-0.042636,94.319654,540.562498,34.03637,-34.888325,-3.861461,0.182836,-0.007024,0.356731,-8.417671,-0.212646,-0.019562,-4.4e-05,-0.000379,-2.548856,-0.261309,0.463685,-0.061481,-0.152157,-0.002595,-3.678214,-0.014542,0.180492,-21.183166,-44.376426,-25.320085,-51.984826,38.038772,179.704434,-7.614497,6.444947,-0.028857,-0.063214,-0.019198,-0.033778,-0.003149,-0.005184,-0.001431,-0.00189,-1.257363,-2.793637,-1.932758,-5.008096,39.021879,96.81594,-2.354808,2.496296,-0.034569,-0.163184,3886.402802,-0.466683,4.070952,-0.623737,-0.228106,0.748041,-0.000567,-0.433736,-0.000708,-0.02921,-0.104665,-0.001358,-0.960792,0.665644,-0.008999,-11953.712824,-45175.257711,-0.622901,-30.716053,-61790.157098,0.756864,-36221.166127,-9239.707081,-2.10805,-8.3e-05,-0.622896,-4e-05,-0.620772,-0.012257,-0.107878,-572669500.0,-58.544078,1092670000.0,-120441800.0,-3.91885,-1.465191,-33.302382,-230.128986,-32.772492,-0.364694,-0.133771,-0.209468,-28.356505,-109.884564,-876.69102,-5.368281,-247.110707,-108.409742,-512.437331,-106.617978,-17.295406,-977.373846,-613.770792,-25.996269,-35.630448,-295.747724,-17.832889,-0.694428,-4.175933,-0.45614,0.0
16,2050810,2018-10-29,-86.209971,91.820888,-84.480786,110.333202,-89.898246,89.22821,-16.08618,-65.076097,-6.78366,-30.006538,-2.736081,-4.007526,-2.558912,-66.189127,-66.92811,-0.000725,-0.016435,-0.107041,-5.41746,-3.178521,1.729185,-10.744164,-0.094251,-0.001733,-0.009327,-2.082209,0.200138,-0.00909,-0.351862,-0.214366,-0.211608,-0.001884,-2.3e-05,-3e-05,-2.65939,-0.065583,-0.700765,-192.026959,-2655.593724,-991.295137,-1664.298584,-89.504287,-119.724355,-0.028689,2.730812,-2.759494,-7.896282,-7.909355,-10.717958,-28.571103,-10.130284,-15.088986,-41.171711,-8.331295,-4.958702,-8.745251,-1.671324,-0.001656,7.318354,0.117335,-0.265234,-0.168162,-0.001644,-0.237576,-0.255662,-0.028454,-0.044465,-0.301128,-0.554677,-0.036834,-0.130031,-2.783592,-2.60662,-5.390212,-4.022547,-2.824022,-10.706438,-1.2015,-0.998268,-0.203232,-0.248755,-0.222852,-0.134088,-0.030537,-0.125866,-0.096986,-0.679774,-0.626985,-0.691912,-0.506613,-0.185299,-0.598716,-0.000115,-0.250188,-0.348913,-0.828382,-42.275915,-3.950157,-0.253037,-0.318148,4.47936,-3.447583,-0.040043,7.951531,-0.212137,5.404686,-1.019293,3.526554,0.37916,-992.506623,-1666.435283,0.56201,0.286143,-2658.941887,-2011.894039,-0.343415,-0.08972,-0.278878,-0.433135,-0.024048,-89.211948,-119.674411,-208.886358,-0.058077,-0.060451,0.344871,4.381836,-0.681641,1.426717,-0.468443,-0.017283,2163.329628,-1.143223,-2.722591,-0.825973,-1.935988,-1.276187,-0.020137,-0.042636,4.886314,-60.820805,18.03637,1.111675,-3.861461,-0.317164,-0.007024,-0.143269,-7.417671,-0.212646,-0.019562,-4.4e-05,-0.000379,-2.548856,-0.261309,-0.536315,-0.061481,-0.152157,-0.002595,-4.678214,-0.014542,-0.169508,9.816834,12.95691,13.679915,10.315174,5.038772,2.6711,8.385503,0.111615,-0.028857,-0.063214,-0.019198,-0.033778,-0.003149,-0.005184,-0.001431,-0.00189,-1.257363,-2.793637,-1.932758,-5.008096,8.021879,-2.80072,13.645192,-7.687035,-0.034569,-0.163184,-109.036398,0.533317,3.070952,-0.623737,-0.228106,0.748041,-0.000567,-0.433736,-0.000708,0.97079,-0.104665,-0.001358,0.039208,-0.334356,-0.008999,-11953.712824,-45175.257711,-0.622901,-30.716053,-61790.157098,-0.243136,-42051.166127,-9239.707081,-2.10805,-8.3e-05,-0.622896,-4e-05,-0.620772,-0.012257,-0.107878,968263700.0,-57.274078,-440560400.0,-120441800.0,-6.900668,-1.465191,-33.302382,-244.128986,-38.772492,-0.364694,-0.133771,-0.209468,-34.356505,-109.884564,-876.69102,-5.368281,-247.110707,-108.409742,-512.437331,-106.617978,-17.295406,-977.373846,-613.770792,-23.996269,190.369552,-286.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
19,2070757,2018-10-29,-96.799971,-408.179112,-110.740786,-460.786798,-114.038246,-479.77179,-16.08618,-65.076097,-6.78366,-30.006538,-2.736081,-4.007526,-2.558912,-66.189127,-77.51811,-0.000725,-0.016435,-0.107041,-3.29746,-3.178521,-13.940815,-10.744164,-0.094251,-0.001733,-0.009327,-2.082209,0.200138,-0.00909,-0.351862,-0.214366,-0.211608,-0.001884,-2.3e-05,-3e-05,-2.65939,-0.065583,-0.700765,-192.026959,-2942.440404,-1186.765837,-1755.674564,-46.22108,839.465095,-119.662019,-54.602524,-65.059494,32.270382,-7.997875,-10.717958,-28.571103,3.869716,-0.088986,-26.171711,-15.331295,-3.958702,-8.745251,-1.671324,-0.001656,-32.681646,-4.882665,-0.265234,-0.408162,-0.091644,-0.237576,-0.295662,-0.028454,-0.044465,-0.301128,-0.554677,-0.036834,-0.130031,-2.783592,-2.60662,-5.390212,-4.022547,-2.824022,-10.706438,-1.2015,-0.998268,-0.203232,-0.248755,-0.222852,-0.134088,-0.030537,-0.125866,-0.096986,-0.679774,-0.626985,-0.691912,-0.506613,-0.185299,-0.598716,-0.000115,1.869812,-0.348913,-0.828382,-42.275915,-3.950157,-0.253037,-0.318148,-2.29064,-3.447583,-0.040043,-9.408469,-0.212137,-11.955314,-1.019293,-2.473446,-5.62084,-1187.961723,-1757.811263,0.09201,-0.263857,-2945.772987,-2298.725139,-0.343415,-0.08972,-0.278878,-0.433135,-0.024048,-45.928741,839.515039,793.586302,0.071923,-0.040451,-0.405129,-5.618164,-14.681641,-3.573283,-0.468443,-0.417283,-190.670372,-1.143223,-2.722591,-0.825973,-1.935988,-1.276187,-0.020137,-0.042636,-29.797016,-116.020802,-42.96363,-34.888325,-3.861461,-0.317164,-0.007024,-0.143269,-43.417671,-0.212646,-0.019562,-4.4e-05,-0.000379,47.451144,0.738691,-0.536315,-0.061481,-0.152157,-0.002595,-4.678214,-0.014542,0.090492,-21.183166,-44.376426,-25.320085,-51.984826,-25.961228,-54.662236,-13.614497,-30.821719,-0.028857,-0.063214,-0.019198,-0.033778,-0.003149,-0.005184,-0.001431,-0.00189,5.742637,23.506363,4.067242,8.85857,-30.978121,-65.10072,-16.354808,-35.303704,-0.034569,-0.163184,-109.036398,0.533317,2.070952,0.376263,0.771894,0.748041,-0.000567,0.566264,-0.000708,-0.02921,-0.104665,-0.001358,0.039208,0.665644,-0.008999,2262.866176,-45175.257711,-0.622901,-30.716053,-61790.157098,-0.243136,-42051.166127,-9239.707081,-2.10805,-8.3e-05,-0.622896,-4e-05,-0.620772,-0.012257,-0.107878,-572669500.0,-58.544078,-440560400.0,-120441800.0,2.065998,1.534809,-33.302382,430.871014,-31.772492,-0.364694,-0.133771,-0.209468,-33.356505,-109.884564,-833.69102,3.631719,-213.110707,-108.409742,-512.437331,-86.617978,-14.295406,-925.373846,-561.770792,-21.996269,-37.630448,-151.747724,-24.832889,0.305572,-12.175933,-0.45614,1.0


In [85]:
feats_by_train.rename(columns={'buy_time': 'profile_time'}, inplace=True)

#### 4. Преобразование формата данных

In [86]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_datetime64_ns_dtype(df[col]):
            continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [87]:
feats_by_train=reduce_mem_usage(feats_by_train)

Memory usage of dataframe is 1597.41 MB
Memory usage after optimization is: 491.63 MB
Decreased by 69.2%


In [88]:
feats_by_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 834168 entries, 13 to 4512521
Columns: 250 entries, id to 252
dtypes: datetime64[ns](1), float16(197), float32(51), int32(1)
memory usage: 491.6 MB


In [89]:
feats_by_train.head(3)

Unnamed: 0,id,profile_time,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,76,77,78,79,80,82,83,84,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252
13,2046132,2018-08-20,300.820038,1599.480835,286.879211,1585.013184,281.461761,1563.908203,-16.08618,654.013916,-6.78366,-30.006538,-2.736328,-4.007526,-2.558594,259.25,320.0,-0.000725,-0.016434,-0.107056,-5.417969,-3.177734,-13.9375,-10.742188,-0.094238,-0.001733,-0.009331,-2.082031,0.200195,-0.009094,-0.351807,-0.214355,-0.211548,-0.001884,-2.3e-05,-3e-05,-2.660156,-0.065613,-0.700684,-192.026962,-2697.84375,-942.169128,-1755.674561,-89.504288,-119.724358,276.5,179.75,96.875,-7.894531,-6.994502,-10.717958,-27.612844,-11.132812,-15.085938,-12.171875,-5.332031,-3.958984,-7.746094,-1.670898,-0.001656,63.3125,20.125,0.434814,-0.148193,0.098328,0.392334,0.074341,-0.028458,-0.044464,-0.301025,-0.554688,-0.036835,3.869141,-2.783203,-2.607422,-5.390625,-4.023438,-2.824219,-10.703125,-1.2015,-0.998268,-0.203247,-0.248779,-0.2229,-0.134033,-0.030533,-0.125854,-0.096985,-0.679688,-0.626953,-0.691895,-0.506836,-0.185303,-0.598633,-0.000115,-0.250244,-0.348877,-0.828613,-42.275913,-3.949219,-0.25293,-0.318115,-2.291016,-3.447266,-0.040039,-9.40625,-0.212158,61.375,-1.019531,-2.472656,5.378906,-943.365051,-1757.811279,0.4021,0.426025,-2701.17627,-2298.725098,-0.343506,-0.089722,-0.278809,-0.433105,-0.024048,-89.211945,-119.674408,-208.886353,-0.058075,-0.060455,0.334961,4.382812,42.3125,1.426758,-0.468506,-0.217285,-190.625,-1.143555,-2.722656,-0.826172,-1.935547,-1.276367,-0.020142,-0.042633,94.3125,540.5,34.03125,-34.875,-3.861328,0.182861,-0.007023,0.356689,-8.414062,-0.212646,-0.019562,-4.4e-05,-0.000379,-2.548828,-0.26123,0.463623,-0.061493,-0.1521,-0.002596,-3.677734,-0.014542,0.180542,-21.1875,-44.375,-25.3125,-52.0,38.03125,179.75,-7.613281,6.445312,-0.028854,-0.063232,-0.019196,-0.033783,-0.003149,-0.005184,-0.001431,-0.00189,-1.257812,-2.792969,-1.932617,-5.007812,39.03125,96.8125,-2.355469,2.496094,-0.034576,-0.163208,3886.0,-0.466797,4.070312,-0.623535,-0.228149,0.748047,-0.000567,-0.433838,-0.000708,-0.029205,-0.104675,-0.001357,-0.960938,0.665527,-0.008995,-11953.712891,-45175.257812,-0.623047,-30.716053,-61790.15625,0.756836,-36221.167969,-9239.707031,-2.107422,-8.3e-05,-0.623047,-4e-05,-0.620605,-0.01226,-0.107849,-572669504.0,-58.53125,1092670000.0,-120441800.0,-3.917969,-1.464844,-33.30238,-230.128983,-32.78125,-0.364746,-0.133789,-0.209473,-28.359375,-109.884567,-876.69104,-5.367188,-247.125,-108.4375,-512.437317,-106.617981,-17.296875,-977.37384,-613.770813,-25.996269,-35.625,-295.747711,-17.832888,-0.694336,-4.175933,-0.456055,0.0
16,2050810,2018-10-29,-86.209969,91.820885,-84.480789,110.333199,-89.898247,89.22821,-16.08618,-65.076096,-6.78366,-30.006538,-2.736328,-4.007526,-2.558594,-66.1875,-66.9375,-0.000725,-0.016434,-0.107056,-5.417969,-3.177734,1.729492,-10.742188,-0.094238,-0.001733,-0.009331,-2.082031,0.200195,-0.009094,-0.351807,-0.214355,-0.211548,-0.001884,-2.3e-05,-3e-05,-2.660156,-0.065613,-0.700684,-192.026962,-2655.59375,-991.295166,-1664.298584,-89.504288,-119.724358,-0.028687,2.730469,-2.759766,-7.894531,-7.909355,-10.717958,-28.571102,-10.132812,-15.085938,-41.15625,-8.328125,-4.957031,-8.742188,-1.670898,-0.001656,7.316406,0.11731,-0.265137,-0.168213,-0.001644,-0.237549,-0.255615,-0.028458,-0.044464,-0.301025,-0.554688,-0.036835,-0.130005,-2.783203,-2.607422,-5.390625,-4.023438,-2.824219,-10.703125,-1.2015,-0.998268,-0.203247,-0.248779,-0.2229,-0.134033,-0.030533,-0.125854,-0.096985,-0.679688,-0.626953,-0.691895,-0.506836,-0.185303,-0.598633,-0.000115,-0.250244,-0.348877,-0.828613,-42.275913,-3.949219,-0.25293,-0.318115,4.480469,-3.447266,-0.040039,7.953125,-0.212158,5.40625,-1.019531,3.527344,0.37915,-992.506653,-1666.435303,0.562012,0.286133,-2658.941895,-2011.894043,-0.343506,-0.089722,-0.278809,-0.433105,-0.024048,-89.211945,-119.674408,-208.886353,-0.058075,-0.060455,0.344971,4.382812,-0.681641,1.426758,-0.468506,-0.017288,2164.0,-1.143555,-2.722656,-0.826172,-1.935547,-1.276367,-0.020142,-0.042633,4.886719,-60.8125,18.03125,1.111328,-3.861328,-0.317139,-0.007023,-0.143311,-7.417969,-0.212646,-0.019562,-4.4e-05,-0.000379,-2.548828,-0.26123,-0.536133,-0.061493,-0.1521,-0.002596,-4.679688,-0.014542,-0.169556,9.820312,12.953125,13.679688,10.3125,5.039062,2.671875,8.382812,0.111633,-0.028854,-0.063232,-0.019196,-0.033783,-0.003149,-0.005184,-0.001431,-0.00189,-1.257812,-2.792969,-1.932617,-5.007812,8.023438,-2.800781,13.648438,-7.6875,-0.034576,-0.163208,-109.0625,0.533203,3.070312,-0.623535,-0.228149,0.748047,-0.000567,-0.433838,-0.000708,0.970703,-0.104675,-0.001357,0.039215,-0.334473,-0.008995,-11953.712891,-45175.257812,-0.623047,-30.716053,-61790.15625,-0.243164,-42051.167969,-9239.707031,-2.107422,-8.3e-05,-0.623047,-4e-05,-0.620605,-0.01226,-0.107849,968263680.0,-57.28125,-440560400.0,-120441800.0,-6.902344,-1.464844,-33.30238,-244.128983,-38.78125,-0.364746,-0.133789,-0.209473,-34.34375,-109.884567,-876.69104,-5.367188,-247.125,-108.4375,-512.437317,-106.617981,-17.296875,-977.37384,-613.770813,-23.996269,190.375,-286.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0
19,2070757,2018-10-29,-96.799973,-408.179108,-110.740784,-460.786804,-114.038246,-479.77179,-16.08618,-65.076096,-6.78366,-30.006538,-2.736328,-4.007526,-2.558594,-66.1875,-77.5,-0.000725,-0.016434,-0.107056,-3.296875,-3.177734,-13.9375,-10.742188,-0.094238,-0.001733,-0.009331,-2.082031,0.200195,-0.009094,-0.351807,-0.214355,-0.211548,-0.001884,-2.3e-05,-3e-05,-2.660156,-0.065613,-0.700684,-192.026962,-2942.44043,-1186.765869,-1755.674561,-46.221081,839.465088,-119.6875,-54.59375,-65.0625,32.28125,-7.997875,-10.717958,-28.571102,3.869141,-0.088989,-26.171875,-15.328125,-3.958984,-8.742188,-1.670898,-0.001656,-32.6875,-4.882812,-0.265137,-0.408203,-0.091675,-0.237549,-0.295654,-0.028458,-0.044464,-0.301025,-0.554688,-0.036835,-0.130005,-2.783203,-2.607422,-5.390625,-4.023438,-2.824219,-10.703125,-1.2015,-0.998268,-0.203247,-0.248779,-0.2229,-0.134033,-0.030533,-0.125854,-0.096985,-0.679688,-0.626953,-0.691895,-0.506836,-0.185303,-0.598633,-0.000115,1.870117,-0.348877,-0.828613,-42.275913,-3.949219,-0.25293,-0.318115,-2.291016,-3.447266,-0.040039,-9.40625,-0.212158,-11.953125,-1.019531,-2.472656,-5.621094,-1187.96167,-1757.811279,0.09198,-0.263916,-2945.772949,-2298.725098,-0.343506,-0.089722,-0.278809,-0.433105,-0.024048,-45.928741,839.515015,793.586304,0.071899,-0.040436,-0.405029,-5.617188,-14.679688,-3.574219,-0.468506,-0.417236,-190.625,-1.143555,-2.722656,-0.826172,-1.935547,-1.276367,-0.020142,-0.042633,-29.796875,-116.0,-42.96875,-34.875,-3.861328,-0.317139,-0.007023,-0.143311,-43.40625,-0.212646,-0.019562,-4.4e-05,-0.000379,47.4375,0.73877,-0.536133,-0.061493,-0.1521,-0.002596,-4.679688,-0.014542,0.090515,-21.1875,-44.375,-25.3125,-52.0,-25.96875,-54.65625,-13.617188,-30.828125,-0.028854,-0.063232,-0.019196,-0.033783,-0.003149,-0.005184,-0.001431,-0.00189,5.742188,23.5,4.066406,8.859375,-30.984375,-65.125,-16.359375,-35.3125,-0.034576,-0.163208,-109.0625,0.533203,2.070312,0.376221,0.771973,0.748047,-0.000567,0.566406,-0.000708,-0.029205,-0.104675,-0.001357,0.039215,0.665527,-0.008995,2262.866211,-45175.257812,-0.623047,-30.716053,-61790.15625,-0.243164,-42051.167969,-9239.707031,-2.107422,-8.3e-05,-0.623047,-4e-05,-0.620605,-0.01226,-0.107849,-572669504.0,-58.53125,-440560400.0,-120441800.0,2.066406,1.535156,-33.30238,430.871002,-31.765625,-0.364746,-0.133789,-0.209473,-33.34375,-109.884567,-833.69104,3.630859,-213.125,-108.4375,-512.437317,-86.617981,-14.296875,-925.37384,-561.770813,-21.996269,-37.625,-151.747726,-24.832888,0.305664,-12.175933,-0.456055,1.0


Запись

In [91]:
feats_by_train.to_csv('feats_by_train.csv.compress', compression='gzip')

Чтение

In [69]:
feats_by_train=pd.read_csv('feats_by_train.csv')

In [92]:
data_train.head(3)

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,target
0,0,540968,8.0,1537131600,0.0
1,1,1454121,4.0,1531688400,0.0
2,2,2458816,1.0,1534107600,0.0


In [93]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 831653 entries, 0 to 831652
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  831653 non-null  int64  
 1   id          831653 non-null  int64  
 2   vas_id      831653 non-null  float64
 3   buy_time    831653 non-null  int64  
 4   target      831653 non-null  float64
dtypes: float64(2), int64(3)
memory usage: 31.7 MB


In [20]:
all_deals = pd.merge(data_all, feats, how='outer', on='id')

In [21]:
all_deals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 933718 entries, 0 to 933717
Columns: 255 entries, Unnamed: 0 to 252
dtypes: bool(1), datetime64[ns](2), float16(197), float32(51), float64(2), int64(2)
memory usage: 583.3 MB


In [22]:
all_deals = all_deals.loc[all_deals.buy_time >= all_deals.profile_time]

In [23]:
all_deals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 488139 entries, 1 to 933717
Columns: 255 entries, Unnamed: 0 to 252
dtypes: bool(1), datetime64[ns](2), float16(197), float32(51), float64(2), int64(2)
memory usage: 304.9 MB


#### Сохранение данных

In [24]:
all_deals.to_pickle("user_deals.pkl.compress", compression="gzip")

#### Чтение данных

In [25]:
all_deals = pd.read_pickle("user_deals.pkl.compress", compression="gzip")

#### Определение действующего профиля

In [26]:
all_deals.reset_index(inplace=True)
all_deals.rename(columns={'id': 'user_id'}, inplace=True)
all_deals.head(5)

Unnamed: 0.1,index,Unnamed: 0,user_id,vas_id,buy_time,target,is_train,profile_time,0,1,...,243,244,245,246,247,248,249,250,251,252
0,1,1,1454121,4.0,2018-07-16,0.0,True,2018-07-09,547.27002,238.430893,...,-972.37384,-613.770813,-25.996269,-19.625,-278.747711,-24.832888,-0.694336,-11.175933,-0.456055,0.0
1,3,3,3535012,5.0,2018-09-03,0.0,True,2018-08-06,54.880028,12.970888,...,-977.37384,-613.770813,-25.996269,-18.625,-133.747726,-14.832889,-0.694336,-1.175933,-0.456055,0.0
2,6,6,2611143,2.0,2018-12-17,0.0,True,2018-08-06,-96.799973,-408.179108,...,-977.37384,-613.770813,-25.996269,-37.625,-306.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0
3,7,7,3577737,5.0,2018-12-24,0.0,True,2018-07-09,-96.799973,-10.719112,...,-949.37384,-613.770813,-25.996269,-35.625,-274.747711,106.167114,-0.694336,119.824066,-0.456055,1.0
4,8,8,2000856,1.0,2018-08-20,0.0,True,2018-07-16,-21.36997,108.780891,...,-977.37384,-613.770813,-25.996269,-37.625,-306.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0


Сортировка по сделкам и активным профилям

In [27]:
all_deals.sort_values(by=['user_id', 'vas_id', 'buy_time', 'target', 'profile_time'], ascending=False, inplace=True)

In [28]:
all_deals[all_deals.user_id == 4362538]

Unnamed: 0.1,index,Unnamed: 0,user_id,vas_id,buy_time,target,is_train,profile_time,0,1,...,243,244,245,246,247,248,249,250,251,252
417520,860840,828284,4362538,5.0,2018-12-31,0.0,True,2018-12-24,9.070029,-137.079117,...,-977.37384,-613.770813,-24.996269,-37.625,-290.747711,-25.832888,-0.694336,-12.175933,-0.456055,1.0
417519,860839,828284,4362538,5.0,2018-12-31,0.0,True,2018-11-19,-96.799973,-160.309113,...,-977.37384,-613.770813,-25.996269,-37.625,-280.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0


> Проверка - вверху каждой группы самый поздний профиль

Поиск всех дублей по группе

In [29]:
all_deals[all_deals.duplicated(subset=['user_id', 'vas_id', 'buy_time', 'target'], keep='first')]

Unnamed: 0.1,index,Unnamed: 0,user_id,vas_id,buy_time,target,is_train,profile_time,0,1,...,243,244,245,246,247,248,249,250,251,252
486303,931844,69389,4362664,2.0,2019-01-14,,False,2018-07-30,-93.409973,-398.849121,...,-977.373840,-613.770813,-25.996269,-37.625,-306.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0
417519,860839,828284,4362538,5.0,2018-12-31,0.0,True,2018-11-19,-96.799973,-160.309113,...,-977.373840,-613.770813,-25.996269,-37.625,-280.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0
409357,850681,818676,4362200,6.0,2018-12-24,0.0,True,2018-08-27,199.810028,-94.879112,...,-722.373840,-360.770782,-25.996269,-37.625,72.252274,31.167112,1.305664,42.824066,-0.456055,0.0
404782,844828,813135,4362012,2.0,2018-10-22,0.0,True,2018-07-23,-2.729971,-188.689117,...,-977.373840,-613.770813,-25.996269,-37.625,-306.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0
477252,922562,59994,4361999,6.0,2019-01-14,,False,2018-11-19,411.670044,107.920891,...,612.626160,616.229187,23.003731,79.375,622.252258,-25.832888,-0.694336,-12.175933,-0.456055,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60791,131860,127055,3249,4.0,2018-10-22,0.0,True,2018-08-06,302.780029,1165.970947,...,-926.373840,-604.770813,-21.996269,-26.625,4197.252441,-19.832888,-0.694336,-6.175933,-0.456055,1.0
50064,108888,104959,2677,1.0,2018-11-12,0.0,True,2018-07-23,-35.779972,-6.479112,...,-968.373840,-612.770813,-25.996269,-29.625,-168.747726,-18.832888,-0.694336,-12.175933,6.542969,1.0
42263,92314,89000,2311,2.0,2018-12-17,0.0,True,2018-11-12,-40.019970,323.420898,...,57.626152,325.229218,-22.996269,-37.625,-150.747726,-9.832889,-0.694336,-12.175933,-0.456055,0.0
35515,77844,75073,1946,1.0,2018-10-29,0.0,True,2018-08-20,-69.509972,94.090889,...,-977.373840,-613.770813,-25.996269,-37.625,-306.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0


Выполнение удаление дубликатов

In [30]:
all_deals.drop_duplicates(subset=['user_id', 'vas_id', 'buy_time', 'target'], keep='first', inplace=True)

In [31]:
all_deals[all_deals.user_id == 4362538]

Unnamed: 0.1,index,Unnamed: 0,user_id,vas_id,buy_time,target,is_train,profile_time,0,1,...,243,244,245,246,247,248,249,250,251,252
417520,860840,828284,4362538,5.0,2018-12-31,0.0,True,2018-12-24,9.070029,-137.079117,...,-977.37384,-613.770813,-24.996269,-37.625,-290.747711,-25.832888,-0.694336,-12.175933,-0.456055,1.0


Сохранение результата

In [32]:
all_deals.to_pickle("user_deals_clear.pkl.compress", compression="gzip")

Повторный контоль на типы переменных

In [211]:
feat_inspector = FeaturesInspect(all_deals.drop(columns=['buy_time', 'profile_time', 'target']), num_cat=10)
feat_inspector.collect()
feat_inspector.print()

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Всего уникальных признаков : 250
...константные  признаки : 0
...вещественные признаки : 248
...другие признаки : 2
...бинарные признаки : 0
...категориальные признаки : 1


Преобразование признаков делать не понадобится

In [33]:
all_deals.head(3)

Unnamed: 0.1,index,Unnamed: 0,user_id,vas_id,buy_time,target,is_train,profile_time,0,1,...,243,244,245,246,247,248,249,250,251,252
487616,933188,70715,4362720,2.0,2019-01-07,,False,2018-11-12,-96.799973,-94.61911,...,-977.37384,-613.770813,-25.996269,-37.625,-304.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0
487434,933000,70535,4362712,5.0,2019-01-14,,False,2018-07-23,-96.799973,-408.179108,...,-977.37384,-613.770813,-25.996269,-37.625,-306.747711,-25.832888,-0.694336,-12.175933,-0.456055,0.0
487053,932613,70157,4362697,5.0,2019-01-07,,False,2018-12-31,-90.699974,-262.139099,...,-964.37384,-611.770813,-21.996269,-37.625,-63.747723,31.167112,-0.694336,-12.175933,-0.456055,1.0


In [34]:
all_deals[~all_deals.is_train].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69535 entries, 487616 to 420664
Columns: 256 entries, index to 252
dtypes: bool(1), datetime64[ns](2), float16(197), float32(51), float64(2), int64(3)
memory usage: 44.0 MB


In [35]:
all_deals[all_deals.is_train].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 408724 entries, 420299 to 64
Columns: 256 entries, index to 252
dtypes: bool(1), datetime64[ns](2), float16(197), float32(51), float64(2), int64(3)
memory usage: 258.4 MB


In [36]:
deals_train = all_deals[all_deals.is_train]

In [38]:
deals_train.columns

Index(['index', 'Unnamed: 0', 'user_id', 'vas_id', 'buy_time', 'target',
       'is_train', 'profile_time', '0', '1',
       ...
       '243', '244', '245', '246', '247', '248', '249', '250', '251', '252'],
      dtype='object', length=256)

In [39]:
deals_train.drop(columns=['index', 'Unnamed: 0', 'user_id', 'buy_time'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [49]:
deals_train.drop(columns=['is_train', 'profile_time'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [50]:
deals_train.head(3)

Unnamed: 0,vas_id,0,1,2,3,4,5,6,7,8,...,243,244,245,246,247,248,249,250,251,252
420299,2.0,-25.60997,-103.109116,-39.550785,-108.276794,-44.968246,-129.38179,-16.08618,-65.076096,-6.78366,...,12.626154,-613.770813,-23.996269,-37.625,-238.747726,-25.832888,-0.694336,-12.175933,-0.456055,0.0
420160,5.0,-96.799973,-103.099113,-27.720785,34.053204,-33.138245,46.848209,-16.08618,-65.076096,-6.78366,...,-598.37384,-283.770782,-22.996269,-34.625,-208.747726,-10.832889,-0.694336,2.824067,-0.456055,1.0
419335,6.0,288.800018,-9.859112,363.839203,284.973206,358.421753,263.868225,-16.08618,-65.076096,-6.78366,...,1351.626099,745.229187,-25.996269,-28.625,26.252275,34.16711,-0.694336,47.824066,-0.456055,0.0


In [48]:
data_test[data_test.duplicated(subset=['id'], keep=False)]

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,is_train
123,123,3043970,5.0,1547413200,False
124,124,3043970,6.0,1546808400,False
249,249,921445,2.0,1546808400,False
250,250,921445,6.0,1546808400,False
429,429,2806386,2.0,1546808400,False
...,...,...,...,...,...
70955,70955,158241,6.0,1547413200,False
71120,71120,563350,2.0,1546808400,False
71121,71121,563350,6.0,1548018000,False
71129,71129,653076,2.0,1546808400,False
