In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipaddress
from datetime import datetime
import time
import re
import os
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer as DV
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [20]:
def load_data():
    """Read ``./data_full.csv`` (semicolon-separated) into a DataFrame.

    Rows that cannot be parsed are skipped instead of raising
    (``error_bad_lines=False``).
    """
    raw_frame = pd.read_csv("./data_full.csv", sep=";", error_bad_lines=False)
    return raw_frame

In [3]:
# Video catalog, read from a semicolon-separated CSV; rows that fail to parse are skipped.
catalog = pd.read_csv("./catalog.csv", sep = ";", error_bad_lines=False)

In [5]:
# Preview the first catalog rows.
catalog.head()

Unnamed: 0,idec,track_id,video_id,series_id,title,season,episode,episode_global,date_efir,duration
0,PR592505,10258734,cd59084eb50e6ebe9fe20ad04e0c17db,10456850000.0,"\tБыть или не быть, 1 сезон, 1 серия",1.0,1.0,1.0,02.04.2017 22:00:34,
1,PR609630,10843255,a5d2e6bae4752d9ec0fae9b53f00700d,10592470000.0,,11.0,1.0,1.0,,
2,PR597128,10573153,bcd45606c1f7cbb5cb9d1dbd7d346cc3,10479560000.0,,7.0,1.0,1.0,06.09.2017 22:46:05,
3,PR257331,6617650,609217ac7b8c4e55d44c4a29bf52554b,,"""Интерны. История болезни""",,,,12.06.2012 22:00:44,
4,PR529698,7991493,bde3620ba038b3e8dd41d03bf516a7eb,10241020000.0,#ЖАННАПОЖЕНИ,1.0,1.0,101.0,05.09.2015 13:30:00,


In [6]:
def add_device(df):
    """Add a ``device`` column classifying each row's ``uagent`` string.

    The column is written onto ``df`` itself, which is also returned.

    Values are ``'tablet'``, ``'mobile'``, ``'desktop'`` or ``'0'`` (no
    pattern matched, or the user agent is missing / not a string).

    The most specific patterns are tested first. The desktop pattern
    contains ``Mozilla``, which prefixes almost every browser user agent,
    so testing it first would make the mobile/tablet branches unreachable
    for browser traffic (e.g. ``Mozilla/5.0 (iPad; ...)``).

    Classification is done per value, so it does not depend on the frame
    having a default 0..n-1 index.

    If ``df`` has no ``uagent`` column every row gets ``'0'``.
    """
    tablet_re = re.compile('(iPad)')
    mobile_re = re.compile('(iPhone|RutubeAndroid|okhttp|CFNetwork|UCBrowser)')
    desktop_re = re.compile('(Windows|Linux|Mozilla)')

    def classify(uagent):
        # NaN / None / non-text values cannot be matched: keep the placeholder.
        if not isinstance(uagent, str):
            return '0'
        if tablet_re.search(uagent):
            return 'tablet'
        if mobile_re.search(uagent):
            return 'mobile'
        if desktop_re.search(uagent):
            return 'desktop'
        return '0'

    if 'uagent' in df.columns:
        df['device'] = df['uagent'].apply(classify)
    else:
        df['device'] = pd.Series('0', index=df.index)

    return df

In [7]:
def mergeData(data1, data2, field):
    """Inner-join two frames on ``field`` after coercing it to numeric.

    The key column of *both* input frames is overwritten in place with its
    numeric version; values that cannot be parsed become NaN.
    """
    for frame in (data1, data2):
        frame[field] = pd.to_numeric(frame[field], errors='coerce')
    return data1.merge(data2, left_on=field, right_on=field)

In [8]:
def convIpToNumber(data):
    """Map a Series of dotted-quad IPv4 strings to their integer values."""
    def to_int(address):
        return int(ipaddress.IPv4Address(address))

    return data.apply(to_int)

In [9]:
def applyDate(x):
    """Convert a ``"%d.%m.%Y %H:%M:%S"`` string to a local-time Unix timestamp.

    Returns the integer timestamp on success. When ``x`` is missing, is not
    a string, does not match the format, or cannot be represented by
    ``time.mktime``, the sentinel ``1e-6`` is returned instead.

    Only parse/convert errors are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate while this runs over a large column.
    """
    try:
        parsed = time.strptime(x, "%d.%m.%Y %H:%M:%S")
        return int(time.mktime(parsed))
    except (TypeError, ValueError, OverflowError, OSError):
        return 1e-6

In [10]:
def confDateStringToTimestamp(data):
    """Run ``applyDate`` over every element of ``data`` and return the result."""
    converted = data.apply(applyDate)
    return converted

In [11]:
def filter_users_with_n_views(data, user_column = "ip", seq_len = 5):
    """Keep only the rows of users that have at least ``seq_len`` rows.

    Users are identified by the values of ``user_column``.
    """
    def has_enough_views(user_rows):
        return len(user_rows) >= seq_len

    return data.groupby(user_column).filter(has_enough_views)

def extract_sequences_by_user(data, user_column = "ip"):
    """Group ``data`` by ``user_column`` and return the pandas GroupBy object."""
    per_user = data.groupby(user_column)
    return per_user

In [12]:
def filter_videos_with_n_views(data, n = 500):
    """Keep only rows whose ``track_id`` occurs at least ``n`` times.

    ``track_id`` is first coerced to numeric on ``data`` itself
    (unparseable values become NaN); rows without a valid ``track_id`` are
    then dropped before counting.
    """
    data["track_id"] = pd.to_numeric(data["track_id"], errors='coerce', downcast='integer')
    with_track = data[data["track_id"].notna()]
    return with_track.groupby("track_id").filter(lambda rows: len(rows) >= n)

In [13]:
# Columns passed to the DictVectorizer by create_cat_vectorizer / cat_vectorize.
cat_cols = [ 'series_id', 'idec', 'author_id']
# Columns read as already-numeric features by create_x_num_cut_vectors.
num_cols = [ 'duration', 'date_efir']

def label_encoder(data):
    """Return a scikit-learn ``LabelEncoder`` fitted on ``data``."""
    encoder = preprocessing.LabelEncoder()
    return encoder.fit(data)

def create_cat_vectorizer(data):
    """Fit a dense ``DictVectorizer`` on the ``cat_cols`` columns of ``data``.

    Missing values are replaced by the string ``'NA'`` before fitting. The
    replacement is done on a fresh frame (non-inplace ``fillna``), so
    ``data`` is not touched and no SettingWithCopyWarning is raised.

    NOTE(review): values that are numeric rather than strings are treated
    by DictVectorizer as one numeric feature, not one-hot encoded; the
    printed vectors contain raw magnitudes around 1e10, which suggests this
    happens for at least one column -- confirm that is intended.
    """
    records = data[cat_cols].fillna('NA').to_dict(orient = 'records')

    vectorizer = DV(sparse = False)
    vectorizer.fit(records)

    return vectorizer

def cat_vectorize(data, vectorizer):
    """Transform the ``cat_cols`` columns of ``data`` with a fitted vectorizer.

    Missing values are replaced by the string ``'NA'`` first. The
    replacement is done on a fresh frame (non-inplace ``fillna``), so
    ``data`` is not touched and no SettingWithCopyWarning is raised.

    Returns whatever ``vectorizer.transform`` returns for the row records.
    """
    records = data[cat_cols].fillna('NA').to_dict(orient = 'records')

    return vectorizer.transform(records)
    

In [14]:
def replace_null_value(x):
    """Return 0 for the literal string ``'null'``; any other value unchanged."""
    return 0 if x == 'null' else x

def scale_data(data):
    """Standardise a numeric Series with a freshly fitted ``StandardScaler``.

    Missing values and the literal string ``'null'`` are replaced by 0
    before scaling. The input Series is left unmodified.

    Returns a 2-D array of shape ``(len(data), 1)``, as produced by
    ``StandardScaler.fit_transform``.
    """
    cleaned = data.fillna(0).apply(replace_null_value)
    # Series.reshape is deprecated/removed in pandas; reshape the ndarray.
    column = cleaned.values.reshape(-1, 1)
    return StandardScaler().fit_transform(column)

In [15]:
def prepare_num(data):
    """Replace ``date_efir`` and ``duration`` in ``data`` with scaled values.

    Each column is passed through ``scale_data`` independently; ``data`` is
    modified in place and returned.
    """
    for column in ('date_efir', 'duration'):
        data[column] = scale_data(data[column])
    return data

In [22]:
def prepare_data(data, catalog):
    """Build the modelling frame from the raw view log and the catalog.

    Steps, in order: convert ``ip`` to integers, join with ``catalog`` on
    ``track_id``, keep frequently viewed videos, keep users with enough
    views (printing the row count after each filter), convert ``date_efir``
    to timestamps and scale the numeric columns.
    """
    data['ip'] = convIpToNumber(data['ip'])
    data = mergeData(data, catalog, "track_id")

    # Apply both frequency filters with their default thresholds.
    for keep_frequent in (filter_videos_with_n_views, filter_users_with_n_views):
        data = keep_frequent(data)
        print(len(data))

    data['date_efir'] = confDateStringToTimestamp(data['date_efir'])

    # Device detection via add_device(data) is currently disabled.
    return prepare_num(data)

# Reuse the pickled result of prepare_data when it exists; otherwise compute
# it once and cache it next to the notebook.
# Note: pickle.load executes arbitrary code, so only load a cache file that
# this notebook produced itself.
if not os.path.exists('prepared.pickle'):
    prepared = prepare_data(load_data(), catalog)
    with open('prepared.pickle', 'wb') as cache_file:
        pickle.dump(prepared, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open('prepared.pickle', 'rb') as cache_file:
        prepared = pickle.load(cache_file)

b'Skipping line 10780784: expected 10 fields, saw 11\nSkipping line 10793876: expected 10 fields, saw 11\n'
b'Skipping line 10821923: expected 10 fields, saw 11\n'
  if self.run_code(code, result):


6413777
3963775


  # This is added back by InteractiveShellApp.init_path()


In [23]:
# Preview the prepared frame (view log joined with the catalog, numeric columns scaled).
prepared.head()

Unnamed: 0,timestamp,ip,track_id,cookie,live,user_id,referer,uagent,rightholder,author_id,idec,video_id,series_id,title,season,episode,episode_global,date_efir,duration
7,1517778003,3450234470,10912114,fae6aff224474a5b9420e5e51963d095,0,,https%3A%2F%2Frutube.ru%2Fvideo%2F0f5c9c5839b1...,Mozilla/5.0 (Linux; Android 7.0; SM-A510F Buil...,2,1480930,PR602344,0f5c9c5839b11f767d30d7093c9e34cb,10021850000.0,ДОМ-2 Город любви 5018 дня,124.0,4.0,3681.0,0.181163,0.969393
8,1517778004,18957780,10912114,ac2fd72a7d728057938fa8401fcfbef7,0,,http%3A%2F%2Fdom2hd.su%2Fvideo%2F04-02-2018%3F...,Mozilla/5.0 (iPad; CPU OS 11_1_1 like Mac OS X...,2,1480930,PR602344,0f5c9c5839b11f767d30d7093c9e34cb,10021850000.0,ДОМ-2 Город любви 5018 дня,124.0,4.0,3681.0,0.181163,0.969393
11,1517778005,3862637538,10912114,0e62c1b53377aa48ef50d84d12cf188e,0,,http%3A%2F%2Fvideo-dom2.ru%2Fonlinetv%2Fdom2_o...,Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like ...,2,1480930,PR602344,0f5c9c5839b11f767d30d7093c9e34cb,10021850000.0,ДОМ-2 Город любви 5018 дня,124.0,4.0,3681.0,0.181163,0.969393
13,1517778007,3904348709,10912114,c9bc203d464b12649aeefb54a65d0b0c,0,,http%3A%2F%2Fnash-dom2.su%2Fefiry%2F49742-dom-...,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,2,1480930,PR602344,0f5c9c5839b11f767d30d7093c9e34cb,10021850000.0,ДОМ-2 Город любви 5018 дня,124.0,4.0,3681.0,0.181163,0.969393
17,1517778009,81457654,10912114,8de0448a6a1cb7f6f21a0c7e8df8bc54,0,,http%3A%2F%2Fdom2.love%2Fsvejiye-serii%2F10268...,Mozilla/5.0 (Mobile; Windows Phone 8.1; Androi...,2,1480930,PR602344,0f5c9c5839b11f767d30d7093c9e34cb,10021850000.0,ДОМ-2 Город любви 5018 дня,124.0,4.0,3681.0,0.181163,0.969393


In [24]:
# Fit the vectorizer once on the full prepared frame; later cells read this global.
cat_vectorizer = create_cat_vectorizer(prepared)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [25]:
# Spot-check the vectorizer on single-row positional slices.
print(cat_vectorize(prepared[26:27], cat_vectorizer))
# This slice starts past the row count printed by prepare_data, so it is empty.
print(cat_vectorize(prepared[4259450:4259451], cat_vectorizer))

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  1.00218494e+10 0.00000000e+00]]
[]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [26]:
track_ids = prepared["track_id"]

# Number of distinct videos; used as the embedding input size and the target width.
vocab_size = track_ids.nunique()

# Global encoder from track_id to a contiguous integer label; read by
# create_x_video_vectors and create_y.
le = label_encoder(track_ids.values)

print(le.transform(track_ids.values[0:5]))
print(vocab_size)

[1310 1310 1310 1310 1310]
1335


In [27]:
def create_x_video_vectors(x, vocab_size):
    """Encode the ``track_id`` column of ``x`` as int32 labels.

    Uses the module-level encoder ``le``. ``vocab_size`` is accepted but
    not used by this function.
    """
    labels = le.transform(x["track_id"].values)
    return labels.astype(np.int32)

def create_x_num_cut_vectors(x, cat_vectorizer):
    """Build the per-row side-feature matrix for ``x``.

    The ``num_cols`` values come first, followed by the vectorized
    ``cat_cols`` features, concatenated column-wise.

    ``DataFrame.as_matrix()`` is deprecated and removed in current pandas;
    ``.values`` returns the same ndarray.
    """
    vec_x_cat = cat_vectorize(x, cat_vectorizer)
    vec_x_num = x[num_cols].values
    return np.hstack((vec_x_num, vec_x_cat))

def create_x(x, vocab_size):
    """Return ``[video_labels, side_features]`` for the rows of ``x``.

    The side features are built with the module-level ``cat_vectorizer``.
    """
    video_labels = create_x_video_vectors(x, vocab_size).astype(np.int32)
    side_features = create_x_num_cut_vectors(x, cat_vectorizer)
    return [video_labels, side_features]
    
# Smoke test on the first three rows: x_examp[0] holds the encoded video ids,
# x_examp[1] the numeric + categorical feature matrix.
x_examp = create_x(prepared.head(3), vocab_size)    
print(x_examp[0])
print(x_examp[0].shape)
print(x_examp[1])
print(x_examp[1].shape)

# Same check through the two helper functions directly. Indexing [0] / [1]
# picks a single element / row here, so the first printed shape is ().
x_vid_examp = create_x_video_vectors(prepared.head(3), vocab_size)
x_num_cut_examp = create_x_num_cut_vectors(prepared.head(3), cat_vectorizer)
print(x_vid_examp[0])
print(x_vid_examp[0].shape)
print(x_num_cut_examp[1])
print(x_num_cut_examp[1].shape)

[1310 1310 1310]
(3,)
[[9.69393122e-01 1.81162930e-01 0.00000000e+00 ... 0.00000000e+00
  1.00218494e+10 0.00000000e+00]
 [9.69393122e-01 1.81162930e-01 0.00000000e+00 ... 0.00000000e+00
  1.00218494e+10 0.00000000e+00]
 [9.69393122e-01 1.81162930e-01 0.00000000e+00 ... 0.00000000e+00
  1.00218494e+10 0.00000000e+00]]
(3, 1395)
1310
()
[9.69393122e-01 1.81162930e-01 0.00000000e+00 ... 0.00000000e+00
 1.00218494e+10 0.00000000e+00]
(1395,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [28]:
def create_y(y, vocab_size):
    """Build a multi-hot target vector of length ``vocab_size``.

    Every ``track_id`` present in ``y`` (encoded with the module-level
    ``le``) sets its position to 1. An empty ``y`` yields an all-zero
    vector.
    """
    labels = le.transform(y["track_id"].values)
    if len(labels) == 0:
        return np.zeros(vocab_size)
    one_hot_rows = keras.utils.to_categorical(labels, vocab_size)
    return np.max(one_hot_rows, axis = 0)

# Smoke test: multi-hot target built from the first three rows.
print(create_y(prepared.head(3), vocab_size))

[0. 0. 0. ... 0. 0. 0.]


In [29]:
def steps(data, batch_size):
    """Number of batches needed to cover every distinct ``ip`` once.

    This is ``ceil(n_users / batch_size)``. The previous
    ``n // batch_size + 1`` returned one step too many whenever the user
    count was an exact multiple of ``batch_size``.
    """
    n_users = data["ip"].nunique()
    # Integer ceiling division; avoids float rounding for large counts.
    return (n_users + batch_size - 1) // batch_size

In [30]:
def train_generator(seq_by_user, batch_size, x_len = 4):
    """Endlessly yield ``([video_ids, side_features], targets)`` batches.

    ``seq_by_user`` is iterated as ``(key, group)`` pairs. For every group,
    sorted by ``timestamp``, the first ``x_len`` rows become the inputs and
    the following (up to) ``x_len`` rows become the multi-hot target.

    A batch is emitted every ``batch_size`` groups and at the end of each
    full pass, after which iteration restarts from the first group.
    Relies on the module-level ``vocab_size`` and ``cat_vectorizer``.
    """
    total_users = len(seq_by_user)
    users = iter(seq_by_user)
    seen = 0
    vid_batch, num_batch, y_batch = [], [], []

    while True:
        _, history = next(users)
        history = history.sort_values("timestamp")

        inputs = history[0:x_len]
        targets = history[x_len:min(x_len * 2, len(history))]

        x_vid = create_x_video_vectors(inputs, vocab_size)
        x_num = create_x_num_cut_vectors(inputs, cat_vectorizer)
        y = create_y(targets, vocab_size)

        vid_batch.append(x_vid)
        num_batch.append(x_num)
        y_batch.append(y)
        seen += 1

        pass_finished = seen >= total_users

        if seen % batch_size == 0 or pass_finished:
            batch = ([np.array(vid_batch), np.array(num_batch)], np.array(y_batch))
            # Clear the buffers before handing the batch to the consumer.
            vid_batch, num_batch, y_batch = [], [], []
            yield batch

        if pass_finished:
            # Buffers are already empty here; start the next pass.
            seen = 0
            users = iter(seq_by_user)
    
        
# Pull one batch of 20 users to inspect what the generator emits.
# generation[0][0]: encoded video ids, generation[0][1]: side features,
# generation[1]: multi-hot targets.
generation = next(train_generator(extract_sequences_by_user(prepared), 20))
print(generation[0][0][0].shape)
print(generation[0][0][0])
print(generation[0][0][1].shape)
print(generation[0][0][1])
print(generation[0][1][0].shape)
print(generation[0][1][0])
print(generation[0][1][1].shape)
print(generation[0][1][1])
# The next two lines repeat the two prints directly above.
print(generation[0][1][1].shape)
print(generation[0][1][1])
print(generation[1].shape)
print(generation[1])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


(4,)
[1270 1271 1282 1288]
(4,)
[442 478 488 639]
(4, 1395)
[[-6.14869789e-01  2.09921434e-01  0.00000000e+00 ...  0.00000000e+00
   1.05250913e+10  0.00000000e+00]
 [-6.16746878e-01  2.10024143e-01  0.00000000e+00 ...  0.00000000e+00
   1.05250913e+10  0.00000000e+00]
 [-6.10177067e-01  2.10126852e-01  0.00000000e+00 ...  0.00000000e+00
   1.05250913e+10  0.00000000e+00]
 [-4.44993256e-01  2.10229561e-01  0.00000000e+00 ...  0.00000000e+00
   1.05250913e+10  0.00000000e+00]]
(4, 1395)
[[-3.90557682e-01  1.30116587e-01  0.00000000e+00 ...  0.00000000e+00
   1.01170771e+10  0.00000000e+00]
 [-4.52501611e-01  1.55177568e-01  0.00000000e+00 ...  0.00000000e+00
   1.01170771e+10  0.00000000e+00]
 [-4.17775469e-01  1.59696762e-01  0.00000000e+00 ...  0.00000000e+00
   1.01170771e+10  0.00000000e+00]
 [-3.83987872e-01  1.64729500e-01  0.00000000e+00 ...  0.00000000e+00
   1.03968476e+10  0.00000000e+00]]
(4, 1395)
[[-3.90557682e-01  1.30116587e-01  0.00000000e+00 ...  0.00000000e+00
   1.011

In [31]:
def add_train_flag(data):
    """Add a boolean ``train`` column to ``data`` (in place).

    Rows whose ``ip`` is below the 0.8 quantile of ``data["ip"]`` are
    flagged ``True``. The split threshold is printed. Operates on the
    ``data`` argument rather than on the global ``prepared`` frame.
    """
    split_val = data["ip"].quantile(.8)
    print(split_val)
    data["train"] = data["ip"] < split_val

def train_test_split(data):
    """Split ``data`` into ``(train, test)`` frames by the ``train`` flag.

    The flag is (re)computed with ``add_train_flag``, so ``data`` gains a
    ``train`` column as a side effect. All rows of one ``ip`` land on the
    same side because the flag depends only on ``ip``.
    """
    add_train_flag(data)
    is_train = data["train"]
    return (data[is_train], data[~is_train])

In [32]:
# Split by the ip-quantile flag and report the row counts of both parts.
train, test = train_test_split(prepared)
print(len(train))
print(len(test))

3819881376.0
3171018
792757


In [33]:
# Batches per pass over the train / test users at batch size 20.
print(steps(train, 20))
print(steps(test, 20))

14701
4244


In [None]:
# Width of one side-feature row: vectorizer features plus the num_cols columns.
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn
# releases -- confirm against the installed version.
num_cat_vec_len = len(cat_vectorizer.get_feature_names()) + len(num_cols)
print(num_cat_vec_len)

1395


In [None]:
# Input 1: sequences of 4 encoded video ids (4 matches train_generator's default x_len).
vid_input = keras.layers.Input(shape=(4,))
embedding = keras.layers.Embedding(vocab_size, 100, input_length=4)(vid_input)

# Input 2: the side-feature row for each of the 4 sequence positions.
num_cut_input = keras.layers.Input(shape=(4, num_cat_vec_len,))

# Per-step concatenation of side features and the video embedding.
concat = keras.layers.concatenate([num_cut_input, embedding])

# Two stacked GRU layers; CuDNNGRU requires a CUDA-capable GPU.
gru_1 = keras.layers.CuDNNGRU(128, return_sequences=True)(concat)
gru_1_act = keras.layers.Activation("relu")(gru_1)
gru_2 = keras.layers.CuDNNGRU(256)(gru_1_act)
gru_2_act = keras.layers.Activation("relu")(gru_2)

dense_1 = keras.layers.Dense(512, activation = "relu")(gru_2_act)

drop = keras.layers.Dropout(0.5)(dense_1)

# One independent sigmoid per video, matching the multi-hot targets from create_y.
dense_2 = keras.layers.Dense(vocab_size, activation = "sigmoid")(drop)

model = keras.models.Model(inputs=[vid_input, num_cut_input], outputs=dense_2)

# NOTE(review): top_k_categorical_accuracy is a single-label metric, while the
# targets built by create_y can have several positives -- confirm it is the
# intended measure.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['top_k_categorical_accuracy'])

model.summary()

# Train with batches of 50 users, validate with batches of 20; both generators loop forever.
model.fit_generator(train_generator(extract_sequences_by_user(train), batch_size=50), steps_per_epoch=steps(train, 50), validation_data = train_generator(extract_sequences_by_user(test), batch_size = 20), validation_steps=steps(test, 20), epochs=3, verbose=1)



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 4)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4, 1395)      0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 4, 100)       133500      input_1[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 4, 1495)      0           input_2[0][0]                    
                                                                 embedding_1[0][0]                
__________

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


  65/5881 [..............................] - ETA: 19:04:30 - loss: 0.0992 - top_k_categorical_accuracy: 0.0889

In [207]:
# Take one validation batch and compare targets with predictions.
# train_generator has no default for batch_size, so it must be passed
# explicitly; 20 matches the batch size used for validation during training.
val_batch = next(train_generator(extract_sequences_by_user(test), batch_size=20))
y_true = val_batch[1]
y_predicted = model.predict(val_batch[0])

print(y_true.shape)
print(y_predicted.shape)

(20, 955)
(20, 955)


In [217]:
# Round the sigmoid outputs to 0/1 (overwrites y_predicted from the previous cell).
y_predicted = np.rint(y_predicted)

# np.transpose(np.nonzero(a)) lists (row, column) pairs of the non-zero entries.
# NOTE(review): the first line keeps the column part ([:, 1:]) while the second
# keeps the row part ([:, :1]); the two results are therefore not comparable --
# confirm whether both were meant to use [:, 1:].
true_y_values = np.transpose(np.nonzero(y_true))[:, 1:]
pred_y_values = np.transpose(np.nonzero(y_predicted))[:, :1]



# y_true = np.array(np.nonzero(y_true)).flatten()
# y_predicted = np.array(np.nonzero(y_predicted)).flatten()

# print(y_true.shape)
# print(y_predicted.shape)

# def inverse(x):
#     if x > 0 : 
#         return le.inverse_transform(x) 
#     else :
#         return 0
        
# inverse = np.vectorize(inverse)
# y_true = inverse(y_true)
# y_predicted = inverse(y_predicted) if len (y_predicted) > 0 else y_predicted

# print(y_true)
# print(y_predicted)

[[  0 139]
 [  0 887]
 [  0 944]
 [  0 950]
 [  1 332]
 [  1 628]
 [  1 647]
 [  1 887]
 [  2 173]
 [  2 177]
 [  2 196]
 [  2 839]
 [  3 539]
 [  3 887]
 [  4 170]
 [  4 391]
 [  4 680]
 [  4 753]
 [  5 154]
 [  5 702]
 [  5 929]
 [  5 933]
 [  6 139]
 [  6 721]
 [  6 891]
 [  6 944]
 [  7 901]
 [  7 906]
 [  7 933]
 [  7 944]
 [  8 534]
 [  8 676]
 [  8 887]
 [  9 742]
 [  9 743]
 [  9 842]
 [  9 887]
 [ 10  48]
 [ 10 515]
 [ 10 671]
 [ 10 944]
 [ 11 742]
 [ 11 941]
 [ 11 944]
 [ 12 894]
 [ 12 933]
 [ 12 944]
 [ 12 950]
 [ 13 887]
 [ 13 906]
 [ 14 327]
 [ 14 760]
 [ 14 887]
 [ 15 887]
 [ 15 908]
 [ 16 702]
 [ 16 887]
 [ 17 552]
 [ 17 887]
 [ 17 894]
 [ 17 944]
 [ 18 173]
 [ 18 944]
 [ 19 596]
 [ 19 764]
 [ 19 887]
 [ 19 894]]
[[  0 130]
 [  0 150]
 [  0 348]
 ...
 [ 16 949]
 [ 16 950]
 [ 16 952]]


In [None]:
def get_next_video(track_id, video_catalog):
    """Find the episode that follows ``track_id`` within the same title.

    ``season`` and ``episode`` of ``video_catalog`` are coerced to numeric
    in place (unparseable values become NaN) before searching.

    Lookup order:
      1. the lowest later episode of the same title and season;
      2. otherwise the lowest episode of the *nearest* later season of the
         same title (the previous code took the lowest episode number
         across all later seasons, which could skip a season).

    Returns:
        * a Series (one catalog row) for case 1;
        * a DataFrame with the matching row(s) for case 2 (kept as a
          DataFrame for backward compatibility);
        * ``False`` when ``track_id`` is unknown or nothing follows it.
    """
    video_catalog['season'] = pd.to_numeric(video_catalog['season'], errors='coerce', downcast = 'integer')
    video_catalog['episode'] = pd.to_numeric(video_catalog['episode'], errors='coerce', downcast = 'integer')

    current_video = video_catalog.loc[video_catalog['track_id'] == track_id]
    if current_video.empty:
        return False

    # Use the first match if the catalog lists the track more than once.
    current_title = current_video['title'].iloc[0]
    current_season = current_video['season'].iloc[0]
    current_episode = current_video['episode'].iloc[0]

    same_title = video_catalog['title'] == current_title

    later_episodes = video_catalog.loc[
        same_title
        & (video_catalog['season'] == current_season)
        & (video_catalog['episode'] > current_episode)
    ]
    if not later_episodes.empty:
        # .loc replaces the removed .ix; idxmin gives the first row label
        # carrying the smallest episode number.
        return video_catalog.loc[later_episodes['episode'].idxmin()]

    later_seasons = video_catalog.loc[same_title & (video_catalog['season'] > current_season)]
    if later_seasons.empty:
        return False

    # Restrict to the nearest later season before picking its first episode.
    next_season = later_seasons.loc[later_seasons['season'] == later_seasons['season'].min()]
    return next_season.loc[next_season['episode'] == next_season['episode'].min()]