In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ipaddress
from datetime import datetime
import time
import keras
import re
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer as DV

In [None]:
data = pd.read_csv("./data_full.csv", sep = ";", error_bad_lines=False)

In [103]:
catalog = pd.read_csv("./catalog.csv", sep = ";", error_bad_lines=False)

In [4]:
data.head()

Unnamed: 0,timestamp,ip,track_id,cookie,live,user_id,referer,uagent,rightholder,author_id
0,1517778000,230.59.74.120,10912114,42536dc7a8578b0cfac05f704977429a,0,,https%3A%2F%2Frutube.ru%2Fvideo%2F0f5c9c5839b1...,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...,2,1480930
1,1517778000,218.222.225.36,10885813,8f92f3f2b1e7a2498761b8cb3b1d03c3,0,,https%3A%2F%2Frutube.ru%2Fvideo%2Fd8068436dcf5...,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2,301323
2,1517778000,55.63.241.224,6948236,336d5ebc5436534e61d16e63ddfca327,-,2231332000.0,http%3A%2F%2Fandroid.rutube.ru%2F,okhttp/2.6.0,-,-
3,1517778000,230.34.98.99,8481402,cf8dd809edf104873a1c57921b34f25e,0,,https%3A%2F%2Frutube.ru%2Fvideo%2F1058cf249e46...,Mozilla/5.0 (Linux; Android 7.1.2; Redmi 4X Bu...,,721346
4,1517778000,232.3.41.226,10908060,4ca525a1d60de6d74e3c32529fe47ab5,0,,https%3A%2F%2Fyastatic.net%2Fvideo-player%2F0x...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,135,599848


In [5]:
catalog.head()

Unnamed: 0,idec,track_id,video_id,series_id,title,season,episode,episode_global,date_efir,duration
0,PR592505,10258734,cd59084eb50e6ebe9fe20ad04e0c17db,10456850000.0,"\tБыть или не быть, 1 сезон, 1 серия",1.0,1.0,1.0,02.04.2017 22:00:34,
1,PR609630,10843255,a5d2e6bae4752d9ec0fae9b53f00700d,10592470000.0,,11.0,1.0,1.0,,
2,PR597128,10573153,bcd45606c1f7cbb5cb9d1dbd7d346cc3,10479560000.0,,7.0,1.0,1.0,06.09.2017 22:46:05,
3,PR257331,6617650,609217ac7b8c4e55d44c4a29bf52554b,,"""Интерны. История болезни""",,,,12.06.2012 22:00:44,
4,PR529698,7991493,bde3620ba038b3e8dd41d03bf516a7eb,10241020000.0,#ЖАННАПОЖЕНИ,1.0,1.0,101.0,05.09.2015 13:30:00,


In [6]:
def add_device(df):
    regexpDesktop = re.compile('(Windows|Linux|Mozilla)')
    regexpMobile = re.compile('(iPhone|RutubeAndroid|okhttp|CFNetwork|UCBrowser)')
    regexpTablet = re.compile('(iPad)')
    
    df['device'] = pd.Series('0', index=df.index)
    
    for i, row in df.iterrows():
        try:
            if regexpDesktop.search(df['uagent'].iloc[i]):
                df['device'][i] = 'desktop'
            elif regexpMobile.search(df['uagent'].iloc[i]):
                df['device'][i] = 'mobile'
            elif regexpTablet.search(df['uagent'].iloc[i]):
                df['device'][i] = 'mobile'
        except:
            pass
            
    return df

In [7]:
def mergeData(data1, data2, field):
    data1[field] = pd.to_numeric(data1[field], errors = 'coerce')
    data2[field] = pd.to_numeric(data2[field], errors = 'coerce')
    return pd.merge(data1, data2, left_on=field, right_on=field)

In [8]:
def convIpToNumber(data):
    return data.apply(lambda x: int(ipaddress.IPv4Address(x)))

In [9]:
def applyDate(x):
    try:
        data = time.strptime(x, "%d.%m.%Y %H:%M:%S")
        return int(time.mktime(data))
    except:
        return 1e-6

In [10]:
def confDateStringToTimestamp(data):
    return data.apply(lambda x: applyDate(x))

In [11]:
def filter_users_with_n_views(data, user_column = "ip", seq_len = 5):
    by_user = data.groupby(user_column)
    by_user = by_user.filter(lambda x: len(x) >= seq_len)
    return by_user

def extract_sequences_by_user(data, user_column = "ip"):
    return data.groupby(user_column)

In [12]:
def filter_videos_with_n_views(data, n = 1000):
    data["track_id"] = pd.to_numeric(data["track_id"], errors = 'coerce', downcast = 'integer')
    data = data[data["track_id"].notnull()]
    data_top_n = data.groupby("track_id").filter(lambda x: len(x) >= n)
    return data_top_n

In [16]:
cat_cols = [ 'series_id', 'idec', 'author_id']
num_cols = [ 'duration', 'date_efir']

def label_encoder(data):
    le = preprocessing.LabelEncoder()
    le.fit(data)
    return le

def create_cat_vectorizer(data):
    
    cat = data[cat_cols]
    cat.fillna( 'NA', inplace = True )
    x_cat = cat.to_dict(orient = 'records')
    
    vectorizer = DV( sparse = False )
    vectorizer.fit(x_cat)
    
    return vectorizer

def cat_vectorize(data, vectorizer):
    
    cat = data[cat_cols]
    cat.fillna( 'NA', inplace = True )
    x_cat = cat.to_dict(orient = 'records')
    
    return vectorizer.transform(x_cat)
    

In [17]:
def replace_null_value(x):
    if (x == 'null'):
        return 0
    else:
        return x

def data_scale(data):
    scaler = StandardScaler()
    data.fillna(0, inplace = True )
    data = data.apply(lambda x: replace_null_value(x))
    print(data)
    data = scaler.fit_transform(data.reshape(-1, 1))
    return data

In [18]:
def data_to_num_and_cat(data):
    data['date_efir'] = data_scale(data['date_efir'])
    data['duration'] = data_scale(data['duration'])
    return data

In [19]:
def prepare_data(data, catalog):
    
    data['ip'] = convIpToNumber(data['ip'])
    
    data = filter_videos_with_n_views(data)
    print(len(data))
    
    data = filter_users_with_n_views(data)
    print(len(data))
    
    return data

prepared = prepare_data(data, catalog)

7142931
4344933


In [20]:
prepared.head()

Unnamed: 0,timestamp,ip,track_id,cookie,live,user_id,referer,uagent,rightholder,author_id
1,1517778000,3672039716,10885813.0,8f92f3f2b1e7a2498761b8cb3b1d03c3,0,,https%3A%2F%2Frutube.ru%2Fvideo%2Fd8068436dcf5...,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2,301323
4,1517778000,3892521442,10908060.0,4ca525a1d60de6d74e3c32529fe47ab5,0,,https%3A%2F%2Fyastatic.net%2Fvideo-player%2F0x...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,135,599848
5,1517778000,929300702,10912230.0,0cc973c79fe7540052da1bed3c83b55b,0,,https%3A%2F%2Frutube.ru%2Fvideo%2Fcb9d5108c0ee...,Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X)...,2,1480930
6,1517778000,165491643,10912230.0,40212c8fe37bae13f80ffb78c0560812,0,,http%3A%2F%2Fvideo-dom2.ru%2Fonlinetv%2Fdom2_o...,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,2,1480930
12,1517778000,938789414,10906545.0,56cdf26f4ab3a341c15fd390f79ba5a0,0,,https%3A%2F%2Frutube.ru%2Fvideo%2Fecaa72f910b0...,Mozilla/5.0 (Linux; Android 6.0.1; D6503 Build...,2,1480930


In [None]:
cat_vectorizer = create_cat_vectorizer(prepared)

In [None]:
print(cat_vectorize(prepared[26:27], cat_vectorizer))
print(cat_vectorize(prepared[4259450:4259451], cat_vectorizer))

In [23]:
track_ids = prepared["track_id"]

vocab_size = track_ids.nunique()

le = label_encoder(track_ids.values)

print(le.transform(track_ids.values[0:5]))
print(vocab_size)

[690 804 860 860 784]
955


In [None]:
def create_x_video_vectors(x, vocab_size):
    x = x["track_id"].values
    x = le.transform(x)
    x = x.astype(np.int32)
    return x.astype(np.int32)

def create_x_num_cut_vectors(x):
    vec_x_cat = cat_vectorize(x, cat_vectorizer)
    vec_x_num = x[num_cols].as_matrix()
    return np.hstack((vec_x_num, vec_x_cat))

def create_x(x, vocab_size):
    return create_x_video_vectors(x, vocab_size).astype(np.int32)

print(create_x_video_vectors(prepared.head(5), vocab_size))
print(create_x_num_cut_vectors(prepared.head(5)))
print(create_x_video_vectors(prepared.head(5), vocab_size).shape)
print(create_x_num_cut_vectors(prepared.head(5)).shape)

In [None]:
def create_y(y, vocab_size):
    y = y["track_id"].values
    y = le.transform(y)
    if (len(y) == 0):
        return np.zeros(vocab_size)
    return np.max(keras.utils.to_categorical(y, vocab_size), axis = 0)

print(create_y(prepared.head(3), vocab_size))

In [26]:
def steps(data, batch_size = 20):
    return data["ip"].nunique() / batch_size

In [27]:
def train_generator(seq_by_user, x_len = 4, batch_size = 20):
    
    step = 0
    
    seq_len = len(seq_by_user)
    
    x_list = []
    y_list = []
    
    iterator = iter(seq_by_user)
    
    while(True):
        ip, group = next(iterator)
        
        group = group.sort_values("timestamp")
        x = create_x(group[0:x_len], vocab_size)
        y = create_y(group[x_len:min(x_len + 4, len(group))], vocab_size)
        
        x_list.append(x)
        y_list.append(y)
        
        step = step + 1
        
        if (step % batch_size == 0 or step >= seq_len):
            batch = (np.array(x_list), np.array(y_list))
            x_list = []
            y_list = []
            yield (batch)
            
        if (step >= seq_len):
            iterator = iter(seq_by_user)
            step = 0
        
generation = next(train_generator(extract_sequences_by_user(prepared)))
print(generation[0].shape)
print(generation[1].shape)
print(generation[0])
print(generation[1])

(20, 4)
(20, 955)
[[752 753 766 781]
 [348 348 726 825]
 [869 869 513 513]
 [888 897 832 810]
 [886 913 937 933]
 [781 844 781 781]
 [783 741 751 348]
 [879 880 913 933]
 [369 370 372 243]
 [708 703 702 698]
 [860 880 860 906]
 [691 693 697 697]
 [348 348 348 348]
 [860 879 906 913]
 [168 170 170 173]
 [394 321 326 326]
 [858 860 860 913]
 [696 701 707 751]
 [659 659 667 672]
 [521 523 542 545]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [29]:
def add_train_flag(data):
    split_val = prepared["ip"].quantile(.8)
    print(split_val)
    prepared["train"] = prepared["ip"] < split_val
    
def train_test_split(data):
    add_train_flag(data)
    return (prepared[prepared["train"] == True], prepared[prepared["train"] == False])

In [30]:
train, test = train_test_split(prepared)
print(len(train))
print(len(test))

3819871939.0
3475943
868990


In [31]:
print(steps(train))
print(steps(test))

15714.8
4575.6


In [32]:
emb_input = keras.layers.Input(shape=(4,))

embedding = keras.layers.Embedding(vocab_size, 100, input_length=4)(emb_input)

gru_1 = keras.layers.GRU(128, return_sequences=True)(embedding)
gru_2 = keras.layers.GRU(128)(gru_1)

dense_1 = keras.layers.Dense(512, activation = "relu")(gru_2)
drop = keras.layers.Dropout(0.5)(dense_1)
dense_4 = keras.layers.Dense(vocab_size, activation = "tanh")(drop)

model = keras.models.Model(inputs=emb_input, outputs=dense_4)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['top_k_categorical_accuracy'])

model.summary()

model.fit_generator(train_generator(extract_sequences_by_user(train), batch_size=100), steps_per_epoch=steps(train, 100), validation_data = train_generator(extract_sequences_by_user(test)), validation_steps=steps(test), epochs=1, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 4, 100)            95500     
_________________________________________________________________
gru_1 (GRU)                  (None, 4, 128)            87936     
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               98688     
_________________________________________________________________
dense_1 (Dense)              (None, 512)               66048     
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 955)               489915    
Total para

<keras.callbacks.History at 0x28e23f9fc50>

In [None]:
tg = train_generator(extract_sequences_by_user(test), batch_size = 1)

def predict_next():
    val_batch = next(tg)
    y_predicted = model.predict(val_batch[0])
    x_true = val_batch[0]
    y_true = val_batch[1]
    return (le.inverse_transform(x_true[-1])[-1], y_predicted, y_true)

x_true, y_predicted, y_true = predict_next()

print(x_true)
print(y_true.shape)
print(y_predicted.shape)

In [87]:
def decode_y(y_predicted, y_true):
    y_predicted = np.rint(y_predicted)

    true_y_values = np.transpose(np.nonzero(y_true))[:, 1:]
    pred_y_values = np.transpose(np.nonzero(y_predicted))[:, :1]

    true_y_track_values = le.inverse_transform(true_y_values)
    pred_y_track_values = le.inverse_transform(pred_y_values)
    return (pred_y_track_values, true_y_track_values)

In [None]:
y_predicted = np.rint(y_predicted)

true_y_values = np.transpose(np.nonzero(y_true))[:, 1:]
pred_y_values = np.transpose(np.nonzero(y_predicted))[:, :1]

true_y_track_values = le.inverse_transform(true_y_values)
pred_y_track_values = le.inverse_transform(pred_y_values)

In [48]:
def get_next_video(video_id, video_catalog):
    video_catalog['season'] = pd.to_numeric(video_catalog['season'], errors='coerce', downcast = 'integer')
    video_catalog['episode'] = pd.to_numeric(video_catalog['episode'], errors='coerce', downcast = 'integer')

    curent_video = video_catalog.loc[video_catalog['video_id'] == video_id]

    if curent_video.size == 0:
        return False

    curent_video_index = curent_video.index[0]
    
    curent_title = curent_video['title'][[curent_video_index][0]]
    curent_season = curent_video['season'][[curent_video_index][0]]
    curent_episod = curent_video['episode'][[curent_video_index][0]]
    
    this_show_season = video_catalog.loc[(video_catalog['season'] == curent_season) & (video_catalog.title == curent_title)]
    next_episods = this_show_season.loc[this_show_season['episode'] > curent_episod]

    if next_episods.episode.size != 0:
        next_min_episode = next_episods.episode.min()
        next_min_episode_index = next_episods.loc[next_episods.episode == next_min_episode].index[0]
        next = video_catalog.ix[next_min_episode_index]
    else:
        next_seasons = video_catalog.loc[(video_catalog['season'] > curent_season) & (video_catalog.title == curent_title)]
        if next_seasons.season.size != 0:
            next_min_season_min_episode = next_seasons.episode.min()
            next = next_seasons.loc[next_seasons['episode'] == next_min_season_min_episode]
        else:
            next = False

    return next

In [37]:
def get_top_popular_video(video_history):
    return video_history.groupby('track_id').apply(lambda x: x.count()).sort_values(by=['ip'], ascending=False)

In [38]:
def find_video_id_by_track_id(id): 
    id = int(id)
    video_id = catalog.loc[catalog['track_id'] == id]['video_id']
    return video_id.values[0]

def find_one_video_id_by_track_id(id): 
    id = int(id)
    video_id = catalog.loc[catalog['track_id'] == id]['video_id']
    return video_id.values

In [112]:
def get_videos(data):
    video_array = []
    
    for d in data:
        try:
            d = find_video_id_by_track_id(int(d))
            video_array.append(d)
        except:
            pass
        
    return video_array

In [113]:
def data_iretation(count, array):
    video_array = []
    
    for idx, data in enumerate(array):
        if (idx < count):
            video_array.append(data) 
    
    return video_array

In [None]:
top_videos = get_top_popular_video(data)

In [114]:
def find_recommend_videos(id, y_pred):

    video_array = []
    
    top_videos_first_five = top_videos['track_id'].index[:5]
    
    next_video = get_next_video(id, catalog)    
        
    nn_recommend_videos = y_pred
    
    if (next_video):
        video_array.append(next_video)

    for idx, data in enumerate(nn_recommend_videos):
        if (idx <= 2):
            video_array.append(data) 
    
    for idx, data in enumerate(top_videos_first_five):
        if (len(video_array) <= 4):
            video_array.append(data)   
    
        
    videos = get_videos(video_array)
    
    return videos

In [136]:
x_true, y_predicted, _ = predict_next()

pred_y_track_values, _ = decode_y(y_predicted, y_true)

pred = true_y_track_values[0] if len(pred_y_track_values) else []

test_id = int(x_true)

print ('Было video_id: ', find_one_video_id_by_track_id(test_id))

result = find_recommend_videos(test_id, pred)

print ('Нашли результат: ', result)

Было video_id:  ['67ca2c5cccd0090948029aa9ee19f581']
Нашли результат:  ['2e0993d0d71e107f7aa6de7603b72c93', 'e083cdcb311c893de6a9d2b252a5014c', 'b645131018a0ce65ab748117e2514fa8', '45a6428b27eb6a09b9e0cc0af5247c3b']


  if diff:
  if diff:
  if diff:
