In [1]:
%matplotlib inline
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn import linear_model

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgbm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import pickle
import os

  import pandas.util.testing as tm


In [2]:
# Load the raw video table; the path is relative to the notebook's directory.
vids = pd.read_csv('../vids.csv')

# Columns that are removed before any feature engineering.
unused_columns = [
    'video_id', 'channel_title', 'trending_date', 'publish_time', 'comments_disabled',
    'comment_count', 'likes', 'dislikes', 'video_error_or_removed', 'thumbnail_link',
    'ratings_disabled',
]

# Keep only rows that have a category id, then drop the unused columns.
# `columns=` replaces the positional axis argument, which pandas 2.0+ no longer accepts.
X = vids[vids.category_id.notna()].drop(columns=unused_columns)

# NOTE(review): Y is taken from the unfiltered frame, so it is not row-aligned with X.
# The split cell below reads the target from X's own `views` column instead.
Y = vids.views

**Dummy encode categories**

In [3]:
# One-hot encode the category id; the indicator columns are named `category_id_<value>`.
encoded_cateogries = pd.get_dummies(X.category_id, prefix="category_id")

# Swap the raw id column for its indicator columns in a single concat.
# Both frames carry X's index, so rows stay aligned and the new columns are
# appended at the end in the order get_dummies produced them.
X = pd.concat([X.drop(columns=['category_id']), encoded_cateogries], axis=1)

**Encode words with w2v model**

In [4]:
def _tokenize(text):
    """Tokenize one text field with gensim's ``simple_preprocess``.

    Returns ``[""]`` as a placeholder when the value cannot be tokenized
    (for example a missing cell), so every row still gets exactly one entry.
    """
    try:
        return simple_preprocess(text)
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt and
        # SystemExit still stop the cell instead of being silently swallowed.
        return [""]


processed_titles = []
processed_descriptions = []
processed_tags = []

# Walk the three text columns in lockstep; this yields the same values as
# positional .iloc[i] access without a per-row lookup.
text_rows = zip(X.title, X.description, X.tags)
for title, description, tags in tqdm(text_rows, total=len(X)):
    processed_titles.append(_tokenize(title))
    processed_descriptions.append(_tokenize(description))
    processed_tags.append(_tokenize(tags))

HBox(children=(FloatProgress(value=0.0, max=41037.0), HTML(value='')))




In [5]:
# Pre-trained Word2Vec model stored alongside the other model artefacts.
w2v_model = Word2Vec.load(os.path.join('..', 'mdl', 'word2vec.model'))


def _mean_word_vector(tokens, keyed_vectors, size):
    """Average the embeddings of the tokens known to ``keyed_vectors``.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single text field.
    keyed_vectors
        The model's ``wv`` object; it is used for membership tests and
        vector lookup.
    size : int
        Length of the zero vector returned when no token is in the vocabulary.

    Returns
    -------
    numpy.ndarray
        Mean vector of the in-vocabulary tokens, or ``np.zeros(size)`` if
        there are none.
    """
    # `word in keyed_vectors` is the vocabulary membership test; unlike the
    # `.vocab` attribute it is available in both gensim 3.x and 4.x.
    known = [word for word in tokens if word in keyed_vectors]
    if known:
        return np.mean(keyed_vectors[known], axis=0)
    return np.zeros(size)


all_one_feature = []
dscription_feature = []
title_feature = []
tags_feature = []

token_rows = zip(processed_titles, processed_descriptions, processed_tags)
for title_tokens, description_tokens, tags_tokens in tqdm(token_rows, total=len(X)):
    title_vec = _mean_word_vector(title_tokens, w2v_model.wv, w2v_model.vector_size)
    description_vec = _mean_word_vector(description_tokens, w2v_model.wv, w2v_model.vector_size)
    tags_vec = _mean_word_vector(tags_tokens, w2v_model.wv, w2v_model.vector_size)

    # Combined representation: unweighted mean of the three per-field vectors.
    all_one_feature.append(np.mean([title_vec, description_vec, tags_vec], axis=0))
    dscription_feature.append(description_vec)
    title_feature.append(title_vec)
    tags_feature.append(tags_vec)

# Stack the per-row vectors into 2-D arrays (one row per video).
all_one_feature = np.array(all_one_feature)
dscription_feature = np.array(dscription_feature)
title_feature = np.array(title_feature)
tags_feature = np.array(tags_feature)

HBox(children=(FloatProgress(value=0.0, max=41037.0), HTML(value='')))




In [6]:
# The raw text columns have been replaced by their embeddings; drop them.
# `columns=` replaces the positional axis argument, which pandas 2.0+ no longer accepts.
X = X.drop(columns=['title', 'tags', 'description'])

# Collect every embedding dimension first and attach them in one concat.
# Inserting hundreds of columns one at a time fragments the frame.
# The insertion order below (combined, description, title, tags per dimension)
# determines the column order of the resulting frame.
embedding_columns = {}
for i in tqdm(range(w2v_model.vector_size)):
    embedding_columns[f'word_encodings{i}'] = all_one_feature[:, i]
    embedding_columns[f'description{i}'] = dscription_feature[:, i]
    embedding_columns[f'title{i}'] = title_feature[:, i]
    embedding_columns[f'tags{i}'] = tags_feature[:, i]

# Reuse X's index so the positional feature rows line up with X's rows.
X = pd.concat([X, pd.DataFrame(embedding_columns, index=X.index)], axis=1)

# Discard any row that still contains a missing value.
X = X.dropna()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




**Split data**

In [7]:
# 80/20 split. The fixed random_state makes the partition identical on every
# run, so the stored CSVs and the trained models can be reproduced.
train, test = train_test_split(X, test_size=0.2, random_state=42)

# `views` is the regression target; everything else is a feature.
train_y = train.views
train_x = train.drop(columns='views')
test_y = test.views
test_x = test.drop(columns='views')

In [8]:
# "all" variant: keep only the combined `word_encodings*` embedding, i.e. drop
# the per-field title/tags/description embeddings.
to_drop = [
    f'{field}{i}'
    for i in range(w2v_model.vector_size)
    for field in ('title', 'tags', 'description')
]
train_x_all = train_x.drop(columns=to_drop)
test_x_all = test_x.drop(columns=to_drop)

# "sep" variant: keep the per-field embeddings and drop the combined one.
to_drop = [f'word_encodings{i}' for i in range(w2v_model.vector_size)]
train_x_sep = train_x.drop(columns=to_drop)
test_x_sep = test_x.drop(columns=to_drop)

**Store Train/Test Split**

The split is written to disk so that later analyses can reuse exactly the same train/test partition.

In [9]:
# Create the output directory if needed; exist_ok avoids the separate
# exists-then-create check.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)

# File stem -> object to persist. Each entry is written to <data_dir>/<stem>.csv
# without the index column.
splits_to_store = {
    'train_x_all': train_x_all,
    'train_x_sep': train_x_sep,
    'test_x_all': test_x_all,
    'test_x_sep': test_x_sep,
    'train_y': train_y,
    'test_y': test_y,
}
for stem, split in splits_to_store.items():
    split.to_csv(os.path.join(data_dir, f'{stem}.csv'), index=False)

**Train**

In [10]:
# One estimator per (algorithm, feature set) pair:
#   *_all -> trained on the combined `word_encodings*` features
#   *_sep -> trained on the separate title/tags/description features
# The stochastic estimators share a fixed seed so results are repeatable.
MODEL_SEED = 42

lr_all = LinearRegression()
lr_sep = LinearRegression()

svr_all = LinearSVR(C=1.0, epsilon=0.2, random_state=MODEL_SEED)
svr_sep = LinearSVR(C=1.0, epsilon=0.2, random_state=MODEL_SEED)

rfr_all = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=MODEL_SEED)
rfr_sep = RandomForestRegressor(n_estimators=20, max_depth=3, random_state=MODEL_SEED)

xgb_all = XGBRegressor(n_estimators=300, random_state=MODEL_SEED)
xgb_sep = XGBRegressor(n_estimators=300, random_state=MODEL_SEED)

gbm_all = lgbm.LGBMRegressor(random_state=MODEL_SEED)
gbm_sep = lgbm.LGBMRegressor(random_state=MODEL_SEED)

In [11]:
# (progress message, estimator, training features), in the order they are fitted.
training_plan = [
    ('training lr_all', lr_all, train_x_all),
    ('training lr_sep', lr_sep, train_x_sep),
    ('training svr_all', svr_all, train_x_all),
    ('training svr_sep', svr_sep, train_x_sep),
    ('training rfr_all', rfr_all, train_x_all),
    ('training rfr_sep', rfr_sep, train_x_sep),
    ('training xgb_all', xgb_all, train_x_all),
    ('training xgb_sep', xgb_sep, train_x_sep),
    ('training lgbm_all', gbm_all, train_x_all),
    ('training lgbm_sep', gbm_sep, train_x_sep),
]

# Every estimator is fitted against the same target, train_y.
for message, estimator, features in training_plan:
    print(message)
    estimator.fit(features, train_y)

# Bare last expression so the cell still displays the final fitted estimator.
gbm_sep

training lr_all
training lr_sep
training svr_all
training svr_sep
training rfr_all
training rfr_sep
training xgb_all
training xgb_sep
training lgbm_all
training lgbm_sep


LGBMRegressor()

**Save models**

In [12]:
mdl_path = os.path.join('..', 'mdl')
pickle.dump(lr_all, open(os.path.join(mdl_path, 'lr_all.pkl'), 'wb'))
pickle.dump(lr_sep, open(os.path.join(mdl_path, 'lr_sep.pkl'), 'wb'))
pickle.dump(svr_all, open(os.path.join(mdl_path, 'svr_all.pkl'), 'wb'))
pickle.dump(svr_sep, open(os.path.join(mdl_path, 'svr_sep.pkl'), 'wb'))
pickle.dump(rfr_all, open(os.path.join(mdl_path, 'rfr_all.pkl'), 'wb'))
pickle.dump(rfr_sep, open(os.path.join(mdl_path, 'rfr_sep.pkl'), 'wb'))
pickle.dump(xgb_all, open(os.path.join(mdl_path, 'xgb_all.pkl'), 'wb'))
pickle.dump(xgb_sep, open(os.path.join(mdl_path, 'xgb_sep.pkl'), 'wb'))
pickle.dump(gbm_all, open(os.path.join(mdl_path, 'gbm_all.pkl'), 'wb'))
pickle.dump(gbm_sep, open(os.path.join(mdl_path, 'gbm_sep.pkl'), 'wb'))