# Views/likes/dislikes prediction

In [1]:
import pandas as pd
import numpy as np
import scipy
import sklearn
import langdetect

## Preprocessing

### Load data

In [2]:
us_data = pd.read_csv('data/USvideos.csv')
gb_data = pd.read_csv('data/GBvideos.csv')
ca_data = pd.read_csv('data/CAvideos.csv')

df = pd.concat([us_data, gb_data, ca_data])

### Language filter

Warning: slow

In [3]:
def detect_lang(row):
    t = '\n'.join([str(row['title']), str(row['description'])])
    try:
        return langdetect.detect(t)
    except:
        return 'err'

df['lang'] = df.apply(detect_lang, axis=1)

In [4]:
df = df[df['lang'] == 'en']

### Preprocess

In [5]:
df.description.fillna('', inplace=True)
df.title.fillna('', inplace=True)
df.tags.fillna('', inplace=True)
df['alltext'] = df['title'] + ' ' + df['description'] + ' ' + df['tags']

In [6]:
df['channel'] = df['channel_title'].astype('category')
df['channel'] = df['channel'].cat.codes

### Train / dev / test split

In [7]:
dfs = df.sample(frac=1)
X = dfs[['alltext', 'channel', 'category_id']]
Y = dfs[['views', 'likes', 'dislikes']]

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_test, Y_test, test_size=0.5, random_state=1)

In [9]:
print(X_train.shape)
print(X_test.shape)
print(X_dev.shape)
print(Y_train.shape)
print(Y_test.shape)
print(Y_dev.shape)

(65564, 3)
(8196, 3)
(8196, 3)
(65564, 3)
(8196, 3)
(8196, 3)


## Evaluation

In [10]:
from sklearn.metrics import mean_squared_error

def eval_metric(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def train_eval_model(model, features, target):
    assert target in ['views', 'likes', 'dislikes']
    model.fit(X_train[features].values, Y_train[target])
    return (eval_metric(Y_train[target], model.predict(X_train[features].values)),
            eval_metric(Y_dev[target], model.predict(X_dev[features].values)))

## Baselines

In [11]:
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm

In [12]:
def select_model(models, names, features):
    best = {
        'views': None,
        'likes': None,
        'dislikes': None,
    }
    for target in ['views', 'likes', 'dislikes']:
        print("Selecting model for ", target)
        for model, name in zip(sklearn.base.clone(models), names):
            print("Training ", name)
            train_err, dev_err = train_eval_model(model, features, target)
            print("Train err: %f, Dev err: %f" % (train_err, dev_err))
            if best[target] is None or best[target]['dev'] > dev_err:
                best[target] = {
                    'model': model,
                    'name': name,
                    'dev': dev_err,
                    'train': train_err,
                }
    print("Views model: %s, train: %f, dev: %f" %
              (best['views']['name'], best['views']['train'], best['views']['dev']))
    print("Likes model: %s, train: %f, dev: %f" %
              (best['likes']['name'], best['likes']['train'], best['likes']['dev']))
    print("Dislikes model: %s, train: %f, dev: %f" %
              (best['dislikes']['name'], best['dislikes']['train'], best['dislikes']['dev']))
    return best['views']['model'], best['likes']['model'], best['dislikes']['model']

In [13]:
models, names = zip(*[
    (RandomForestRegressor(n_estimators=10), "Random Forest"),
    (RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1.0, 10]), "RidgeCV"),
])

### Categorical

#### Channel

In [14]:
chan_views, chan_likes, chan_dislikes = select_model(models, names, ['channel'])

Selecting model for  views
Training  Random Forest
Train err: 5182425.779468, Dev err: 6093255.321696
Training  RidgeCV
Train err: 8528297.649530, Dev err: 9938056.005447
Selecting model for  likes
Training  Random Forest
Train err: 103136.809334, Dev err: 116303.196309
Training  RidgeCV
Train err: 198230.436906, Dev err: 225573.735759
Selecting model for  dislikes
Training  Random Forest
Train err: 20122.750107, Dev err: 21383.727413
Training  RidgeCV
Train err: 32807.857864, Dev err: 32753.659842
Views model: Random Forest, train: 5182425.779468, dev: 6093255.321696
Likes model: Random Forest, train: 103136.809334, dev: 116303.196309
Dislikes model: Random Forest, train: 20122.750107, dev: 21383.727413


#### Category

In [15]:
cat_views, cat_likes, cat_dislikes = select_model(models, names, ['category_id'])

Selecting model for  views
Training  Random Forest
Train err: 8317615.420495, Dev err: 9681581.376228
Training  RidgeCV
Train err: 8435252.012566, Dev err: 9830008.528452
Selecting model for  likes
Training  Random Forest
Train err: 190001.452207, Dev err: 216984.595891
Training  RidgeCV
Train err: 195494.390111, Dev err: 222549.713575
Selecting model for  dislikes
Training  Random Forest
Train err: 32603.418526, Dev err: 32776.852052
Training  RidgeCV
Train err: 32812.762948, Dev err: 32754.428125
Views model: Random Forest, train: 8317615.420495, dev: 9681581.376228
Likes model: Random Forest, train: 190001.452207, dev: 216984.595891
Dislikes model: RidgeCV, train: 32812.762948, dev: 32754.428125


#### Channel AND Category

In [16]:
chancat_views, chancat_likes, chancat_dislikes = select_model(models, names, ['channel', 'category_id'])

Selecting model for  views
Training  Random Forest
Train err: 5188674.173167, Dev err: 6034105.147710
Training  RidgeCV
Train err: 8435167.838002, Dev err: 9829832.913317
Selecting model for  likes
Training  Random Forest
Train err: 100706.788303, Dev err: 115303.665551
Training  RidgeCV
Train err: 195451.974301, Dev err: 222501.077833
Selecting model for  dislikes
Training  Random Forest
Train err: 19845.189330, Dev err: 21115.229526
Training  RidgeCV
Train err: 32803.104194, Dev err: 32740.237110
Views model: Random Forest, train: 5188674.173167, dev: 6034105.147710
Likes model: Random Forest, train: 100706.788303, dev: 115303.665551
Dislikes model: Random Forest, train: 19845.189330, dev: 21115.229526


### TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

In [18]:
vectorizer = TfidfVectorizer(max_features=30000, stop_words='english')
vectorizer.fit(X_train['alltext'].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
tfidf_X_train = vectorizer.transform(X_train['alltext'])

In [20]:
tfidf_X_dev = vectorizer.transform(X_dev['alltext'])
tfidf_X_test = vectorizer.transform(X_test['alltext'])

Using SGD regressor because number of features is very large

In [21]:
tfidf_views_model = SGDRegressor(max_iter=2000, tol=1e-3, alpha=0.00001)
tfidf_views_model.fit(tfidf_X_train, Y_train['views'].values)



SGDRegressor(alpha=1e-05, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=2000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [22]:
print("Views train err: ", eval_metric(Y_train['views'].values, tfidf_views_model.predict(tfidf_X_train)))
print("Views dev err: ", eval_metric(Y_dev['views'].values, tfidf_views_model.predict(tfidf_X_dev)))

Views train err:  4661384.501312322
Views dev err:  5359398.33774294


In [23]:
likes_model = SGDRegressor(max_iter=2000, tol=1e-3, alpha=0)
likes_model.fit(tfidf_X_train, Y_train['likes'].values)



SGDRegressor(alpha=0, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=2000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [24]:
print("Likes train err: ", eval_metric(Y_train['likes'].values, likes_model.predict(tfidf_X_train)))
print("Likes dev err: ", eval_metric(Y_dev['likes'].values, likes_model.predict(tfidf_X_dev)))

Likes train err:  78805.76721026869
Likes dev err:  90689.80721719166


In [25]:
dislikes_model = SGDRegressor(max_iter=2000, tol=1e-3, alpha=0)
dislikes_model.fit(tfidf_X_train, Y_train['dislikes'].values)



SGDRegressor(alpha=0, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=2000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [26]:
print("Dislikes train err: ", eval_metric(Y_train['dislikes'].values, dislikes_model.predict(tfidf_X_train)))
print("Dislikes dev err: ", eval_metric(Y_dev['dislikes'].values, dislikes_model.predict(tfidf_X_dev)))

Dislikes train err:  15534.158980529544
Dislikes dev err:  13089.850712527501


## PV-DM

In [27]:
from pvdm import PVDM

In [28]:
pvdm = PVDM('data/fasttext_en.vec')

In [30]:
pvdm_X_train = pvdm.train(list(X_train['alltext'].values), max_iter=3000, lr=0.01)

iter: 0, loss: 11.500093
iter: 100, loss: 11.255790
iter: 200, loss: 10.927731
iter: 300, loss: 10.351659
iter: 400, loss: 10.041838
iter: 500, loss: 9.816584
iter: 600, loss: 9.247374
iter: 700, loss: 8.836740
iter: 800, loss: 8.550824
iter: 900, loss: 9.107790
iter: 1000, loss: 8.945185
iter: 1100, loss: 9.160015
iter: 1200, loss: 8.884859
iter: 1300, loss: 8.106627
iter: 1400, loss: 8.402533
iter: 1500, loss: 8.156935
iter: 1600, loss: 8.956823
iter: 1700, loss: 8.423058
iter: 1800, loss: 8.266140
iter: 1900, loss: 9.126919
iter: 2000, loss: 9.154280
iter: 2100, loss: 8.596383
iter: 2200, loss: 8.777518
iter: 2300, loss: 9.195590
iter: 2400, loss: 8.264833
iter: 2500, loss: 8.894335
iter: 2600, loss: 8.197789
iter: 2700, loss: 8.140443
iter: 2800, loss: 8.227644
iter: 2900, loss: 8.323967
iter: 2999, loss: 7.700732


In [31]:
pvdm_X_dev = pvdm.vectorize(list(X_dev['alltext'].values))

iter: 0, loss: 8.226328
iter: 100, loss: 8.449644
iter: 200, loss: 7.915804
iter: 300, loss: 7.915294
iter: 400, loss: 7.762508
iter: 500, loss: 8.304609
iter: 600, loss: 8.304291
iter: 700, loss: 8.211000
iter: 800, loss: 8.316038
iter: 900, loss: 8.293601
iter: 999, loss: 7.377135


In [48]:
pvdm_views_model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10])
pvdm_views_model.fit(pvdm_X_train, Y_train['views'].values)

RidgeCV(alphas=array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01]), cv=None,
    fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [49]:
print("Views train err: ", eval_metric(Y_train['views'].values, pvdm_views_model.predict(pvdm_X_train)))
print("Views dev err: ", eval_metric(Y_dev['views'].values, pvdm_views_model.predict(pvdm_X_dev)))

Views train err:  8506653.041300824
Views dev err:  9966533.197021779


In [50]:
pvdm_likes_model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10])
pvdm_likes_model.fit(pvdm_X_train, Y_train['likes'].values)

RidgeCV(alphas=array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01]), cv=None,
    fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [51]:
print("Likes train err: ", eval_metric(Y_train['likes'].values, pvdm_likes_model.predict(pvdm_X_train)))
print("Likes dev err: ", eval_metric(Y_dev['likes'].values, pvdm_likes_model.predict(pvdm_X_dev)))

Likes train err:  197785.03082993082
Likes dev err:  225953.51705522492


In [52]:
pvdm_dislikes_model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10])
pvdm_dislikes_model.fit(pvdm_X_train, Y_train['dislikes'].values)

RidgeCV(alphas=array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01]), cv=None,
    fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [53]:
print("Dislikes train err: ", eval_metric(Y_train['dislikes'].values, pvdm_dislikes_model.predict(pvdm_X_train)))
print("Dislikes dev err: ", eval_metric(Y_dev['dislikes'].values, pvdm_dislikes_model.predict(pvdm_X_dev)))

Dislikes train err:  32739.557838638317
Dislikes dev err:  32874.26176056329


## Results

| Model | Views | Likes | Dislikes |
| ---   |  ---  | ---   | ---      |
| Channel | 6093255.321696 | 116303.196309 | 21383.727413 |
| Category | 9681581.376228 | 216984.595891 | 32754.428125 |
| Channel+Category | 6034105.147710 | 115303.665551 | 21115.229526 |
| TF-IDF | 5359398.33774294 | 90689.80721719166 | 13089.850712527501 |
| PV-DV | 9966533.197021779 | 225953.51705522492 | 32874.26176056329 |

Note: it's probably possible to achieve better results with PV-DM, I did not do proper hyperparamtere tuning and model selection because it takes too long.

### TF-IDF Test score

In [57]:
print("Views test err: ", eval_metric(Y_test['views'].values, tfidf_views_model.predict(tfidf_X_test)))
print("Likes test err: ", eval_metric(Y_test['likes'].values, likes_model.predict(tfidf_X_test)))
print("Dislikes test err: ", eval_metric(Y_test['dislikes'].values, dislikes_model.predict(tfidf_X_test)))

Views test err:  4954561.639136972
Likes test err:  87758.77243487755
Dislikes test err:  15821.163421204124
