# Views/likes/dislikes prediction

In [1]:
import pandas as pd
import numpy as np
import scipy
import sklearn
import langdetect

## Preprocessing

### Load data

In [2]:
us_data = pd.read_csv('data/USvideos.csv')
gb_data = pd.read_csv('data/GBvideos.csv')
ca_data = pd.read_csv('data/CAvideos.csv')

df = pd.concat([us_data, gb_data, ca_data])

### Language filter

In [6]:
def detect_lang(row):
    t = '\n'.join([str(row['title']), str(row['description'])])
    try:
        return langdetect.detect(t)
    except:
        return 'err'

df['lang'] = df.apply(detect_lang, axis=1)

In [7]:
df = df[df['lang'] == 'en']

### Time to trend

In [8]:
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ')

In [9]:
df['time_to_trend'] = df['trending_date'] - df['publish_time']

### Preprocess

In [3]:
df.description.fillna('', inplace=True)
df.title.fillna('', inplace=True)
df.tags.fillna('', inplace=True)
df['alltext'] = df['title'] + ' ' + df['description'] + ' ' + df['tags']

In [4]:
df['channel'] = df['channel_title'].astype('category')
df['channel'] = df['channel'].cat.codes

### Train / dev / test split

In [5]:
dfs = df.sample(frac=1)
X = dfs[['alltext', 'channel', 'category_id']]
Y = dfs[['views', 'likes', 'dislikes']]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_test, Y_test, test_size=0.5, random_state=1)

In [7]:
print(X_train.shape)
print(X_test.shape)
print(X_dev.shape)
print(Y_train.shape)
print(Y_test.shape)
print(Y_dev.shape)

(70261, 3)
(8783, 3)
(8783, 3)
(70261, 3)
(8783, 3)
(8783, 3)


## Evaluation

In [8]:
from sklearn.metrics import mean_squared_error

def eval_metric(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def train_eval_model(model, features, target):
    assert target in ['views', 'likes', 'dislikes']
    model.fit(X_train[features].values, Y_train[target])
    return (eval_metric(Y_train[target], model.predict(X_train[features].values)),
            eval_metric(Y_dev[target], model.predict(X_dev[features].values)))

## Baselines

In [11]:
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm

In [9]:
def select_model(models, names, features):
    best = {
        'views': None,
        'likes': None,
        'dislikes': None,
    }
    for target in ['views', 'likes', 'dislikes']:
        print("Selecting model for ", target)
        for model, name in zip(sklearn.base.clone(models), names):
            print("Training ", name)
            train_err, dev_err = train_eval_model(model, features, target)
            print("Train err: %f, Dev err: %f" % (train_err, dev_err))
            if best[target] is None or best[target]['dev'] > dev_err:
                best[target] = {
                    'model': model,
                    'name': name,
                    'dev': dev_err,
                    'train': train_err,
                }
    print("Views model: %s, train: %f, dev: %f" %
              (best['views']['name'], best['views']['train'], best['views']['dev']))
    print("Likes model: %s, train: %f, dev: %f" %
              (best['likes']['name'], best['likes']['train'], best['likes']['dev']))
    print("Dislikes model: %s, train: %f, dev: %f" %
              (best['dislikes']['name'], best['dislikes']['train'], best['dislikes']['dev']))
    return best['views']['model'], best['likes']['model'], best['dislikes']['model']

In [12]:
models, names = zip(*[
    (RandomForestRegressor(n_estimators=10), "Random Forest"),
    (RidgeCV(), "RidgeCV"),
])

### Categorical

#### Channel

In [13]:
chan_views, chan_likes, chan_dislikes = select_model(models, names, ['channel'])

Selecting model for  views
Training  Random Forest
Train err: 6192499.765740, Dev err: 7098472.222174
Training  RidgeCV
Train err: 10127596.334401, Dev err: 13145960.821158
Selecting model for  likes
Training  Random Forest
Train err: 104566.583243, Dev err: 111207.601193
Training  RidgeCV
Train err: 204313.931787, Dev err: 211667.960349
Selecting model for  dislikes
Training  Random Forest
Train err: 27260.155075, Dev err: 20683.684098
Training  RidgeCV
Train err: 40140.474151, Dev err: 32477.209702
Views model: Random Forest, train: 6192499.765740, dev: 7098472.222174
Likes model: Random Forest, train: 104566.583243, dev: 111207.601193
Dislikes model: Random Forest, train: 27260.155075, dev: 20683.684098


#### Category

In [14]:
cat_views, cat_likes, cat_dislikes = select_model(models, names, ['category_id'])

Selecting model for  views
Training  Random Forest
Train err: 9866500.230407, Dev err: 12853978.206174
Training  RidgeCV
Train err: 10018397.213103, Dev err: 13028218.963434
Selecting model for  likes
Training  Random Forest
Train err: 195484.798433, Dev err: 202117.482390
Training  RidgeCV
Train err: 201330.885721, Dev err: 208481.913170
Selecting model for  dislikes
Training  Random Forest
Train err: 39995.260741, Dev err: 32243.687084
Training  RidgeCV
Train err: 40140.194049, Dev err: 32473.375132
Views model: Random Forest, train: 9866500.230407, dev: 12853978.206174
Likes model: Random Forest, train: 195484.798433, dev: 202117.482390
Dislikes model: Random Forest, train: 39995.260741, dev: 32243.687084


#### Channel AND Category

In [15]:
chancat_views, chancat_likes, chancat_dislikes = select_model(models, names, ['channel', 'category_id'])

Selecting model for  views
Training  Random Forest
Train err: 6177665.852854, Dev err: 7171541.241929
Training  RidgeCV
Train err: 10018262.367244, Dev err: 13027258.791271
Selecting model for  likes
Training  Random Forest
Train err: 102707.069348, Dev err: 108652.568836
Training  RidgeCV
Train err: 201309.266142, Dev err: 208442.972291
Selecting model for  dislikes
Training  Random Forest
Train err: 27265.172905, Dev err: 20649.202800
Training  RidgeCV
Train err: 40135.166088, Dev err: 32465.433668
Views model: Random Forest, train: 6177665.852854, dev: 7171541.241929
Likes model: Random Forest, train: 102707.069348, dev: 108652.568836
Dislikes model: Random Forest, train: 27265.172905, dev: 20649.202800


### TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

In [17]:
vectorizer = TfidfVectorizer(max_features=30000, stop_words='english')
vectorizer.fit(df['alltext'].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [18]:
tfidf_X_train = vectorizer.transform(X_train['alltext'])

In [21]:
tfidf_X_dev = vectorizer.transform(X_dev['alltext'])
tfidf_X_test = vectorizer.transform(X_test['alltext'])

Using SGD regressor because number of features is very large

In [57]:
tfidf_views_model = SGDRegressor(max_iter=2000, tol=1e-3, alpha=0.00001)
tfidf_views_model.fit(tfidf_X_train, Y_train['views'].values)



SGDRegressor(alpha=1e-05, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=2000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [58]:
print("Views train err: ", eval_metric(Y_train['views'].values, tfidf_views_model.predict(tfidf_X_train)))
print("Views dev err: ", eval_metric(Y_dev['views'].values, tfidf_views_model.predict(tfidf_X_dev)))

Views train err:  5495007.174184686
Views dev err:  6606515.312405204


In [63]:
likes_model = SGDRegressor(max_iter=2000, tol=1e-3, alpha=0)
likes_model.fit(tfidf_X_train, Y_train['likes'].values)



SGDRegressor(alpha=0, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=2000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [62]:
print("Likes train err: ", eval_metric(Y_train['likes'].values, likes_model.predict(tfidf_X_train)))
print("Likes dev err: ", eval_metric(Y_dev['likes'].values, likes_model.predict(tfidf_X_dev)))

Likes train err:  97584.03789649117
Likes dev err:  103454.07268251332


In [40]:
dislikes_model = SGDRegressor(max_iter=2000, tol=1e-3, alpha=0)
dislikes_model.fit(tfidf_X_train, Y_train['dislikes'].values)



SGDRegressor(alpha=0, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=2000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [41]:
print("Dislikes train err: ", eval_metric(Y_train['dislikes'].values, dislikes_model.predict(tfidf_X_train)))
print("Dislikes dev err: ", eval_metric(Y_dev['dislikes'].values, dislikes_model.predict(tfidf_X_dev)))

Dislikes train err:  19957.785186487672
Dislikes dev err:  14388.956251537587
