**1. Load the dataset**
============

In [35]:
import pandas as pd
from langdetect import detect
import gzip

"""
Поскольку выделение англискиих названий занимает много времени, 
загружаем уже готовый датасет с только англискими title и description.
Код препроцессинга по языкам представлен ниже
"""

with gzip.open('dataset/en_language_dataset.csv.tar.gz') as f:
    dataset = pd.read_csv(f)

dataset.head()

Unnamed: 0,en_language_dataset.csv,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,0,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10.0,2017-11-10T17:00:03.000Z,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579.0,787425.0,43420.0,125882.0,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...
1,2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23.0,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434.0,146035.0,5339.0,8181.0,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
2,4,2Vv-BfVoq4g,17.14.11,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,10.0,2017-11-09T11:04:14.000Z,"edsheeran|""ed sheeran""|""acoustic""|""live""|""cove...",33523622.0,1634130.0,21082.0,85067.0,https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg,False,False,False,🎧: https://ad.gt/yt-perfect\n💰: https://atlant...
3,5,0yIWz1XEeyc,17.14.11,Jake Paul Says Alissa Violet CHEATED with LOGA...,DramaAlert,25.0,2017-11-13T07:37:51.000Z,"#DramaAlert|""Drama""|""Alert""|""DramaAlert""|""keem...",1309699.0,103755.0,4613.0,12143.0,https://i.ytimg.com/vi/0yIWz1XEeyc/default.jpg,False,False,False,► Follow for News! - https://twitter.com/KEEMS...
4,6,_uM5kFfkhB8,17.14.11,Vanoss Superhero School - New Students,VanossGaming,23.0,2017-11-12T23:52:13.000Z,"Funny Moments|""Montage video games""|""gaming""|""...",2987945.0,187464.0,9850.0,26629.0,https://i.ytimg.com/vi/_uM5kFfkhB8/default.jpg,False,False,False,Vanoss Merch Shop: https://vanoss.3blackdot.co...


**2. Preprocessing**
===========

**Оставить только записи с английским title и description**

In [8]:
def detect_en_lang(title, description):
    try:
        return detect(title) == 'en' and detect(description) == 'en'
    except:
        return False
    
    
# cavideos = pd.read_csv('dataset/CAvideos.csv')
# devideos = pd.read_csv('dataset/DEvideos.csv')
# frvideos = pd.read_csv('dataset/FRvideos.csv')
# gbvideos = pd.read_csv('dataset/GBvideos.csv')
# usvideos = pd.read_csv('dataset/USvideos.csv')
# dataset = dataset.append(cavideos).append(devideos).append(frvideos).append(gbvideos).append(usvideos)
# dataset = dataset[dataset.apply(lambda x: detect_en_lang(str(x['title']), str(x['description'])), axis=1)]

**Интерполируем к периоду в часах после publish_time**

In [9]:
import datetime

trending_date = pd.to_datetime(dataset['trending_date'], format='%y.%d.%m')
publish_time = pd.to_datetime(dataset['publish_time'])
trending_publish_diff = (trending_date - publish_time).map(lambda x: x.seconds / 3600)
dataset['trending_publish_diff'] = trending_publish_diff

**Baseline 1**
========

In [23]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error

le = preprocessing.LabelEncoder()
dataset['channel_title'] = le.fit_transform(dataset['channel_title'].fillna(''))

X_channel_and_category = dataset[['channel_title', 'category_id']]
X_channel_and_category = X_channel_and_category.dropna(how='any')
X_channel = X_channel_and_category[['channel_title']]
X_category = X_channel_and_category[['category_id']]

y = dataset[['views', 'likes', 'dislikes']]
y = y.dropna(how='any')

metrics = [r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error]

def predict_by(X, y, title):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, random_state=0)
    X_test, X_development, y_test, y_development = train_test_split(X_test, y_test, train_size=0.5, test_size=0.5, random_state=0)

    knn = KNeighborsRegressor(n_neighbors=15)
    regr = MultiOutputRegressor(knn)
    regr.fit(X_train, y_train)
    y_test_pred = regr.predict(X_test)
    
    print(f"by {title}:")
    for metric in metrics:  
        print(f"{metric.__name__}: {metric(y_test, y_test_pred, multioutput='raw_values')}")
    
predict_by(X_channel, y, 'channel')
predict_by(X_category, y, 'category')
predict_by(X_channel_and_category, y, 'channel and category')


by channel:
r2_score: [0.44072617 0.60233555 0.54568759]
mean_absolute_error: [1384596.90384532   29968.62602074    2800.26666667]
mean_squared_error: [2.33938306e+13 1.07669480e+10 8.21213195e+08]
mean_squared_log_error: [1.52482739 1.8772831  2.20052332]
by category:
r2_score: [-0.02581641 -0.00330245 -0.00401286]
mean_absolute_error: [2007433.09885504   54903.56815727    3853.56465759]
mean_squared_error: [4.29088116e+13 2.71648758e+10 1.81484938e+09]
mean_squared_log_error: [3.84558901 5.07856805 4.93766782]
by channel and category:
r2_score: [0.41826584 0.54418443 0.62320079]
mean_absolute_error: [1408472.07020955   31203.96180601    2729.15017282]
mean_squared_error: [2.43333226e+13 1.23414165e+10 6.81100647e+08]
mean_squared_log_error: [1.48295895 1.87162236 2.19095673]


**Baseline 2**
========

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = dataset[['title', 'description']]
y = dataset[['views', 'likes', 'dislikes']]
X = X.dropna(how='any')
y = y.dropna(how='any')

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, random_state=0)
X_test, X_development, y_test, y_development = train_test_split(X_test, y_test, train_size=0.05, test_size=0.95, random_state=0)

vectorizer = TfidfVectorizer()
X_train_title_transformed = vectorizer.fit_transform(X_train['title'].fillna(''))
X_test_title_transformed = vectorizer.transform(X_test['title'].fillna(''))

X_train_description_transofmed = vectorizer.fit_transform(X_train['description'].fillna(''))
X_test_description_transformed = vectorizer.transform(X_test['description'].fillna(''))

def tfidf_predict_by(X_train, X_test, y_train, y_test, title):
    knn = KNeighborsRegressor(n_neighbors=15)
    regr = MultiOutputRegressor(knn)
    regr.fit(X_train, y_train)
    y_test_pred = regr.predict(X_test)
    
    print(f"by {title}:")
    for metric in metrics:  
        print(f"{metric.__name__}: {metric(y_test, y_test_pred, multioutput='raw_values')}")
    

tfidf_predict_by(X_train_title_transformed, X_test_title_transformed, y_train, y_test, 'title')
tfidf_predict_by(X_train_description_transofmed, X_test_description_transformed, y_train, y_test, 'description')


by title:
r2_score: [0.68350366 0.79999343 0.91793384]
mean_absolute_error: [1458066.65388079   36152.54820818    2735.08222582]
mean_squared_error: [1.66225657e+13 7.77327788e+09 2.21604641e+08]
mean_squared_log_error: [2.27575674 3.35160713 3.07395405]
by description:
r2_score: [0.72660195 0.83931109 0.88503297]
mean_absolute_error: [1267994.26075995   28125.38779038    2428.98667387]
mean_squared_error: [1.43590194e+13 6.24519266e+09 3.10447402e+08]
mean_squared_log_error: [1.91332478 2.53416319 2.7038341 ]
