# MSA 2020 - AI & Advanced Analytics - Trending YouTube Video Statistics

### Import libraries

In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

### Read in file

In [2]:
# Read the data set, taking the first CSV column as the row index,
# then preview the first rows.
df = pd.read_csv(
    'US_youtube.csv',
    index_col=0,
)
df.head()

Unnamed: 0_level_0,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


### Remove unwanted columns

In [3]:
# Count the missing values in each column (column-wise sum of the null mask).
df.isnull().sum(axis=0)

trending_date               0
title                       0
channel_title               0
category_id                 0
publish_time                0
tags                        0
views                       0
likes                       0
dislikes                    0
comment_count               0
thumbnail_link              0
comments_disabled           0
ratings_disabled            0
video_error_or_removed      0
description               570
dtype: int64

In [4]:
# Keep only the columns used for modelling. The explicit .copy() makes the
# result an independent frame, so adding or overwriting columns on it later
# does not raise SettingWithCopyWarning.
df = df[[
    "channel_title",
    "category_id",
    "publish_time",
    "views",
    "likes",
    "dislikes",
    "comment_count",
    "comments_disabled",
    "ratings_disabled",
    "video_error_or_removed",
]].copy()

# Confirm that none of the retained columns has missing values.
df.isnull().sum()

channel_title             0
category_id               0
publish_time              0
views                     0
likes                     0
dislikes                  0
comment_count             0
comments_disabled         0
ratings_disabled          0
video_error_or_removed    0
dtype: int64

### Parse publish_time column

In [5]:
# Parse the publish timestamp once (values look like
# '2017-11-13T17:13:01.000Z') instead of splitting the raw string repeatedly.
# utc=True keeps the hour exactly as written in the UTC ('Z') timestamp.
publish_ts = pd.to_datetime(df['publish_time'].astype(str), utc=True)

# Work on an explicit copy so the column assignments below never write into a
# frame that pandas still considers a slice of another one.
df = df.copy()

# Derive the calendar features; astype(int) keeps them as plain integer columns.
df['publish_year'] = publish_ts.dt.year.astype(int)
df['publish_month'] = publish_ts.dt.month.astype(int)
df['publish_hour'] = publish_ts.dt.hour.astype(int)

# The raw timestamp string is no longer needed once the features exist.
df = df.drop(['publish_time'], axis=1)
df.head()

Unnamed: 0_level_0,channel_title,category_id,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,publish_year,publish_month,publish_hour
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2kyS6SvSYSE,CaseyNeistat,22,748374,57527,2966,15954,False,False,False,2017,11,17
1ZAPwfrtAFY,LastWeekTonight,24,2418783,97185,6146,12703,False,False,False,2017,11,7
5qpjK5DgCt4,Rudy Mancuso,23,3191434,146033,5339,8181,False,False,False,2017,11,19
puqaWrEC7tY,Good Mythical Morning,24,343168,10172,666,2146,False,False,False,2017,11,11
d380meD0W0M,nigahiga,24,2095731,132235,1989,17518,False,False,False,2017,11,18


### Encode categorical columns

In [6]:
# One-hot encode the channel name and the three flag columns; every other
# column is passed through unchanged.
df = pd.get_dummies(
    data=df,
    columns=[
        'channel_title',
        'comments_disabled',
        'ratings_disabled',
        'video_error_or_removed',
    ],
)
df.head()

Unnamed: 0_level_0,category_id,views,likes,dislikes,comment_count,publish_year,publish_month,publish_hour,channel_title_12 News,channel_title_1MILLION Dance Studio,...,channel_title_圧倒的不審者の極み!,channel_title_杰威爾音樂 JVR Music,channel_title_郭韋辰,channel_title_영국남자 Korean Englishman,comments_disabled_False,comments_disabled_True,ratings_disabled_False,ratings_disabled_True,video_error_or_removed_False,video_error_or_removed_True
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2kyS6SvSYSE,22,748374,57527,2966,15954,2017,11,17,0,0,...,0,0,0,0,1,0,1,0,1,0
1ZAPwfrtAFY,24,2418783,97185,6146,12703,2017,11,7,0,0,...,0,0,0,0,1,0,1,0,1,0
5qpjK5DgCt4,23,3191434,146033,5339,8181,2017,11,19,0,0,...,0,0,0,0,1,0,1,0,1,0
puqaWrEC7tY,24,343168,10172,666,2146,2017,11,11,0,0,...,0,0,0,0,1,0,1,0,1,0
d380meD0W0M,24,2095731,132235,1989,17518,2017,11,18,0,0,...,0,0,0,0,1,0,1,0,1,0


### Split data into training and testing sets

In [7]:
# Hold out 20% of the rows for testing; `views` is the regression target and
# every remaining column is a feature. The fixed random_state makes the split,
# and therefore every score computed from it, reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    df.drop('views', axis=1),
    df['views'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

# Candidate models, each with its default hyper-parameters.
regressors = [LinearRegression(), Lasso(), ElasticNet(), Ridge(), SVR(), DecisionTreeRegressor(), KNeighborsRegressor(), GradientBoostingRegressor()]

# One (mean cross-validation score, model name) pair per regressor. With no
# `scoring` argument, cross_val_score uses the estimator's own `score` method,
# which for scikit-learn regressors is R^2, so a higher value is better.
regressor_accuracy_list = []

for regressor in regressors:
    # Print the model before fitting so progress is visible on slow estimators.
    print(regressor)
    accuracies = cross_val_score(regressor, train_x, train_y, cv=5)
    # Store the name next to the score; the report below needs both.
    regressor_accuracy_list.append((accuracies.mean(), type(regressor).__name__))

# Rank best-first, comparing on the score only.
regressor_accuracy_list = sorted(regressor_accuracy_list, key=lambda item: item[0], reverse=True)
for item in regressor_accuracy_list:
    print(item[1], ": ", item[0])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
