In [140]:
## Set up dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb

df = pd.read_csv("../data/youtube_project_database.csv")

In [141]:
## Clean up data

import re

#Views
df = df[df['viewCount'].notna()]
df['logViews'] = np.log(df['viewCount'] + 1)

mean_log_views = np.mean(df['logViews'])
std_log_views = np.std(df['logViews'])
df['zLogViews'] = (df['logViews'] - mean_log_views) / std_log_views

# Categories
df["categoryId"].astype("category")

# Average Views
df = df[df["videoCount"] > 0]
df['avgViewsPerVid'] = pd.to_numeric(df['avgViewsPerVid'])

# Titles
def process_title(title):
    title = str(title)
    # title = re.sub(r"&.*?;", " ", title)
    title = re.sub(r"[!\"#\＄%&\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~]", " ", title)
    title = title.lower()
    title = title.split(" ")
    title = list(filter(None, title))
    return title

titles = df["vidTitle"].map(process_title)

# Tags
def process_tags(tags):
    if(tags == None):
        return []
    tags = str(tags)
    tags = tags.lower()
    tags = tags[1:-1]
    tags = tags.split(", ")
    tags = [tag[1:-1] for tag in tags]
    tags = list(filter(None, tags))
    return tags

tags = df["tags"].map(process_tags)
df["tagCount"] = len(tags)

# Times
df.loc[df['retrievalTime'].isna(), 'retrievalTime'] = '2023-04-07 14:25:00'

df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ')
df['retrievalTime'] = pd.to_datetime(df['retrievalTime'], format='%Y-%m-%d %H:%M:%S')
df['timeElapsed'] = (df['retrievalTime'] - df['publishedAt']).dt.days

In [142]:
## Train Model for FastText

from gensim.models import Word2Vec, FastText

def convertToVec(ft, wordlist):
    if (len(wordlist) == 0):
        return np.zeros(ft.vector_size)
    else:
        return np.mean([ft.wv[word] for word in wordlist], axis=0)

vector_size_titles = 300
ft_titles = FastText(titles, vector_size=vector_size_titles)
title_vectors = [convertToVec(ft_titles, title) for title in titles]
title_vec_cols = ["titleVec" + str(num) for num in range(vector_size_titles)]
title_vec_df = pd.DataFrame(title_vectors, index=df.index, columns = title_vec_cols)
df = pd.concat([df, title_vec_df], axis=1)

vector_size_tags = 300
ft_tags = FastText(tags, min_count=2, vector_size = vector_size_tags)
tags_vectors = [convertToVec(ft_tags, tagList) for tagList in tags]
tags_vec_cols = ["tagsVec" + str(num) for num in range(vector_size_tags)]
tags_vec_df = pd.DataFrame(tags_vectors, index=df.index, columns = tags_vec_cols)
df = pd.concat([df, tags_vec_df], axis=1)

In [155]:
## Train Model for Img2Vec

from img2vec_pytorch import Img2Vec
from PIL import Image
import requests
from io import BytesIO

# Initialize Img2Vec with GPU
img2vec = Img2Vec()

# Read in an image (rgb format)

thumbnail_vectors = df['thumbnail lq'].map(lambda url : img2vec.get_vec(Image.open(BytesIO(requests.get(url).content)))).tolist()

vector_size_thumbnail = len(vectors[0])
thumbnail_vec_cols = ["thumbnailVec" + str(num) for num in range(vector_size_thumbnail)]
thumbnail_vec_df = pd.DataFrame(thumbnail_vectors, index=df.index, columns = thumbnail_vec_cols)
df = pd.concat([df, thumbnail_vec_df], axis=1)



In [159]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

x_labels = ["categoryId", "tagCount", "avgViewsPerVid", "subscriberCount", "videoCount", "timeElapsed"] + title_vec_cols + tags_vec_cols + thumbnail_vec_cols
y_labels = ["zLogViews"]
x = df.loc[:, x_labels]
y = df.loc[:, y_labels]
dmatrix = xgb.DMatrix(data=x,label=y, enable_categorical=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror',
                           learning_rate = 0.1,
                           max_depth = 5,
                           n_estimators = 200)

xgb_reg.fit(x_train, y_train)
pred = xgb_reg.predict(x_test)

print("MAE (just mean): " + str(mean_absolute_error(np.exp(y_test), x_test["avgViewsPerVid"])))
print("MAE (all params): " + str(mean_absolute_error(np.exp(y_test * std_log_views + mean_log_views), np.exp(pred * std_log_views + mean_log_views))))

MAE (just mean): 1813171.0902219412
MAE (all params): 517597.5896786193


In [160]:
np.std(df["viewCount"])

868244.334412082