In [76]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

In [77]:
dataset = pd.concat([pd.read_csv("../data/ytunlabeled2.csv"), pd.read_csv("../data/ytunlabeled3.csv")])
dataset = dataset[["viewCount", "avg polarity score", "subscribers", "totalVideos", "totalViews", "duration"]]
dataset = dataset[dataset["avg polarity score"].notna()]
label = "viewCount"

def convert_timestamp(time):
    parsing_str = "PT"
    
    if "H" in time:
        parsing_str += "%HH"
    if "M" in time:
        parsing_str += "%MM"
    if "S" in time:
        parsing_str += "%SS"
        
    t = datetime.strptime(time, parsing_str)
    # convert to timedelta to get the total seconds
    td = timedelta(minutes=t.minute, seconds=t.second)
    return td.total_seconds()

dataset["duration"] = dataset["duration"].apply(lambda time: convert_timestamp(time))

In [78]:
dataset[label].describe()

count    1.595000e+03
mean     1.262284e+05
std      4.799890e+05
min      0.000000e+00
25%      4.515000e+03
50%      3.208700e+04
75%      8.751800e+04
max      8.015846e+06
Name: viewCount, dtype: float64

In [79]:
x_train, x_test, y_train, y_test = train_test_split(dataset.drop(columns=[label]), dataset[label], test_size=0.33, random_state=42)

In [80]:
model = RandomForestRegressor()
model.fit(x_train, y_train)

preds = model.predict(x_test)
print("MSE:", mean_squared_error(y_test, preds))
print("MAE:", mean_absolute_error(y_test, preds))

MSE: 96014679310.36821
MAE: 63446.9380689437


In [81]:
diff = np.abs(preds - y_test.values)
res = pd.concat([pd.Series(preds), y_test.reset_index(drop=True), pd.Series(diff)], axis=1)
res.columns = ["preds", "viewCount", "diff"]
res

Unnamed: 0,preds,viewCount,diff
0,8.175472e+04,70858,10896.720000
1,2.605501e+04,1291,24764.010000
2,1.427826e+06,2065737,637910.590000
3,2.242567e+05,175157,49099.680000
4,7.861667e-01,1,0.213833
...,...,...,...
522,3.359510e+03,5415,2055.490000
523,2.279327e+05,266015,38082.340000
524,1.063016e+04,16379,5748.840000
525,5.678374e+04,71311,14527.260000


In [None]:
# save the model to disk
filename = 'random_forest.joblib'
joblib.dump(model, filename)

In [84]:
print(x_train)

     avg polarity score  subscribers  totalVideos  totalViews  duration
319            0.392919        57000          249     5006198    1124.0
355            0.271177        47500          253     5917076     337.0
495            0.303378       135000         1289    21427061     200.0
209            0.342399      1710000         2320   347648425     296.0
140            0.341549      1710000         2320   347648425     326.0
..                  ...          ...          ...         ...       ...
333            0.268607        47500          253     5917076     330.0
497            0.285690       135000         1289    21427061     150.0
63             0.279763        25600          392     2343442     313.0
662            0.554277       679000          322    55991342    1516.0
329            0.398720        57000          249     5006198    1220.0

[1068 rows x 5 columns]
