In [265]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

import warnings
warnings.filterwarnings('ignore')

In [103]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
label = "Hours_watched"

In [104]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
submission = pd.read_csv(TEST_PATH)

In [105]:
train.head(1)

Unnamed: 0,Rank,Game,Month,Year,Hours_watched,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewer_ratio
0,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,69.29


In [106]:
#train = train.drop('Rank',axis=1)

In [107]:
test.head(1)

Unnamed: 0,Game,Month,Year,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewer_ratio
0,Life is Strange,5,2021,19085,119098,62,3240,71.19


In [108]:
64 * 8

512

## Data Transformations
1. Need to pivot data on date so each row has 64 * 6 rows (months * none date features).
2. Add lag values to data points

Lets start by working with only LoL to make dataset smaller for testing/

In [588]:
df = train.append(test)

In [589]:
data = df[df['Game'] == "League of Legends"]

In [590]:
features = ['Rank', 'Hours_watched', 'Hours_Streamed',
       'Peak_viewers', 'Peak_channels', 'Streamers', 'Avg_viewer_ratio']

In [591]:
features = ['Hours_watched']
lags = 1
unique_games = df['Game'].unique()

In [592]:
unique_games[0:6]

array(['League of Legends', 'Counter-Strike: Global Offensive', 'Dota 2',
       'Hearthstone', 'Call of Duty: Black Ops III', 'Minecraft'],
      dtype=object)

In [593]:
prediction_games = test['Game'].unique()

In [594]:
all_data = []
for game in unique_games: 
    temp = df[df['Game'] == game]
    for lag in range(1,3):
        for feature in features:
            temp[feature + str(lag)] = temp[feature].shift(lag)
    temp['hours_watch_change'] = temp['Hours_watched1'] - temp['Hours_watched2']
    temp['hours_watch_change_mean'] = temp['hours_watch_change'].mean()
    temp['hours_watch_change_std'] = temp['hours_watch_change'].std()
    temp['num_months_top_200'] = temp.shape[0]
    temp['in_prediction_set'] = np.where(temp['Game'].isin(prediction_games),1,0)
    temp = temp.tail(10)
    all_data.append(temp)

In [595]:
all_data[0]

Unnamed: 0,Rank,Game,Month,Year,Hours_watched,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewer_ratio,Hours_watched1,Hours_watched2,hours_watch_change,hours_watch_change_mean,hours_watch_change_std,num_months_top_200,in_prediction_set
10951,2.0,League of Legends,8,2020,142551572.0,3250969,707347,7440,229635,43.85,135123272.0,143290296.0,-8167024.0,993919.746032,19485030.0,65,1
11152,3.0,League of Legends,9,2020,135008222.0,3174675,794857,6737,230335,42.53,142551572.0,135123272.0,7428300.0,993919.746032,19485030.0,65,1
11351,2.0,League of Legends,10,2020,171613067.0,3367157,2020835,8520,246211,50.97,135008222.0,142551572.0,-7543350.0,993919.746032,19485030.0,65,1
11551,2.0,League of Legends,11,2020,121085351.0,3392709,345045,8877,254028,35.69,171613067.0,135008222.0,36604845.0,993919.746032,19485030.0,65,1
11752,3.0,League of Legends,12,2020,96413165.0,3297197,318449,7979,247003,29.24,121085351.0,171613067.0,-50527716.0,993919.746032,19485030.0,65,1
11952,3.0,League of Legends,1,2021,170781975.0,4354400,689503,11606,294795,39.22,96413165.0,121085351.0,-24672186.0,993919.746032,19485030.0,65,1
12151,2.0,League of Legends,2,2021,150221076.0,3665750,703375,9687,268439,40.98,170781975.0,96413165.0,74368810.0,993919.746032,19485030.0,65,1
12352,3.0,League of Legends,3,2021,149786518.0,3766997,853781,9222,268051,39.76,150221076.0,170781975.0,-20560899.0,993919.746032,19485030.0,65,1
12552,3.0,League of Legends,4,2021,156994170.0,3592720,906631,8333,260952,43.7,149786518.0,150221076.0,-434558.0,993919.746032,19485030.0,65,1
136,,League of Legends,5,2021,,3516137,1257320,7697,251261,49.49,156994170.0,149786518.0,7207652.0,993919.746032,19485030.0,65,1


In [596]:
data = pd.concat(all_data)

In [597]:
data.shape

(6168, 17)

In [598]:
def inpute_missing(dataset):
    """ 
    Edit this to fix nulls. Default version replaces all int/float with 0
    """
    for col in dataset.columns:
        if dataset[col].dtype not in [str, object]:
            dataset[col] = dataset[col].fillna(0)
    return dataset

data = inpute_missing(data)

In [599]:
data.columns

Index(['Rank', 'Game', 'Month', 'Year', 'Hours_watched', 'Hours_Streamed',
       'Peak_viewers', 'Peak_channels', 'Streamers', 'Avg_viewer_ratio',
       'Hours_watched1', 'Hours_watched2', 'hours_watch_change',
       'hours_watch_change_mean', 'hours_watch_change_std',
       'num_months_top_200', 'in_prediction_set'],
      dtype='object')

In [600]:
features = ['Game', 'Month', 'Year', 'Hours_watched', 'Hours_Streamed',
       'Peak_viewers', 'Peak_channels', 'Streamers', 'Avg_viewer_ratio',
       'Hours_watched1', 'Hours_watched2', 'hours_watch_change',
       'hours_watch_change_mean', 'hours_watch_change_std',
       'num_months_top_200', 'in_prediction_set']

In [601]:
#new_train = data.drop(features, axis=1)
new_train = data[features]

In [602]:
new_train['year-month'] = new_train['Year'].astype(str) +'-'+ new_train['Month'].astype('str')

In [617]:
X_train = new_train[(new_train['year-month'] != '2021-4') & (new_train['year-month'] != '2021-5')]
X_train = X_train.drop(['Year','year-month'],axis=1)
X_test = new_train[new_train['year-month'] == '2021-4']
X_test = X_test.drop(['Year','year-month'],axis=1)
#X_train, X_test = train_test_split(train,test_size=0.1, random_state=42)
label = 'Hours_watched'

#X_train = X_train[X_train['Game'] == 'League of Legends']
#X_test = X_test[X_test['Game'] == 'League of Legends']

In [618]:
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(X_train, label=label,task=tfdf.keras.Task.REGRESSION)
test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(X_test, label=label,task=tfdf.keras.Task.REGRESSION)
#predictions = tfdf.keras.pd_dataframe_to_tf_dataset(test,task=tfdf.keras.Task.REGRESSION)

In [619]:
# A more complex, but possibly, more accurate model.
models = {
    'rf_baselines': tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION),
    'rf_tune1': tfdf.keras.RandomForestModel(hyperparameter_template="benchmark_rank1", task = tfdf.keras.Task.REGRESSION),
    'gbt_baselines': tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION),
    'gbt_tune1': tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="benchmark_rank1",task = tfdf.keras.Task.REGRESSION)
    }

evaluation = {}
for key in models:
    print(key)
    #Evaluate: metric logsloss: BinaryCrossentropy
    models[key].compile(metrics=["mse"])
    
    #with sys_pipes():
    models[key].fit(x=train_tf)
    evaluation[key] = models[key].evaluate(test_tf, return_dict=True)

rf_baselines
rf_tune1
gbt_baselines
gbt_tune1


In [622]:
1 - 2

-1

In [621]:
18612635041792.0000 - 48017923112960.0000

-29405288071168.0

In [620]:
evaluation

{'rf_baselines': {'loss': 0.0, 'mse': 44223449530368.0},
 'rf_tune1': {'loss': 0.0, 'mse': 48017923112960.0},
 'gbt_baselines': {'loss': 0.0, 'mse': 18612635041792.0},
 'gbt_tune1': {'loss': 0.0, 'mse': 25926008045568.0}}

In [608]:
# Predictions

In [623]:
scores = models['gbt_baselines'].predict(test_tf)

In [624]:
X_test = X_test[X_test['Game'] == 'League of Legends']

In [625]:
X_test

Unnamed: 0,Game,Month,Hours_watched,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewer_ratio,Hours_watched1,Hours_watched2,hours_watch_change,hours_watch_change_mean,hours_watch_change_std,num_months_top_200,in_prediction_set
12552,League of Legends,4,156994170.0,3592720,906631,8333,260952,43.7,149786518.0,150221076.0,-434558.0,993919.746032,19485030.0,65,1


In [626]:
scores[0][0]

146824200.0

In [627]:
X_test = train[(train['Year'] == 2021) & (train['Month'] == 4)]
X_test['prediction'] = scores
X_test['prediction'] = X_test['prediction'].astype(int)
X_test = X_test.sort_values(by='prediction', ascending=False)
X_test = X_test.reset_index(drop=True)
X_test = X_test.reset_index(drop=False)
X_test['index'] = X_test['index'] + 1
X_test = X_test.rename({'index':'Rank_Prediction'}, axis=1)
X_test = X_test.sort_values(by='Rank', ascending=True)

In [628]:
X_test.head(20)

Unnamed: 0,Rank_Prediction,Rank,Game,Month,Year,Hours_watched,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewer_ratio,prediction
2,3,1,Just Chatting,4,2021,289547911,3544601,937749,8026,474257,81.69,146824192
7,8,2,Grand Theft Auto V,4,2021,239438984,2508876,1105818,7043,211141,95.44,67416424
9,10,3,League of Legends,4,2021,156994170,3592720,906631,8333,260952,43.7,45842492
16,17,4,Call of Duty: Warzone,4,2021,100571607,3596411,1629284,22437,218805,27.96,22320770
5,6,5,Fortnite,4,2021,94112337,5810309,498791,17790,588089,16.2,91680224
11,12,6,VALORANT,4,2021,85635830,3435109,440084,8565,306540,24.93,33994556
38,39,7,Minecraft,4,2021,79728626,3112654,749947,8863,405918,25.61,7806588
72,73,8,Counter-Strike: Global Offensive,4,2021,70072819,1464933,421485,4173,158206,47.83,3424150
69,70,9,Apex Legends,4,2021,51425055,3688671,171721,8110,297846,13.94,3481732
51,52,10,Dota 2,4,2021,49785357,666178,372593,1658,32754,74.73,4896329


In [629]:
from sklearn.metrics import accuracy_score
y_pred = X_test.Rank_Prediction.values.tolist()
y_true = X_test.Rank.values.tolist()

In [630]:
accuracy_score(y_true, y_pred)

0.03

In [464]:
submission

Unnamed: 0,Game,Month,Year,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewer_ratio
0,Life is Strange,5,2021,19085,119098,62,3240,71.19
1,Overwatch,5,2021,770340,111114,2056,77905,15.96
2,RuneScape,5,2021,48822,6467,122,3121,24.41
3,Conan Exiles,5,2021,61966,120347,405,4752,24.50
4,It Takes Two,5,2021,149157,39805,575,24734,19.58
...,...,...,...,...,...,...,...,...
195,Warframe,5,2021,140753,55140,362,11324,9.22
196,Science & Technology,5,2021,204954,29844,415,9530,15.34
197,Call of Duty: Mobile,5,2021,206379,13520,541,19575,6.55
198,"Pools, Hot Tubs, and Beaches",5,2021,44701,59425,268,8097,105.36


In [None]:
submit = submission[['id','price']]

In [None]:
submit.head()

In [None]:
submit.to_csv('submission.csv', index=False)

In [95]:
games = scoring.Game.unique()

In [92]:
scoring = pd.read_csv('data/scoring.csv')

In [100]:
predictions = df[(df['Month']==4) & (df['Year']==2021)]
predictions = predictions[['Game','Rank','Hours_watched']]

In [101]:
predictions[predictions['Game'].isin(games)]

Unnamed: 0,Game,Rank,Hours_watched
12550,Just Chatting,1.0,289547911.0
12551,Grand Theft Auto V,2.0,239438984.0
12552,League of Legends,3.0,156994170.0
12553,Call of Duty: Warzone,4.0,100571607.0
12554,Fortnite,5.0,94112337.0
...,...,...,...
12737,Lineage 2,188.0,850897.0
12739,Sekiro: Shadows Die Twice,190.0,841198.0
12744,Terraria,195.0,796745.0
12747,Epic Seven,198.0,778750.0


In [93]:
scoring

Unnamed: 0,Rank,Game,Month,Year,Hours_watched,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewers,Avg_channels,Avg_viewer_ratio
0,1,Just Chatting,5,2021,285948196,3522387 hours,983678,7465,462447,384856,4740,81.18
1,2,Grand Theft Auto V,5,2021,253419456,2510018 hours,1140656,6978,197818,341075,3378,100.96
2,3,League of Legends,5,2021,174012672,3516137 hours,1257320,7697,251261,234202,4732,49.49
3,4,VALORANT,5,2021,100244546,3307836 hours,1010321,8041,288329,134918,4452,30.31
4,5,Call of Duty: Warzone,5,2021,96069742,3482496 hours,406728,9133,204332,129299,4687,27.59
...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,Detroit: Become Human,5,2021,831018,32823 hours,56330,112,5188,1118,44,25.32
196,197,The Elder Scrolls V: Skyrim,5,2021,828381,85645 hours,6923,210,9446,1114,115,9.67
197,198,FIFA Online 4,5,2021,814729,12079 hours,61452,54,1234,1096,16,67.45
198,199,Rogue Company,5,2021,813253,142933 hours,24500,414,16791,1094,192,5.69
