In [1]:
import pandas as pd
import datetime

# Converting Daily Timeseries to a Weekly Timeseries

In [2]:
spy = pd.read_csv("outputs/SPY.csv")
spy = spy[["Date", "Close"]].rename(columns = {"Date": "day"})
spy["day"] = spy["day"].map(lambda x: pd.to_datetime(x))
spy.head()

Unnamed: 0,day,Close
0,2017-01-26,229.330002
1,2017-01-27,228.970001
2,2017-01-30,227.550003
3,2017-01-31,227.529999
4,2017-02-01,227.619995


## Creating match between days and weeks

In [3]:
min_spy_date, max_spy_date = spy.day.min(), spy.day.max()

matchDayWeek = pd.DataFrame()
matchDayWeek["day"] = pd.date_range(start = min_spy_date, end = max_spy_date)
matchDayWeek["weekNumber"] = matchDayWeek["day"].map(lambda x: int(x.strftime("%V")))
matchDayWeek["year"] = matchDayWeek["day"].map(lambda x: int(x.strftime("%Y")))

weekId = matchDayWeek[["year", "weekNumber"]].drop_duplicates()
weekId["weekId"] = range(weekId.shape[0])

matchDayWeek = matchDayWeek.merge(weekId, on = ["weekNumber", "year"]).drop(columns = ["weekNumber", "year"])

## Assigning delayed weekId on SPY to predict from YouTube Data

In [4]:
spyLabel = spy.merge(matchDayWeek, on = ["day"]).groupby("weekId").agg({"Close": ["first", "last"]}).reset_index()
spyLabel.columns = [c.replace("_", "") if c.startswith("_") else c for c in ["_".join(mc[::-1]) for mc in spyLabel.columns]]
spyLabel["ratioClose"] = 100 * (spyLabel["last_Close"] / spyLabel["first_Close"] - 1)
spyLabel["weekId"] = spyLabel["weekId"] - 1
spyLabel = spyLabel[["weekId", "ratioClose"]]
spyLabel.tail()

Unnamed: 0,weekId,ratioClose
283,284,-1.891455
284,285,1.756734
285,286,0.234233
286,287,3.440237
287,288,-1.183101


In [5]:
spyLabel.to_csv("outputs/spy_label.csv", index = False)

# Numeric Cols FE for YouTube Data

In [6]:
videos_df = pd.read_csv("outputs/videoInfo.csv")
videos_df["shorts"] = videos_df["title"].map(lambda x: 1 if "#shorts" in x else 0)

In [7]:
numeric_selected_vars = [
    "channelTitle",
    "duration",
    "viewCount",
    "likeCount",
    "commentCount",
    "publishedAtDay",
    "publishedAtHour"
    ]

numeric_df = videos_df[numeric_selected_vars].copy()
numeric_df["percInteraction"] = numeric_df["commentCount"]/numeric_df["viewCount"]
numeric_df["percLike"] = numeric_df["likeCount"]/numeric_df["viewCount"]
numeric_df["ratioLikeComment"] = numeric_df["likeCount"]/numeric_df["commentCount"]

numeric_df.head()

Unnamed: 0,channelTitle,duration,viewCount,likeCount,commentCount,publishedAtDay,publishedAtHour,percInteraction,percLike,ratioLikeComment
0,Andrei Jikh,41.0,14122.0,455.0,68.0,2022-07-27,22,0.004815,0.032219,6.691176
1,Andrei Jikh,40.0,21164.0,821.0,47.0,2022-07-26,22,0.002221,0.038792,17.468085
2,Andrei Jikh,711.0,325114.0,16684.0,2682.0,2022-07-25,22,0.008249,0.051317,6.220731
3,Andrei Jikh,51.0,48931.0,1619.0,74.0,2022-07-24,22,0.001512,0.033087,21.878378
4,Andrei Jikh,38.0,44342.0,1071.0,105.0,2022-07-23,22,0.002368,0.024153,10.2


In [8]:
minCommonDate = numeric_df.groupby("channelTitle").agg({"publishedAtDay": "min"}).publishedAtDay.max()
maxCommonDate = numeric_df.groupby("channelTitle").agg({"publishedAtDay": "max"}).publishedAtDay.max()

raw_df = numeric_df[numeric_df.publishedAtDay >= minCommonDate].drop(columns = ["channelTitle"])
raw_df["day"] = raw_df["publishedAtDay"].map(lambda x: pd.to_datetime(x))

raw_df = raw_df.merge(matchDayWeek, on = ["day"]).drop(columns = ["publishedAtDay"])

In [9]:
numeric_cols_to_agg = [c for c in raw_df.columns if c not in ["day", "weekId"]]
dict_numeric_cols_to_agg = {c: ["max", "min", "mean", "sum", "std"] for c in numeric_cols_to_agg}

In [10]:
fe = raw_df.groupby("weekId").agg(dict_numeric_cols_to_agg).reset_index()
fe.columns = [c.replace("_", "") if c.startswith("_") else c for c in ["_".join(mc[::-1]) for mc in fe.columns]]

In [11]:
fe.head()

Unnamed: 0,weekId,max_duration,min_duration,mean_duration,sum_duration,std_duration,max_viewCount,min_viewCount,mean_viewCount,sum_viewCount,...,max_percLike,min_percLike,mean_percLike,sum_percLike,std_percLike,max_ratioLikeComment,min_ratioLikeComment,mean_ratioLikeComment,sum_ratioLikeComment,std_ratioLikeComment
0,101,996.0,959.0,977.5,1955.0,26.162951,726428.0,165445.0,445936.5,891873.0,...,0.096632,0.057735,0.077183,0.154367,0.027504,21.552349,6.596685,14.074517,28.149034,10.575251
1,102,862.0,618.0,753.666667,2261.0,124.275232,258275.0,72186.0,136294.0,408882.0,...,0.061713,0.040538,0.051326,0.153978,0.010594,11.115385,2.765926,8.068166,24.204497,4.608927
2,103,1660.0,417.0,1035.5,4142.0,579.811751,1101650.0,21376.0,406366.25,1625465.0,...,0.068806,0.039016,0.049685,0.198739,0.013436,27.457688,5.413547,12.363831,49.455324,10.164699
3,104,979.0,279.0,706.25,2825.0,300.094846,1339138.0,28117.0,374999.25,1499997.0,...,0.077672,0.037486,0.058391,0.233563,0.019818,13.068862,2.966497,7.673158,30.692632,5.341116
4,105,1954.0,390.0,997.5,3990.0,669.611081,368889.0,32923.0,208712.0,834848.0,...,0.081081,0.0349,0.055441,0.221763,0.019091,18.799497,5.075536,10.544267,42.177069,6.164214


In [12]:
fe.to_csv("outputs/fe.csv", index = False)