## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from matplotlib import pyplot as plt
import seaborn as sb
import numpy as np
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

import datetime as dt
from datetime import datetime

In [2]:
file_path = 'data/english_video_data.csv'

In [3]:
videos_df = pd.read_csv(file_path)

videos_df.head()

Unnamed: 0,c_channel_id,v_video_id,v_date_publishedAt,v_time_publishedAt,c_channel_title,c_channel_upload_playlist,c_channel_subCount,c_channel_videoCount,c_channel_viewCount,v_category_id,...,v_comment_count,v_likes,v_favorites,v_views,v_definition,v_duration,v_caption,v_age_restricted,v_privacy_status,v_made_for_kids
0,UC295-Dw_tDNtZXFeAPAW6Aw,oGga7hZypCs,2024-09-02,11:00:48,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,28,442,0,76602,hd,2:03:35,False,Not Restricted,public,False
1,UC295-Dw_tDNtZXFeAPAW6Aw,pkzIiXkRxB0,2024-08-30,11:00:54,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,21,1165,0,341133,hd,2:01:48,False,Not Restricted,public,False
2,UC295-Dw_tDNtZXFeAPAW6Aw,TUS8vZaHNds,2024-08-31,11:00:26,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,43,1374,0,222425,hd,1:00:04,False,Not Restricted,public,False
3,UC295-Dw_tDNtZXFeAPAW6Aw,rBzKTkUG3Wk,2024-08-31,13:00:57,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,19,274,0,40821,hd,1:01:09,False,Not Restricted,public,False
4,UC295-Dw_tDNtZXFeAPAW6Aw,XHQP6mut8YI,2024-09-01,11:00:36,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,27,485,0,82374,hd,0:20:16,False,Not Restricted,public,False


In [4]:
# Determine the number of unique values in each column.
videos_df.nunique()

c_channel_id                   400
v_video_id                   22516
v_date_publishedAt             366
v_time_publishedAt           12795
c_channel_title                400
c_channel_upload_playlist      400
c_channel_subCount             350
c_channel_videoCount           318
c_channel_viewCount            400
v_category_id                   15
v_tags                        8717
v_title                      21557
v_description                13500
v_comment_count                593
v_likes                       3268
v_favorites                      1
v_views                      13374
v_definition                     2
v_duration                    3020
v_caption                        2
v_age_restricted                 2
v_privacy_status                 1
v_made_for_kids                  2
dtype: int64

In [5]:
videos_df.dtypes

c_channel_id                 object
v_video_id                   object
v_date_publishedAt           object
v_time_publishedAt           object
c_channel_title              object
c_channel_upload_playlist    object
c_channel_subCount            int64
c_channel_videoCount          int64
c_channel_viewCount           int64
v_category_id                 int64
v_tags                       object
v_title                      object
v_description                object
v_comment_count               int64
v_likes                       int64
v_favorites                   int64
v_views                       int64
v_definition                 object
v_duration                   object
v_caption                      bool
v_age_restricted             object
v_privacy_status             object
v_made_for_kids                bool
dtype: object

In [6]:
# Drop v_favorites and v_privacy_status since they only have one value
videos_df_2 = videos_df.drop(['v_favorites', 'v_privacy_status','c_channel_id', 'v_video_id', 'c_channel_upload_playlist'], axis=1)

In [7]:
# converting date and time published to datetime type
videos_df_2['v_date_publishedAt'] = videos_df_2['v_date_publishedAt'].astype('datetime64[s]')
videos_df_2['v_time_publishedAt'] = videos_df_2['v_time_publishedAt'].astype('datetime64[s]')

In [8]:
# splitting the date and time published into separate columns
videos_df_2['v_year'] = videos_df_2['v_date_publishedAt'].dt.year
videos_df_2['v_mont'] = videos_df_2['v_date_publishedAt'].dt.month
videos_df_2['v_day'] = videos_df_2['v_date_publishedAt'].dt.day
videos_df_2['v_hour'] = videos_df_2['v_time_publishedAt'].dt.hour
videos_df_2['v_minute'] = videos_df_2['v_time_publishedAt'].dt.minute
videos_df_2['v_second'] = videos_df_2['v_time_publishedAt'].dt.second

In [9]:
videos_df_2.drop(['v_date_publishedAt', 'v_time_publishedAt'], axis=1, inplace=True)
videos_df_2.dtypes

c_channel_title         object
c_channel_subCount       int64
c_channel_videoCount     int64
c_channel_viewCount      int64
v_category_id            int64
v_tags                  object
v_title                 object
v_description           object
v_comment_count          int64
v_likes                  int64
v_views                  int64
v_definition            object
v_duration              object
v_caption                 bool
v_age_restricted        object
v_made_for_kids           bool
v_year                   int32
v_mont                   int32
v_day                    int32
v_hour                   int32
v_minute                 int32
v_second                 int32
dtype: object

In [10]:
# Transforming durationg to a datetime object
videos_df_2['v_duration_time'] = pd.to_timedelta(videos_df_2['v_duration'])

In [11]:
# converting the duration into total seconds
videos_df_2['v_duration_time'] = videos_df_2['v_duration_time'].dt.total_seconds()

# drop the old duration column
videos_df_2.drop(['v_duration'], axis=1, inplace=True)

In [12]:
videos_df_2.dtypes

c_channel_title          object
c_channel_subCount        int64
c_channel_videoCount      int64
c_channel_viewCount       int64
v_category_id             int64
v_tags                   object
v_title                  object
v_description            object
v_comment_count           int64
v_likes                   int64
v_views                   int64
v_definition             object
v_caption                  bool
v_age_restricted         object
v_made_for_kids            bool
v_year                    int32
v_mont                    int32
v_day                     int32
v_hour                    int32
v_minute                  int32
v_second                  int32
v_duration_time         float64
dtype: object

In [13]:
# Cleaning v_category_id, grouping the less used ones into 'other' category
videos_df_cats = videos_df_2.copy()
videos_df_cats.head(5)

Unnamed: 0,c_channel_title,c_channel_subCount,c_channel_videoCount,c_channel_viewCount,v_category_id,v_tags,v_title,v_description,v_comment_count,v_likes,...,v_caption,v_age_restricted,v_made_for_kids,v_year,v_mont,v_day,v_hour,v_minute,v_second,v_duration_time
0,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",BRILLIANT LIFE HACKS 🌟💙 COOL DOLL REUSE,Discover brilliant life hacks and creative way...,28,442,...,False,Not Restricted,False,2024,9,2,11,0,48,7415.0
1,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",SECRET SMART COOKING HACKS THAT CHANGE EVERYTH...,🍳 Ready to elevate your cooking game? In this ...,21,1165,...,False,Not Restricted,False,2024,8,30,11,0,54,7308.0
2,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",INCREDIBLE RAINBOW IDEAS 🌈 Creative Room Makeo...,🎨 Ready to transform your space into something...,43,1374,...,False,Not Restricted,False,2024,8,31,11,0,26,3604.0
3,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",BOOST YOUR GPA WITH THESE EPIC SCHOOL HACKS & ...,Ready to ace school and crush math? Check out ...,19,274,...,False,Not Restricted,False,2024,8,31,13,0,57,3669.0
4,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",FROM DULL TO STUNNING CLOTHES | MUST-SEE CLOTH...,👗 Ready to transform your wardrobe? In this vi...,27,485,...,False,Not Restricted,False,2024,9,1,11,0,36,1216.0


In [14]:
videos_df_cats['v_category_id'].value_counts()

v_category_id
26    14635
22     2299
24     1502
2      1370
27      996
28      931
1       290
20      213
19       78
29       64
17       61
25       58
15       11
10        5
23        3
Name: count, dtype: int64

In [15]:
categories = {
    1: "Film & Animation",
    2: "Autos & Vehicles",
    10: "Music",
    15: "Pets & Animals",
    17: "Sports",
    18: "Short Movies",
    19: "Travel & Events",
    20: "Gaming",
    21: "Videoblogging",
    22: "People & Blogs",
    23: "Comedy",
    24: "Entertainment",
    25: "News & Politics",
    26: "Howto & Style",
    27: "Education",
    28: "Science & Technology",
    29: "Nonprofits & Activism",
    30: "Movies",
    31: "Anime/Animation",
    32: "Action/Adventure",
    33: "Classics",
    34: "Comedy",
    35: "Documentary",
    36: "Drama",
    37: "Family",
    38: "Foreign",
    39: "Horror",
    40: "Sci-Fi/Fantasy",
    41: "Thriller",
    42: "Shorts",
    43: "Shows",
    44: "Trailers"
}

In [16]:
def name_categories(df, category_dict):
    # Iterate through each row in the dataframe
    for index, row in df.iterrows():
        # Check if the category ID is in the list of "other" categories
        if row['v_category_id'] in [23, 10, 17, 15, 19]:
            df.at[index, 'v_category_id'] = "other"
        elif row['v_category_id'] in category_dict:
            # Map the category ID to its name from the dictionary
            df.at[index, 'v_category_id'] = category_dict[row['v_category_id']]

    return df


In [17]:
# here now we should turn the categories into discrete columns using onehotencoder or get dummies
videos_df_cats = name_categories(videos_df_cats, categories)
videos_df_cats.head()

Unnamed: 0,c_channel_title,c_channel_subCount,c_channel_videoCount,c_channel_viewCount,v_category_id,v_tags,v_title,v_description,v_comment_count,v_likes,...,v_caption,v_age_restricted,v_made_for_kids,v_year,v_mont,v_day,v_hour,v_minute,v_second,v_duration_time
0,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",BRILLIANT LIFE HACKS 🌟💙 COOL DOLL REUSE,Discover brilliant life hacks and creative way...,28,442,...,False,Not Restricted,False,2024,9,2,11,0,48,7415.0
1,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",SECRET SMART COOKING HACKS THAT CHANGE EVERYTH...,🍳 Ready to elevate your cooking game? In this ...,21,1165,...,False,Not Restricted,False,2024,8,30,11,0,54,7308.0
2,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",INCREDIBLE RAINBOW IDEAS 🌈 Creative Room Makeo...,🎨 Ready to transform your space into something...,43,1374,...,False,Not Restricted,False,2024,8,31,11,0,26,3604.0
3,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",BOOST YOUR GPA WITH THESE EPIC SCHOOL HACKS & ...,Ready to ace school and crush math? Check out ...,19,274,...,False,Not Restricted,False,2024,8,31,13,0,57,3669.0
4,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",FROM DULL TO STUNNING CLOTHES | MUST-SEE CLOTH...,👗 Ready to transform your wardrobe? In this vi...,27,485,...,False,Not Restricted,False,2024,9,1,11,0,36,1216.0


In [18]:
videos_df_cats.dtypes

c_channel_title          object
c_channel_subCount        int64
c_channel_videoCount      int64
c_channel_viewCount       int64
v_category_id            object
v_tags                   object
v_title                  object
v_description            object
v_comment_count           int64
v_likes                   int64
v_views                   int64
v_definition             object
v_caption                  bool
v_age_restricted         object
v_made_for_kids            bool
v_year                    int32
v_mont                    int32
v_day                     int32
v_hour                    int32
v_minute                  int32
v_second                  int32
v_duration_time         float64
dtype: object

In [19]:
def expand_tags(df, column_name):
    # Flatten the list of tags from all rows
    all_tags = set(tag for tags in df[column_name] for tag in tags)
    
    # Create new binary columns for each unique tag
    for tag in all_tags:
        col_name = f'v_tag_{tag}'
        df[col_name] = df[column_name].apply(lambda tags: 1 if tag in tags else 0)
    
    return df

In [20]:
video_df_tags = expand_tags(videos_df_cats,'v_tags')
video_df_tags.head()

Unnamed: 0,c_channel_title,c_channel_subCount,c_channel_videoCount,c_channel_viewCount,v_category_id,v_tags,v_title,v_description,v_comment_count,v_likes,...,v_tag_ไ,v_tag_っ,v_tag_師,v_tag_版,v_tag_老,v_tag_ê,v_tag_魚,v_tag_勁,v_tag_宅,v_tag_¸
0,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",BRILLIANT LIFE HACKS 🌟💙 COOL DOLL REUSE,Discover brilliant life hacks and creative way...,28,442,...,0,0,0,0,0,0,0,0,0,0
1,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",SECRET SMART COOKING HACKS THAT CHANGE EVERYTH...,🍳 Ready to elevate your cooking game? In this ...,21,1165,...,0,0,0,0,0,0,0,0,0,0
2,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",INCREDIBLE RAINBOW IDEAS 🌈 Creative Room Makeo...,🎨 Ready to transform your space into something...,43,1374,...,0,0,0,0,0,0,0,0,0,0
3,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",BOOST YOUR GPA WITH THESE EPIC SCHOOL HACKS & ...,Ready to ace school and crush math? Check out ...,19,274,...,0,0,0,0,0,0,0,0,0,0
4,5-Minute Crafts,81100000,7445,27960945663,Howto & Style,"['5 minute craft', '5 minutes craft', '5-minut...",FROM DULL TO STUNNING CLOTHES | MUST-SEE CLOTH...,👗 Ready to transform your wardrobe? In this vi...,27,485,...,0,0,0,0,0,0,0,0,0,0


In [21]:
def bucket_views(df, column_name):
    bins = [-1, 1000, 10000, 100000, 1000000, float("inf")]
    labels = ["Very Low (0-1,000)", "Low (1,000-10,000)", "Medium (10,000-100,000)", "High (100,000-1,000,000)", "Very High (1,000,000+)"]
    
    df["view_bucket"] = pd.cut(df[column_name], bins=bins, labels=labels, right=True)
    
    return df

In [22]:
bucket_video_df = bucket_views(video_df_tags, 'v_views')
bucket_video_df['view_bucket'].value_counts()

view_bucket
Low (1,000-10,000)          10159
Medium (10,000-100,000)      5585
Very Low (0-1,000)           5072
High (100,000-1,000,000)     1449
Very High (1,000,000+)        251
Name: count, dtype: int64

In [23]:
# randomize the order of the dataframe
bucket_video_df = bucket_video_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [24]:
videos_df_dummies = pd.get_dummies(data= video_df_tags, columns=['v_category_id','v_definition', 'v_caption', 'v_age_restricted', 'v_made_for_kids'],dtype=int)

In [25]:
encoder = LabelEncoder()
videos_df_dummies["view_bucket_encoded"] = encoder.fit_transform(videos_df_dummies["view_bucket"])

In [26]:
videos_df_dummies.drop(columns=['v_tags','v_title','v_description','c_channel_title','view_bucket','v_views'], axis=1, inplace =True)

In [27]:
# Split our preprocessed data into our features and target arrays
y = videos_df_dummies['view_bucket_encoded'].values
X = videos_df_dummies.drop(columns=['view_bucket_encoded']).values

In [28]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y) #train_test_split(X, y,random_state=42)

In [29]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 16
hidden_nodes_layer2 = 8
hidden_nodes_layer3 = 4
hidden_nodes_layer4 = 1

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu", input_dim=number_input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [31]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [32]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.4432 - loss: -85.8104
Epoch 2/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4477 - loss: -6145.7617
Epoch 3/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4496 - loss: -61685.7695
Epoch 4/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4505 - loss: -267313.5000
Epoch 5/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4539 - loss: -728476.7500
Epoch 6/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4445 - loss: -1576395.5000
Epoch 7/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4474 - loss: -2871861.5000
Epoch 8/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4531 - loss: -4

In [33]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

141/141 - 0s - 3ms/step - accuracy: 0.4512 - loss: -2.7911e+10
Loss: -27911041024.0, Accuracy: 0.45115453004837036


In [34]:
# Export our model to HDF5 file
nn.save('model/video_model.h5')

