In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from scipy.sparse import hstack

file_path = 'data/english_video_data.csv'
videos_df = pd.read_csv(file_path)
videos_df.head()

Unnamed: 0,c_channel_id,v_video_id,v_date_publishedAt,v_time_publishedAt,c_channel_title,c_channel_upload_playlist,c_channel_subCount,c_channel_videoCount,c_channel_viewCount,v_category_id,...,v_comment_count,v_likes,v_favorites,v_views,v_definition,v_duration,v_caption,v_age_restricted,v_privacy_status,v_made_for_kids
0,UC295-Dw_tDNtZXFeAPAW6Aw,oGga7hZypCs,2024-09-02,11:00:48,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,28,442,0,76602,hd,2:03:35,False,Not Restricted,public,False
1,UC295-Dw_tDNtZXFeAPAW6Aw,pkzIiXkRxB0,2024-08-30,11:00:54,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,21,1165,0,341133,hd,2:01:48,False,Not Restricted,public,False
2,UC295-Dw_tDNtZXFeAPAW6Aw,TUS8vZaHNds,2024-08-31,11:00:26,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,43,1374,0,222425,hd,1:00:04,False,Not Restricted,public,False
3,UC295-Dw_tDNtZXFeAPAW6Aw,rBzKTkUG3Wk,2024-08-31,13:00:57,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,19,274,0,40821,hd,1:01:09,False,Not Restricted,public,False
4,UC295-Dw_tDNtZXFeAPAW6Aw,XHQP6mut8YI,2024-09-01,11:00:36,5-Minute Crafts,UU295-Dw_tDNtZXFeAPAW6Aw,81100000,7445,27960945663,26,...,27,485,0,82374,hd,0:20:16,False,Not Restricted,public,False


In [2]:
videos_df['v_description'] = videos_df['v_description'].fillna("")

In [3]:
# def remove_urls(description):
#     clean_text = re.sub(r"http\S+|www\S+|https\S+", "", description, flags=re.MULTILINE)
#     clean_text = re.sub(rf"\b{re.escape('video')}\b", "", clean_text, flags=re.IGNORECASE)
#     clean_text = re.sub(rf"\b{re.escape('channel')}\b", "", clean_text, flags=re.IGNORECASE)
#     return clean_text

In [4]:
# vectorizer = CountVectorizer(stop_words='english', preprocessor=remove_urls, ngram_range=(1,1))

# vectors = vectorizer.fit_transform(videos_df['v_description'])
# word_counts = vectors.sum(axis=0)

# word_freq = pd.DataFrame({
#     'word': vectorizer.get_feature_names_out(),
#     'count': word_counts.A1
# })

# word_freq = word_freq.sort_values(by='count', ascending=False)

# word_freq.head(20)

In [5]:
videos_df_2 = videos_df.drop(['v_favorites','v_privacy_status','c_channel_id','v_video_id','c_channel_upload_playlist','v_description'], axis=1)

In [6]:
videos_df_2.dtypes

v_date_publishedAt      object
v_time_publishedAt      object
c_channel_title         object
c_channel_subCount       int64
c_channel_videoCount     int64
c_channel_viewCount      int64
v_category_id            int64
v_tags                  object
v_title                 object
v_comment_count          int64
v_likes                  int64
v_views                  int64
v_definition            object
v_duration              object
v_caption                 bool
v_age_restricted        object
v_made_for_kids           bool
dtype: object

In [7]:
videos_df_2['v_date_publishedAt'] = videos_df_2['v_date_publishedAt'].astype('datetime64[s]')
videos_df_2['v_time_publishedAt'] = videos_df_2['v_time_publishedAt'].astype('datetime64[s]')

videos_df_2['v_year'] = videos_df_2['v_date_publishedAt'].dt.year
videos_df_2['v_mont'] = videos_df_2['v_date_publishedAt'].dt.month
videos_df_2['v_day'] = videos_df_2['v_date_publishedAt'].dt.day
videos_df_2['v_hour'] = videos_df_2['v_time_publishedAt'].dt.hour
videos_df_2['v_minute'] = videos_df_2['v_time_publishedAt'].dt.minute
videos_df_2['v_second'] = videos_df_2['v_time_publishedAt'].dt.second

videos_df_2.drop(['v_date_publishedAt', 'v_time_publishedAt'], axis=1, inplace=True)

videos_df_2['v_duration_time'] = pd.to_timedelta(videos_df_2['v_duration'])

videos_df_2['v_duration_time'] = videos_df_2['v_duration_time'].dt.total_seconds()

videos_df_2.drop(['v_duration'], axis=1, inplace=True)

In [8]:
videos_df_2.dtypes

c_channel_title          object
c_channel_subCount        int64
c_channel_videoCount      int64
c_channel_viewCount       int64
v_category_id             int64
v_tags                   object
v_title                  object
v_comment_count           int64
v_likes                   int64
v_views                   int64
v_definition             object
v_caption                  bool
v_age_restricted         object
v_made_for_kids            bool
v_year                    int32
v_mont                    int32
v_day                     int32
v_hour                    int32
v_minute                  int32
v_second                  int32
v_duration_time         float64
dtype: object

In [9]:
videos_df_2

Unnamed: 0,c_channel_title,c_channel_subCount,c_channel_videoCount,c_channel_viewCount,v_category_id,v_tags,v_title,v_comment_count,v_likes,v_views,...,v_caption,v_age_restricted,v_made_for_kids,v_year,v_mont,v_day,v_hour,v_minute,v_second,v_duration_time
0,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",BRILLIANT LIFE HACKS 🌟💙 COOL DOLL REUSE,28,442,76602,...,False,Not Restricted,False,2024,9,2,11,0,48,7415.0
1,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",SECRET SMART COOKING HACKS THAT CHANGE EVERYTH...,21,1165,341133,...,False,Not Restricted,False,2024,8,30,11,0,54,7308.0
2,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",INCREDIBLE RAINBOW IDEAS 🌈 Creative Room Makeo...,43,1374,222425,...,False,Not Restricted,False,2024,8,31,11,0,26,3604.0
3,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",BOOST YOUR GPA WITH THESE EPIC SCHOOL HACKS & ...,19,274,40821,...,False,Not Restricted,False,2024,8,31,13,0,57,3669.0
4,5-Minute Crafts,81100000,7445,27960945663,26,"['5 minute craft', '5 minutes craft', '5-minut...",FROM DULL TO STUNNING CLOTHES | MUST-SEE CLOTH...,27,485,82374,...,False,Not Restricted,False,2024,9,1,11,0,36,1216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22511,Diy Craft and Art ideas 💫,12,11,1906,22,"['diy', 'YouTube shorts', 'handmade', 'painting']",Diy painting ideas #youtubeshorts #artist,0,11,413,...,False,Not Restricted,False,2024,7,18,13,54,19,32.0
22512,Diy Craft and Art ideas 💫,12,11,1906,22,"['cute painting', 'shorts', 'viral', 'top pain...",Little chettah heart 💜 #diy,0,4,12,...,False,Not Restricted,False,2024,8,12,13,59,4,15.0
22513,Diy Craft and Art ideas 💫,12,11,1906,22,"['diy', 'cute painting ideas', 'handmade', 'ca...",Cute painting 🎨 #shorts,0,4,20,...,False,Not Restricted,False,2024,8,12,4,18,37,8.0
22514,Eve diy and arts,5,2,215,22,[],Drawing Kurdistan flag #shorts #edit #viralvideo,0,7,98,...,False,Not Restricted,False,2024,12,6,15,31,20,12.0


In [10]:
categories = {
    1: "Film & Animation",
    2: "Autos & Vehicles",
    10: "Music",
    15: "Pets & Animals",
    17: "Sports",
    18: "Short Movies",
    19: "Travel & Events",
    20: "Gaming",
    21: "Videoblogging",
    22: "People & Blogs",
    23: "Comedy",
    24: "Entertainment",
    25: "News & Politics",
    26: "Howto & Style",
    27: "Education",
    28: "Science & Technology",
    29: "Nonprofits & Activism",
    30: "Movies",
    31: "Anime/Animation",
    32: "Action/Adventure",
    33: "Classics",
    34: "Comedy",
    35: "Documentary",
    36: "Drama",
    37: "Family",
    38: "Foreign",
    39: "Horror",
    40: "Sci-Fi/Fantasy",
    41: "Thriller",
    42: "Shorts",
    43: "Shows",
    44: "Trailers"
}

In [11]:
def name_categories(df, category_dict):
    for index, row in df.iterrows():
        if row['v_category_id'] in [23, 10, 17, 15, 19]:
            df.at[index, 'v_category_id'] = "other"
        elif row['v_category_id'] in category_dict:
            df.at[index, 'v_category_id'] = category_dict[row['v_category_id']]

    return df

In [12]:
videos_df_cats = name_categories(videos_df_2, categories)

  df.at[index, 'v_category_id'] = category_dict[row['v_category_id']]


In [13]:
videos_df_cats = videos_df_cats.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
videos_df_cats.columns

Index(['c_channel_title', 'c_channel_subCount', 'c_channel_videoCount',
       'c_channel_viewCount', 'v_category_id', 'v_tags', 'v_title',
       'v_comment_count', 'v_likes', 'v_views', 'v_definition', 'v_caption',
       'v_age_restricted', 'v_made_for_kids', 'v_year', 'v_mont', 'v_day',
       'v_hour', 'v_minute', 'v_second', 'v_duration_time'],
      dtype='object')

In [15]:
videos_df_dummies = pd.get_dummies(data= videos_df_cats, columns=['v_category_id','v_definition','v_caption','v_age_restricted','v_made_for_kids'],dtype=int)

In [16]:
def bucket_views(df, column_name):
    bins = [-1, 1000, 5000, 10000, 50000, 100000, 500000, 1000000, float("inf")]
    labels = [
        "Very Low (0 - 1K)",
        "Low (1K - 5K)",
        "Moderate (5K - 10K)",
        "Growing (10K - 50K)",
        "Established (50K - 100K)",
        "Popular (100K - 500K)",
        "Trending (500K - 1M)",
        "Viral (1M+)"
    ]
    df["view_bucket"] = pd.cut(df[column_name], bins=bins, labels=labels, right=True)
    
    return df

# Use the function bucket_views to bucket each video viewcount
bucket_video_df = bucket_views(videos_df_dummies, 'v_views')
bucket_video_df.drop(columns=['v_views'], axis=1, inplace =True)
bucket_video_df['view_bucket'].value_counts()

bucket_video_df = bucket_video_df.sample(frac=1, random_state=42).reset_index(drop=True)

encoder = LabelEncoder()
bucket_video_df["video_views_bucket_encoded"] = encoder.fit_transform(bucket_video_df["view_bucket"])

bucket_video_df.drop(columns=['view_bucket'], axis=1, inplace =True)

In [17]:
videos_df_dummies.columns

Index(['c_channel_title', 'c_channel_subCount', 'c_channel_videoCount',
       'c_channel_viewCount', 'v_tags', 'v_title', 'v_comment_count',
       'v_likes', 'v_year', 'v_mont', 'v_day', 'v_hour', 'v_minute',
       'v_second', 'v_duration_time', 'v_category_id_Autos & Vehicles',
       'v_category_id_Education', 'v_category_id_Entertainment',
       'v_category_id_Film & Animation', 'v_category_id_Gaming',
       'v_category_id_Howto & Style', 'v_category_id_News & Politics',
       'v_category_id_Nonprofits & Activism', 'v_category_id_People & Blogs',
       'v_category_id_Science & Technology', 'v_category_id_other',
       'v_definition_hd', 'v_definition_sd', 'v_caption_False',
       'v_caption_True', 'v_age_restricted_Not Restricted',
       'v_age_restricted_ytAgeRestricted', 'v_made_for_kids_False',
       'v_made_for_kids_True', 'view_bucket'],
      dtype='object')

In [18]:
# # used for testing with just 2 buckets for views
# def bucket_views_2(df, column_name):
#     df["view_bucket"] = pd.qcut(df[column_name], 2)
    
#     return df

# # Use the function bucket_views to bucket each video viewcount
# bucket_video_df = bucket_views_2(videos_df_dummies, 'v_views')
# bucket_video_df.drop(columns=['v_views'], axis=1, inplace =True)
# bucket_video_df['view_bucket'].value_counts()

# bucket_video_df = bucket_video_df.sample(frac=1, random_state=42).reset_index(drop=True)

# encoder = LabelEncoder()
# bucket_video_df["video_views_bucket_encoded"] = encoder.fit_transform(bucket_video_df["view_bucket"])

# bucket_video_df.drop(columns=['view_bucket'], axis=1, inplace =True)

In [19]:
bucket_video_df.dtypes

c_channel_title                         object
c_channel_subCount                       int64
c_channel_videoCount                     int64
c_channel_viewCount                      int64
v_tags                                  object
v_title                                 object
v_comment_count                          int64
v_likes                                  int64
v_year                                   int32
v_mont                                   int32
v_day                                    int32
v_hour                                   int32
v_minute                                 int32
v_second                                 int32
v_duration_time                        float64
v_category_id_Autos & Vehicles           int32
v_category_id_Education                  int32
v_category_id_Entertainment              int32
v_category_id_Film & Animation           int32
v_category_id_Gaming                     int32
v_category_id_Howto & Style              int32
v_category_id

In [20]:
text_columns = ['c_channel_title', 'v_tags', 'v_title']
numeric_columns = [col for col in bucket_video_df.columns if (col not in text_columns) and (col != 'video_views_bucket_encoded')]

print(text_columns)
print(numeric_columns)

['c_channel_title', 'v_tags', 'v_title']
['c_channel_subCount', 'c_channel_videoCount', 'c_channel_viewCount', 'v_comment_count', 'v_likes', 'v_year', 'v_mont', 'v_day', 'v_hour', 'v_minute', 'v_second', 'v_duration_time', 'v_category_id_Autos & Vehicles', 'v_category_id_Education', 'v_category_id_Entertainment', 'v_category_id_Film & Animation', 'v_category_id_Gaming', 'v_category_id_Howto & Style', 'v_category_id_News & Politics', 'v_category_id_Nonprofits & Activism', 'v_category_id_People & Blogs', 'v_category_id_Science & Technology', 'v_category_id_other', 'v_definition_hd', 'v_definition_sd', 'v_caption_False', 'v_caption_True', 'v_age_restricted_Not Restricted', 'v_age_restricted_ytAgeRestricted', 'v_made_for_kids_False', 'v_made_for_kids_True']


In [21]:
# Function to clean up lines to not include links or the word video/channel
def remove_urls(description):
    clean_text = re.sub(r"http\S+|www\S+|https\S+", "", description, flags=re.MULTILINE)
    clean_text = re.sub(rf"\b{re.escape('video')}\b", "", clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(rf"\b{re.escape('channel')}\b", "", clean_text, flags=re.IGNORECASE)
    return clean_text

In [22]:
column_transformer = ColumnTransformer([
    ('text', CountVectorizer(stop_words='english', preprocessor=remove_urls, ngram_range=(1,1)), text_columns),
    ('num', StandardScaler(), numeric_columns)
])

In [23]:
y = bucket_video_df['video_views_bucket_encoded']
X = bucket_video_df.drop(columns=['video_views_bucket_encoded'])

In [24]:
# # used for testing without buckets and just using video titles
# y = videos_df['v_views'].values
# X = videos_df['v_title'].values

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [26]:
print("X_train shape before transformation:", X_train.shape)
print("X_test shape before transformation:", X_test.shape)

X_train shape before transformation: (18012, 34)
X_test shape before transformation: (4504, 34)


In [27]:
# Accuracy on test set: 0.6574156305506217

text_transformer = CountVectorizer(stop_words='english', ngram_range=(1,1))
text_train_transformed = text_transformer.fit_transform(X_train[text_columns].apply(lambda x: ' '.join(x), axis=1))  # Convert text columns into a single string
text_test_transformed = text_transformer.transform(X_test[text_columns].apply(lambda x: ' '.join(x), axis=1))

print("Text train shape:", text_train_transformed.shape)
print("Text test shape:", text_test_transformed.shape)

X_train_combined = hstack([text_train_transformed, X_train[numeric_columns]])
X_test_combined = hstack([text_test_transformed, X_test[numeric_columns]])

print("Combined train shape:", X_train_combined.shape)
print("Combined test shape:", X_test_combined.shape)

Text train shape: (18012, 24236)
Text test shape: (4504, 24236)
Combined train shape: (18012, 24267)
Combined test shape: (4504, 24267)


In [28]:
# # Accuracy on test set: 0.6571936056838366

# text_transformer = CountVectorizer(stop_words='english', ngram_range=(1,1))
# text_train_transformed = text_transformer.fit_transform(X_train[text_columns].apply(lambda x: ' '.join(x), axis=1))  # Convert text columns into a single string
# text_test_transformed = text_transformer.transform(X_test[text_columns].apply(lambda x: ' '.join(x), axis=1))

# numeric_transformer = StandardScaler()
# numeric_train_transformed = numeric_transformer.fit_transform(X_train[numeric_columns])
# numeric_test_transformed = numeric_transformer.transform(X_test[numeric_columns])

# print("Text train shape:", text_train_transformed.shape)
# print("Text test shape:", text_test_transformed.shape)
# print("Numeric train shape:", numeric_train_transformed.shape)
# print("Numeric test shape:", numeric_test_transformed.shape)

# X_train_combined = hstack([text_train_transformed, numeric_train_transformed])
# X_test_combined = hstack([text_test_transformed, numeric_test_transformed])

# print("Combined train shape:", X_train_combined.shape)
# print("Combined test shape:", X_test_combined.shape)

In [29]:
# numeric_columns

In [30]:
# l = list(text_transformer.get_feature_names_out()) + numeric_columns
# l[-5:]

In [31]:
# X_train_vector = column_transformer.fit_transform(X_train)
# X_test_vector = column_transformer.transform(X_test)

In [32]:
# DecisionTreeClassifier, DecisionTreeRegressor
# RandomForestRegressor, RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train_combined, y_train)

In [33]:
y_pred = model.predict(X_test_combined)

In [34]:
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")

R² Score: 0.3277861639243451


In [43]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.3f}%")

Accuracy: 65.742%


In [36]:
# X_train_combined

In [37]:
# from sklearn.tree import plot_tree
# import matplotlib.pyplot as plt

# plt.figure(figsize=(30,10))

# plot_tree(model.estimators_[0], feature_names=l, fontsize=8, max_depth=6, class_names=["less","more"], filled=True)


In [38]:
# import numpy as np

# importances = model.feature_importances_

# feature_names = videos_df_dummies.drop(columns=['video_views_bucket_encoded']).columns

# indices = np.argsort(importances)[::-1]

# top_10_indices = indices[:20]
# #bottom_10_indices = indices[-10:]

# #combined_indices = np.concatenate([top_10_indices, bottom_10_indices])
# combined_indices = top_10_indices

# combined_feature_names = [feature_names[i] for i in combined_indices]
# combined_importances = importances[combined_indices]

# plt.figure(figsize=(10,6))
# plt.title("Top 20 Feature Importance")
# plt.bar(range(len(combined_importances)), combined_importances, align="center")
# plt.xticks(range(len(combined_importances)), combined_feature_names, rotation=90)
# plt.show()

In [39]:
# feature_importances = pd.DataFrame({  "Feature": l,
#                                     "Importance": model.feature_importances_})

# type(feature_importances.sort_values(by="Importance", ascending=False))
    
# pd.set_option("display.max_rows", 100)

# for x in feature_importances.sort_values(by="Importance", ascending=False).values[-100:]:
#     print(x)