In [3]:
import pandas as pd

#Load files
content = pd.read_csv('Content.csv')
cities = pd.read_csv('Cities.csv')
date = pd.read_csv('Date.csv')
source = pd.read_csv('Traffic source.csv')

In [4]:
#Remove first row (totals)
content = content.iloc[1:]
cities = cities.iloc[1:]
date = date.iloc[1:]
source = source.iloc[1:]

In [5]:
# Datetime
content['Video publish time'] = pd.to_datetime(content['Video publish time'])
content['Duration'] = pd.to_timedelta(content['Duration'], unit='s')
content['Watch time (hours)'] = pd.to_timedelta(content['Watch time (hours)'], unit='h')
content['Average view duration'] = pd.to_timedelta(content['Average view duration'])
content['Impressions click-through rate (%)'] = content['Impressions click-through rate (%)'] / 100
content.rename(columns={'Impressions click-through rate (%)': 'Impressions CTR'}, inplace=True)

In [6]:
date['Date'] = pd.to_datetime(date['Date'])
date['Watch time (hours)'] = pd.to_timedelta(date['Watch time (hours)'], unit='h')
date['Average view duration'] = pd.to_timedelta(date['Average view duration'])

cities['Watch time (hours)'] = pd.to_timedelta(cities['Watch time (hours)'], unit='h')
cities['Average view duration'] = pd.to_timedelta(cities['Average view duration'])

source['Watch time (hours)'] = pd.to_timedelta(source['Watch time (hours)'], unit='h')
source['Average view duration'] = pd.to_timedelta(source['Average view duration'])

In [7]:
print(content.head())

       Content                                        Video title  \
1  9rU6Jf1EpLg  I LOVE the new Jinx design #shorts #arcane #le...   
2  RhvaJIXedtY  Season 2 Vi is 🥵🥵🥵🥵 #shorts  #arcane #leagueof...   
3  B0MKg88uE1g  Maddie makes me 😡😡🤬😠 #arcane #leagueoflegends ...   
4  Mk0-nA1Vad8  The most UNDERRATED enforcer #arcane #leagueof...   
5  f40T4AbyJiI  Arcane's time travel explained! #arcane #leagu...   

  Video publish time        Duration      Views          Watch time (hours)  \
1         2024-09-09 0 days 00:00:31  1348535.0 419 days 22:19:58.439999996   
2         2024-09-22 0 days 00:00:32   735917.0    216 days 00:57:31.320000   
3         2024-11-27 0 days 00:00:23   638341.0    189 days 13:29:33.360000   
4         2024-11-28 0 days 00:00:29   587002.0    200 days 23:41:38.400000   
5         2024-11-25 0 days 00:00:26   546862.0    216 days 13:29:51.720000   

   Subscribers Average view duration  Impressions  Impressions CTR  
1        604.0       0 days 00:00:26     

# Statistical Testing

**Anova Test on Importance of Publishing Date and Runtime**

In [8]:
# Convert 'Duration' to seconds if it's in a timedelta format
content['Duration_sec'] = content['Duration'].dt.total_seconds()

# Define bins for runtime (e.g., Short: <30s, Medium: 30-60s, Long: >60s)
bins_runtime = [0, 30, 60, float('inf')]
labels_runtime = ['Short', 'Medium', 'Long']
content['Runtime_Category'] = pd.cut(content['Duration_sec'], bins=bins_runtime, labels=labels_runtime)

# Extract day of the week from publish time (0 = Monday, 6 = Sunday)
content['Publish_Day'] = content['Video publish time'].dt.day_name()

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Drop rows with missing values in relevant columns
anova_data = content.dropna(subset=['Views', 'Runtime_Category', 'Publish_Day'])

# Run two-way anova on Runtime Category and Publish Day
model = ols('Views ~ C(Runtime_Category) + C(Publish_Day) + C(Runtime_Category):C(Publish_Day)', data=anova_data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

                                          sum_sq     df          F  \
C(Runtime_Category)                 6.139887e+11    2.0  21.666612   
C(Publish_Day)                      1.162959e+11    6.0   1.367962   
C(Runtime_Category):C(Publish_Day)  3.411711e+11   12.0   2.006558   
Residual                            2.989660e+12  211.0        NaN   

                                          PR(>F)  
C(Runtime_Category)                 2.765491e-09  
C(Publish_Day)                      2.288752e-01  
C(Runtime_Category):C(Publish_Day)  2.502329e-02  
Residual                                     NaN  


- According to their respective p-values, runtime had a statistically significant affect on view-count, whereas the publishing date did not

**Z-Test on Importance of Runtime by Category**

In [None]:
from scipy.stats import zscore

# Drop missing values
normalized_data = content.dropna(subset=['Views', 'Runtime_Category'])

# Normalize the views using z-score
normalized_data['Views_Z'] = zscore(normalized_data['Views'])

# Group by runtime category and compute the average z-score
avg_zscores = normalized_data.groupby('Runtime_Category')['Views_Z'].mean().sort_values(ascending=False)

print("Average Z-Score of Views per Runtime Category:")
print(avg_zscores)

Average Z-Score of Views per Runtime Category:
Runtime_Category
Medium    0.525609
Short     0.437280
Long     -0.344424
Name: Views_Z, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normalized_data['Views_Z'] = zscore(normalized_data['Views'])
  avg_zscores = normalized_data.groupby('Runtime_Category')['Views_Z'].mean().sort_values(ascending=False)


- Further analysis of runtime has shown that that a medium runtime of 30-60 seconds had the most postiive affect on view-count

**Tests on Correlation between City and Views**

In [24]:
cities_clean = cities.dropna(subset=['City name', 'Views', 'Average view duration']).copy()
cities_clean['Views'] = pd.to_numeric(cities_clean['Views'], errors='coerce')

# Normalize using z-score
cities_clean['Views_Z'] = zscore(cities_clean['Views'])

# Bin into 4 quartile categories based on z-score ranks
cities_clean['Views_Quartile'] = pd.qcut(cities_clean['Views_Z'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])  # Q1 = lowest views

In [25]:
from scipy.stats import chi2_contingency

# Contingency table
views_table = pd.crosstab(cities_clean['City name'], cities_clean['Views_Quartile'])

# Chi-square test
chi2_v, p_v, dof_v, _ = chi2_contingency(views_table)
print(f"Views Quartile vs City: χ² = {chi2_v:.4f}, p = {p_v:.8g}")

Views Quartile vs City: χ² = 1500.0000, p = 0.47330172


- The resulting p value shows that city has no statistically significant affect on view-count, and we don't necessarily have to appeal to any geographic location in particular

**Exploring the effects of hashtags**

In [None]:
import re
content['Hashtags_List'] = content['Video title'].str.findall(r'#\w+')

In [29]:
# Extract hashtags
content['Hashtags_List'] = content['Video title'].str.findall(r'#\w+')

# Explode to get one row per hashtag
hashtag_df = content.explode('Hashtags_List').dropna(subset=['Hashtags_List', 'Views'])

# Clean
hashtag_df['Views'] = pd.to_numeric(hashtag_df['Views'], errors='coerce')

In [None]:
# Count appearances and average views
hashtag_stats = hashtag_df.groupby('Hashtags_List').agg(
    count=('Views', 'count'),
    avg_views=('Views', 'mean'),
    std_views=('Views', 'std')
)

In [None]:
import numpy as np

# Normalize data to account for outliers
hashtag_stats['normalized_score'] = hashtag_stats['avg_views'] / np.log1p(hashtag_stats['count'])

In [31]:
# Sort by normalized impact
top_tags = hashtag_stats.sort_values('normalized_score', ascending=False)
print(top_tags.head(10))

                  count      avg_views      std_views  normalized_score
Hashtags_List                                                          
#leagueoflegends     57  142112.421053  230355.702197      34999.240399
#shorts              74  140910.905405  205691.791390      32637.242234
#arctober            28   88179.678571  100329.859598      26187.089886
#arcane              96  118157.343750  188802.478099      25828.373487
#arcanefics           2   10195.000000    5730.393355       9279.888915
#riotgames            2   10195.000000    5730.393355       9279.888915


- Even after normalizaiton, the most affective hashtag at garnering views was #leagueoflegends
- Shorts should thereby have this hashtag by default whenever possible