### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import utils
import plotly.graph_objs as go
import dash
from dash import dcc
from dash import html
import plots
from dash.dependencies import Input, Output
from dateutil import parser
%load_ext autoreload
%autoreload 2

### Parsing the Columns

In [152]:
# Category whose per-video CSVs are processed in this run.
name = 'autos'

# Read and parse the bottom-channel videos first, then the top-channel videos.
# The order is kept so the parser's log lines stay bottom-then-top.
bottom_channels = utils.parse_cols(
    pd.read_csv(f"./{name}/bottom_{name}_vid.csv", index_col=0)
)
top_channels = utils.parse_cols(
    pd.read_csv(f"./{name}/top_{name}_vid.csv", index_col=0)
)

# Flag each frame via utils.add_pop_unpop_col: False for bottom, True for top.
bottom_channels = utils.add_pop_unpop_col(bottom_channels, False)
top_channels = utils.add_pop_unpop_col(top_channels, True)

# Run utils.remove_outliers on each group separately and save the results
# next to the inputs with a `_nout` suffix.
b_nout = utils.remove_outliers(bottom_channels)
t_nout = utils.remove_outliers(top_channels)
for frame_nout, out_path in (
    (b_nout, f"./{name}/bottom_{name}_vid_nout.csv"),
    (t_nout, f"./{name}/top_{name}_vid_nout.csv"),
):
    frame_nout.to_csv(out_path)

# Notes recorded from earlier runs, one line per category:
#   <category>: <test outcome>, <rows before outlier removal>, <rows after>
# NOTE(review): "reject" presumably refers to the hypothesis tests further down - confirm.
# Autos: 2 reject, 15220, 13595
# Shows: 1st only reject, 15147, 13296
# Travel: 2 reject, 10809, 9810
# Sports: 1st only reject, 16272, 14143
# Tech: 2 reject, 12765, 11179

Total rows before parsing: 8555
Parser dropped 0 rows during 'publishedAt' parsing
Parser dropped 2 rows during 'viewCount' parsing
Total rows after parsing: 8549
Total rows before parsing: 6672
Parser dropped 0 rows during 'publishedAt' parsing
Parser dropped 0 rows during 'viewCount' parsing
Total rows after parsing: 6671


In [None]:
# 15220+15147+10809+16272+12765 = 70213 # Before Outliers
# 13595+13296+9810+14143+11179 = 62023 # After Outliers

### Describing the Data

In [None]:
# (rows, columns) of the bottom- and top-channel frames, shown as the cell output.
tuple(frame.shape for frame in (bottom_channels, top_channels))

### Adding the popularity Column and combining the popular and unpopular dataframes

In [None]:
# Flag both frames via utils.add_pop_unpop_col (False for bottom, True for top).
# NOTE(review): the parsing cell earlier in this notebook already makes these
# two calls; confirm the helper is safe to apply twice, or drop one occurrence.
bottom_channels, top_channels = (
    utils.add_pop_unpop_col(bottom_channels, False),
    utils.add_pop_unpop_col(top_channels, True),
)



In [153]:
# Combine the top-channel and bottom-channel video frames into one frame.
# NOTE(review): the first argument was missing here (a syntax error);
# `top_channels` is assumed from the pop/unpop order in the helper's name - confirm.
comb_vids = utils.combine_pop_unpop_df(top_channels, bottom_channels)

(13595, 21)

### Removing the Outliers of the Data

In [None]:
# Run utils.remove_outliers on the combined frame, then report the shape
# before and after so the number of dropped rows is visible.
cleaned_dataframe = utils.remove_outliers(comb_vids)
shape_before, shape_after = comb_vids.shape, cleaned_dataframe.shape
print(shape_before, shape_after)

In [None]:
import os

# Append the 'definition' value counts of each channel group, for the current
# category, to a running text log.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
file_path = '/home/yuvi_dh/side_projects/Projects/smr_yt/eda_nbs/data.txt'

# (section heading written to the log, pop_unpop flag selecting that group's rows)
log_sections = (("Pop:\n", 1), ("Unpop:\n", 0))

with open(file_path, 'a') as file:
    # Header line naming the category this entry belongs to
    file.write("channels_category: " + name + "\n")
    for heading, group_flag in log_sections:
        file.write(heading)
        group_rows = cleaned_dataframe[cleaned_dataframe['pop_unpop'] == group_flag]
        file.write(str(group_rows['definition'].value_counts()) + "\n\n")


### Some Plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Exploratory plots comparing the two channel groups in the cleaned frame
# (pop_unpop == 1 is treated as "popular", pop_unpop == 0 as "unpopular").
df = cleaned_dataframe

GROUP_PALETTE = {0: 'orange', 1: 'green'}
WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# (pop_unpop flag, label used in titles, bar colour), in left-to-right panel order
GROUP_PANELS = [(1, 'Popular', 'green'), (0, 'Unpopular', 'orange')]

# 1) Scatterplots: viewCount against commentCount and likeCount
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, y_col in zip(axes, ['commentCount', 'likeCount']):
    sns.scatterplot(data=df, x='viewCount', y=y_col, hue='pop_unpop', palette=GROUP_PALETTE, ax=ax)
    ax.set_title(f'Scatterplot: viewCount vs {y_col}')

# 2) Weekday distribution, one panel per group
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (group_flag, group_label, group_colour) in zip(axes, GROUP_PANELS):
    # A single `color` replaces the former one-element `palette`: without a
    # `hue` variable a palette is not the right way to colour all bars alike.
    sns.countplot(data=df[df['pop_unpop'] == group_flag], x='publishDayName', order=WEEKDAY_ORDER, ax=ax, color=group_colour)
    ax.set_title(f'Weekday Distribution for {group_label} Channels')
    ax.tick_params(axis='x', rotation=45)  # Change rotation angle as needed

# 3) Histogram of video duration in 5-minute bins
MAX_DURATION_SECS = 10000
num_bars = math.ceil(MAX_DURATION_SECS / 300)
bar_interval = 300
bar_values = [i * bar_interval / 60 for i in range(num_bars + 1)]  # bin edges in minutes

# The bin edges are in minutes, so the plotted column must be in minutes too.
# Plotting 'durationSecs' against these edges only binned videos shorter than
# 170 seconds and silently ignored the rest.
duration_df = df.loc[df['durationSecs'] < MAX_DURATION_SECS].assign(
    durationMins=lambda frame: frame['durationSecs'] / 60
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True, sharey=True)
for ax, (group_flag, group_label, group_colour) in zip(axes, GROUP_PANELS):
    sns.histplot(data=duration_df[duration_df['pop_unpop'] == group_flag], x='durationMins', bins=bar_values, kde=False, ax=ax, color=group_colour)
    ax.set_title(f'Bar Plot of Duration for {group_label} Channels')
    ax.set_xlabel('Duration (minutes)')
    ax.set_ylabel('Count')

# 4) Bubble plot: viewCount vs tagsCount, plus the tag-count distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Half-open tag-count ranges [low, high); rows outside every range are not drawn.
tag_ranges = [(0, 10), (10, 20), (20, 30)]
tag_edges = [tag_ranges[0][0]] + [upper for _, upper in tag_ranges]

# Bubble size = number of videos sharing the row's (tag range, group) pair.
# The size is stored per row so it lines up with the plotted points; the
# earlier 2-row `size` Series did not align with the data, and its `color`
# argument was overridden by `hue`.
bubble_df = df.assign(
    tagBin=pd.cut(df['tagsCount'], bins=tag_edges, right=False, labels=False)
).dropna(subset=['tagBin'])
bubble_df = bubble_df.assign(
    groupCount=bubble_df.groupby(['tagBin', 'pop_unpop'])['tagsCount'].transform('size')
)
sns.scatterplot(data=bubble_df, x='viewCount', y='tagsCount', hue='pop_unpop', palette=GROUP_PALETTE, size='groupCount', sizes=(20, 200), ax=axes[0])

axes[0].set_title('Bubble Plot: viewCount vs tagsCount')
axes[0].set_xlabel('viewCount')
axes[0].set_ylabel('tagsCount')

# Distribution of tag count, stacked by group, with one bin per tag count.
# int(...) keeps range() valid even if the column is stored as float.
max_tags = int(df['tagsCount'].max())
sns.histplot(data=df, x='tagsCount', hue='pop_unpop', multiple='stack', bins=range(0, max_tags + 1), ax=axes[1], palette=GROUP_PALETTE)
axes[1].set_title('Distribution of Tag Count for Channels')
axes[1].set_xlabel('Tags Count')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

### Splitting the channels into 2 chunks (above and below the mean views), each containing popular and unpopular channels

In [None]:
# chunklow_ch_names, chunkhigh_ch_names = utils.split_and_merge_by_views(comb_autos_info)
# chunklow_ch_names.channelName.tolist(), chunkhigh_ch_names.channelName.tolist()

In [None]:
# Split each group's channels around that group's mean views using
# utils.pop_unpop_chunks, then build the combined above-mean and below-mean chunks.
(
    summed_views_df,
    mean_views_popular,
    mean_views_unpopular,
    popular_below_mean,
    popular_above_mean,
    unpopular_below_mean,
    unpopular_above_mean,
) = utils.pop_unpop_chunks(cleaned_dataframe)

# assumes the four *_mean results are lists of channel names, so `+` concatenates - TODO confirm
above_mean_chunk = popular_above_mean + unpopular_above_mean
below_mean_chunk = popular_below_mean + unpopular_below_mean


print("Mean views of popular channels: ", mean_views_popular)
print("Mean views of unpopular channels: ", mean_views_unpopular)

print(f"Popular channel names with views above mean: {(popular_above_mean)}")
print(f"Popular channel names with views below mean: {(popular_below_mean)}")
print(f"UnPopular channel names with views above mean: {(unpopular_above_mean)}")
# This line prints the below-mean list; it was previously labelled "above mean".
print(f"UnPopular channel names with views below mean: {(unpopular_below_mean)}")

In [None]:
# # Run the first App to see the growth of views over the years for above mean chunk
# app = plots.dynamic_view_plots(cleaned_dataframe)
# app.run_server(debug=True, use_reloader=False, mode="inline", port=23000)

In [None]:
# percentiles = ['.25', '.5', '.75', '.9', '.95', '.99']
# app_2 = plots.percentiles_plot(cleaned_dataframe, percentiles=percentiles)
# app_2.run_server(debug=True, use_reloader=False, mode="inline", port=23001)

In [None]:
# Show the combined frame (result of utils.combine_pop_unpop_df) as this cell's output.
comb_vids

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import pandas as pd
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu

# Q-Q plots of viewCount, one panel per channel group.
df = cleaned_dataframe
# Optional: restrict to recent years before plotting
# recent_years = df['publishingYear'].isin([2023, 2022,2021])
# df = df[recent_years]

# Per-video view counts of each group (pop_unpop == 1 / == 0)
popular_views = df.loc[df['pop_unpop'] == 1, 'viewCount']
unpopular_views = df.loc[df['pop_unpop'] == 0, 'viewCount']

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# (sample, marker colour, panel title), in left-to-right panel order
qq_panels = (
    (popular_views, 'blue', 'Q-Q Plot - Popular Channels'),
    (unpopular_views, 'orange', 'Q-Q Plot - Unpopular Channels'),
)
for panel_ax, (views, marker_colour, panel_title) in zip(axs, qq_panels):
    sm.qqplot(views, line='s', color=marker_colour, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Histograms of viewCount, one figure per channel group.
df = cleaned_dataframe

# Rows of each group (pop_unpop == 1 / == 0)
popular_df = df[df['pop_unpop'] == 1]
unpopular_df = df[df['pop_unpop'] == 0]

# (group frame, bar colour, legend label, figure title), in display order
hist_specs = (
    (popular_df, 'blue', 'Popular Channels', 'Distribution of viewCount for Popular Channels'),
    (unpopular_df, 'red', 'Unpopular Channels', 'Distribution of viewCount for Unpopular Channels'),
)
for group_df, hist_colour, group_label, hist_title in hist_specs:
    plt.figure(figsize=(10, 6))
    plt.hist(group_df['viewCount'], bins=50, color=hist_colour, alpha=0.7, label=group_label)
    plt.title(hist_title)
    plt.xlabel('viewCount')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()


In [None]:
# Compare per-video viewCount of the two channel groups, restricted to videos
# published in 2021-2023, with a two-sided Mann-Whitney U test.
df = cleaned_dataframe
recent_years = df['publishingYear'].isin([2023, 2022, 2021])  # boolean row mask
filtered_df = df[recent_years]

# View counts of each group within the selected years
popular = filtered_df.loc[filtered_df['pop_unpop'] == 1, 'viewCount']
unpopular = filtered_df.loc[filtered_df['pop_unpop'] == 0, 'viewCount']

# mannwhitneyu returns the U statistic, not a t statistic. `alternative` is
# spelled out so the test is two-sided whatever the installed SciPy default is.
u_statistic, p_value = mannwhitneyu(popular, unpopular, alternative='two-sided')
t_statistic = u_statistic  # former (misleading) name, kept in case other cells read it

# Print the results
print("U-statistic:", u_statistic)
print("P-value:", p_value)

# Interpret the results
alpha = 0.05  # significance level
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference between the view counts of popular and unpopular channels in the recent three years.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the view counts of popular and unpopular channels in the recent three years.")


In [None]:
# Compare the per-channel mean viewCount of the two channel groups with a
# two-sided Mann-Whitney U test (all years; no date filter is applied here).
df = cleaned_dataframe

popular_df = df[df['pop_unpop'] == 1]
unpopular_df = df[df['pop_unpop'] == 0]

# One row per channelTitle holding that channel's mean viewCount, tagged with its group flag
popular_mean_view = popular_df.groupby('channelTitle')['viewCount'].mean().reset_index()
popular_mean_view['pop_unpop'] = 1
unpopular_mean_view = unpopular_df.groupby('channelTitle')['viewCount'].mean().reset_index()
unpopular_mean_view['pop_unpop'] = 0

# Samples for the test: the per-channel means of each group
popular = popular_mean_view['viewCount']
unpopular = unpopular_mean_view['viewCount']

# mannwhitneyu returns the U statistic, not a t statistic. `alternative` is
# spelled out so the test is two-sided whatever the installed SciPy default is.
u_statistic, p_value = mannwhitneyu(popular, unpopular, alternative='two-sided')
t_statistic = u_statistic  # former (misleading) name, kept in case other cells read it

# Print the results
print("U-statistic:", u_statistic)
print("P-value:", p_value)

# Interpret the results
alpha = 0.01  # significance level (stricter than the per-video test's 0.05)
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference between the mean view counts of popular and unpopular channels.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the mean view counts of popular and unpopular channels.")

In [None]:
# Render the per-channel mean viewCount of the popular group as a matplotlib table.
sns.set_style('darkgrid')

# Format viewCount in scientific notation (e-notation) on a display copy.
# Overwriting popular_mean_view in place turned the column into strings, so
# re-running this cell raised ValueError ('.2e' cannot format a str).
popular_table_df = popular_mean_view.assign(
    viewCount=popular_mean_view['viewCount'].map(lambda views: format(views, '.2e'))
)

# Create a figure and axis
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the table on the axis created above
table = ax.table(cellText=popular_table_df.values,
                 colLabels=popular_table_df.columns,
                 loc='center')

# Style the table: fixed font size, rows stretched to twice the default height
table.auto_set_font_size(False)
table.set_fontsize(15)
table.scale(1, 2)

# Hide the axis so only the table is visible
ax.axis('off')

# Display the table
plt.show()

In [None]:
# Render the per-channel mean viewCount of the unpopular group as a matplotlib table.

# Format viewCount in scientific notation (e-notation) on a display copy.
# Overwriting unpopular_mean_view in place turned the column into strings, so
# re-running this cell raised ValueError ('.2e' cannot format a str).
unpopular_table_df = unpopular_mean_view.assign(
    viewCount=unpopular_mean_view['viewCount'].map(lambda views: format(views, '.2e'))
)

# Create a figure and axis
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the table on the axis created above
table = ax.table(cellText=unpopular_table_df.values,
                 colLabels=unpopular_table_df.columns,
                 loc='center')

# Style the table: fixed font size, rows stretched to twice the default height
table.auto_set_font_size(False)
table.set_fontsize(15)
table.scale(1, 2)

# Hide the axis so only the table is visible
ax.axis('off')

# Display the table
plt.show()