In [None]:
from scipy.stats import shapiro

# Columns to test for normality, plus the grouping levels present in the data.
# Replace the column name(s) with your own as necessary.
post_types = ['Unweighted_Engagement_Score']
days_of_week = sorted(merged_df['Day of Week'].unique())
parts_of_day = sorted(merged_df['Part of Day Numeric'].unique())

# Coerce every analysed column to numeric; values that cannot be parsed become
# NaN and are dropped from each group before testing.
for post_type in post_types:
    merged_df[post_type] = pd.to_numeric(merged_df[post_type], errors='coerce')

# scipy.stats.shapiro needs at least three observations, so a group of exactly
# three is testable (the previous `> 3` guard rejected it).
MIN_SHAPIRO_SAMPLES = 3

# Run Shapiro-Wilk for every (day of week, part of day) combination.
for post_type in post_types:
    print(f"\nAnalyzing {post_type}:")
    for day in days_of_week:
        # The day mask does not depend on `part`, so build it once per day.
        day_mask = merged_df['Day of Week'] == day
        for part in parts_of_day:
            part_mask = merged_df['Part of Day Numeric'] == part
            filtered_scores = merged_df.loc[day_mask & part_mask, post_type].dropna()

            if len(filtered_scores) >= MIN_SHAPIRO_SAMPLES:
                stat, p = shapiro(filtered_scores)
                print(f"Day {day}, Part of Day {part}: p-value={p:.4f}")
            else:
                print(f"Day {day}, Part of Day {part}: Not enough data for Shapiro-Wilk Test.")


In [None]:


# Parse 'Date' into datetimes so the .dt accessor can be used on it
date_values = pd.to_datetime(merged_df['Date'])
merged_df['Date'] = date_values

# Derive the integer 'Day of Week' column from the parsed dates
merged_df['Day of Week'] = date_values.dt.dayofweek

# Function to categorize part of the day; adjust according to your dataset's specific hours if needed
def get_part_of_day_numeric(hour):
    """Map an hour of the day to a numeric part-of-day code.

    Codes: 0 = Morning (5 <= hour < 12), 1 = Afternoon (12 <= hour < 17),
    2 = Evening (17 <= hour < 22), 3 = Night (everything else).

    Any value that matches none of the three windows falls through to 3; this
    includes NaN, which fails every comparison.
    """
    # (inclusive start hour, exclusive end hour, code)
    hour_windows = (
        (5, 12, 0),   # Morning
        (12, 17, 1),  # Afternoon
        (17, 22, 2),  # Evening
    )
    for window_start, window_end, code in hour_windows:
        if window_start <= hour < window_end:
            return code
    return 3  # Night

# Bucket each timestamp's hour into the numeric 'Part of Day Numeric' code
merged_df['Part of Day Numeric'] = merged_df['Date'].dt.hour.apply(get_part_of_day_numeric)

# Shapiro-Wilk normality test for every (day of week, part of day) group
from scipy.stats import shapiro

post_types = ['Unweighted_Engagement_Score']  # Column(s) to analyze
days_of_week = sorted(merged_df['Day of Week'].unique())
parts_of_day = sorted(merged_df['Part of Day Numeric'].unique())

# Coerce the analysed column(s) to numeric; unparseable values become NaN and
# are dropped from each group before testing.
for post_type in post_types:
    merged_df[post_type] = pd.to_numeric(merged_df[post_type], errors='coerce')

# scipy.stats.shapiro needs at least three observations, so a group of exactly
# three is testable (the previous `> 3` guard rejected it).
MIN_SHAPIRO_SAMPLES = 3

for post_type in post_types:
    print(f"\nAnalyzing {post_type}:")
    for day in days_of_week:
        # The day mask does not depend on `part`, so build it once per day.
        day_mask = merged_df['Day of Week'] == day
        for part in parts_of_day:
            part_mask = merged_df['Part of Day Numeric'] == part
            filtered_scores = merged_df.loc[day_mask & part_mask, post_type].dropna()
            if len(filtered_scores) >= MIN_SHAPIRO_SAMPLES:
                stat, p = shapiro(filtered_scores)
                print(f"Day {day}, Part of Day {part}: p-value={p:.4f}")
            else:
                print(f"Day {day}, Part of Day {part}: Not enough data for Shapiro-Wilk Test.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# `np.mean` is used as the bar-plot estimator below, but this cell only imported
# plt/sns/pd; import numpy here so the cell also runs on a fresh kernel.
import numpy as np

# Assumes 'merged_df' is already loaded and already contains the
# 'Day of Week' and 'Part of Day Numeric' columns.

# Ensure 'Date' is a datetime type
merged_df['Date'] = pd.to_datetime(merged_df['Date'])

# Plot 1: Engagement Over Time
fig, ax = plt.subplots(figsize=(12, 6))
# Sort by date so the line is drawn chronologically even if rows are unordered.
merged_df.set_index('Date')['Unweighted_Engagement_Score'].sort_index().plot(ax=ax)
ax.set_title('Engagement Scores Over Time')
ax.set_ylabel('Unweighted Engagement Score')
ax.set_xlabel('Date')
fig.tight_layout()
plt.show()

# Plot 2: Average Engagement Scores by Part of Day
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='Part of Day Numeric', y='Unweighted_Engagement_Score', data=merged_df,
            estimator=np.mean, ax=ax)
ax.set_title('Average Engagement Scores by Part of Day')
ax.set_ylabel('Average Unweighted Engagement Score')
ax.set_xlabel('Part of Day')
fig.tight_layout()
plt.show()

# Plot 3: Average Engagement Scores by Day of the Week (0 = Monday, ..., 6 = Sunday)
weekday_codes = list(range(7))
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
fig, ax = plt.subplots(figsize=(8, 6))
# Bars sit at positions 0..k-1 in category order. Pinning `order` to all seven
# weekday codes keeps bar position == weekday code, so the Mon..Sun tick labels
# stay correct even when some weekday has no rows in the data.
sns.barplot(x='Day of Week', y='Unweighted_Engagement_Score', data=merged_df,
            estimator=np.mean, order=weekday_codes, ax=ax)
ax.set_title('Average Engagement Scores by Day of the Week')
ax.set_ylabel('Average Unweighted Engagement Score')
ax.set_xlabel('Day of Week')
ax.set_xticks(weekday_codes)
ax.set_xticklabels(weekday_labels)
fig.tight_layout()
plt.show()


In [None]:
from scipy.stats import kruskal

# Kruskal-Wallis H test: does engagement differ across the four parts of the day
# (codes 0 = Morning, 1 = Afternoon, 2 = Evening, 3 = Night)?
scores_morning = merged_df[merged_df['Part of Day Numeric'] == 0]['Unweighted_Engagement_Score'].dropna()
scores_afternoon = merged_df[merged_df['Part of Day Numeric'] == 1]['Unweighted_Engagement_Score'].dropna()
scores_evening = merged_df[merged_df['Part of Day Numeric'] == 2]['Unweighted_Engagement_Score'].dropna()
scores_nights = merged_df[merged_df['Part of Day Numeric'] == 3]['Unweighted_Engagement_Score'].dropna()

# The night scores were previously computed but never passed to the test, so a
# whole category was silently left out. Groups with no observations are skipped
# because an empty sample contributes nothing and would leave the result undefined.
part_of_day_groups = [
    group
    for group in (scores_morning, scores_afternoon, scores_evening, scores_nights)
    if len(group) > 0
]

if len(part_of_day_groups) >= 2:
    stat, p = kruskal(*part_of_day_groups)
else:
    # Fewer than two populated groups: there is nothing to compare.
    stat, p = float('nan'), float('nan')
print(f"Kruskal-Wallis H Test: Statistic={stat}, p-value={p}")


In [None]:
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let's assume your 'merged_df' DataFrame is already loaded and has the columns mentioned.
# You may need to replace this line with the code that actually loads your data:
# merged_df = pd.read_csv('path_to_your_data.csv') or other DataFrame loading method

# Function to create and display Q-Q plots
def qq_plot(data, title: str) -> None:
    """Draw and display a normal Q-Q plot of ``data``.

    Parameters
    ----------
    data
        Sample values handed straight to ``scipy.stats.probplot``. Callers in
        this notebook pass a pandas Series with NaNs already dropped.
    title : str
        Text used as the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Plot the sample quantiles against a normal distribution via pyplot.
    stats.probplot(data, dist="norm", plot=plt)
    # Title and axis labels are set after probplot has drawn, so these are the
    # ones shown on the displayed figure.
    plt.title(title)
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel('Sample Quantiles')
    plt.show()

# Q-Q plots of 'Unweighted_Engagement_Score', first for every part of the day
# and then for every day of the week. Each entry is (grouping column, title label).
qq_groupings = (
    ('Part of Day Numeric', 'Part of Day'),
    ('Day of Week', 'Day of Week'),
)
for group_column, group_label in qq_groupings:
    for level in merged_df[group_column].unique():
        in_level = merged_df[group_column] == level
        level_scores = merged_df.loc[in_level, 'Unweighted_Engagement_Score'].dropna()
        qq_plot(level_scores, f'Q-Q Plot for {group_label} {level}')


In [None]:
from scipy.stats import kruskal

# Kruskal-Wallis H test comparing engagement scores across the seven
# 'Day of Week' codes (0 through 6).
weekday_scores = [
    merged_df.loc[merged_df['Day of Week'] == weekday, 'Unweighted_Engagement_Score'].dropna()
    for weekday in range(7)
]

# Keep the per-day names available to the rest of the notebook.
(scores_monday, scores_tuesday, scores_wensday, scores_tursday,
 scores_friday, scores_saturday, scores_sunday) = weekday_scores

stat, p = kruskal(*weekday_scores)
print(f"Kruskal-Wallis H Test: Statistic={stat}, p-value={p}")

In [None]:
from scipy.stats import kruskal

# Kruskal-Wallis H test: does engagement differ across the four parts of the day
# (codes 0 = Morning, 1 = Afternoon, 2 = Evening, 3 = Night)?
scores_morning = merged_df[merged_df['Part of Day Numeric'] == 0]['Unweighted_Engagement_Score'].dropna()
scores_afternoon = merged_df[merged_df['Part of Day Numeric'] == 1]['Unweighted_Engagement_Score'].dropna()
scores_evening = merged_df[merged_df['Part of Day Numeric'] == 2]['Unweighted_Engagement_Score'].dropna()
scores_nights = merged_df[merged_df['Part of Day Numeric'] == 3]['Unweighted_Engagement_Score'].dropna()

# The night scores were previously computed but never passed to the test, so a
# whole category was silently left out. Groups with no observations are skipped
# because an empty sample contributes nothing and would leave the result undefined.
part_of_day_groups = [
    group
    for group in (scores_morning, scores_afternoon, scores_evening, scores_nights)
    if len(group) > 0
]

if len(part_of_day_groups) >= 2:
    stat, p = kruskal(*part_of_day_groups)
else:
    # Fewer than two populated groups: there is nothing to compare.
    stat, p = float('nan'), float('nan')
print(f"Kruskal-Wallis H Test: Statistic={stat}, p-value={p}")
