In [None]:
# Import relevant libraries

import pandas as pd
import matplotlib.pyplot as plt

# Location of the CSV export to analyse.
# Replace with the correct path, e.g. '.../SgLDC Datawork/FB_Analytics(2022).csv'
DATA_PATH = 'FB_Analytics(2022).csv'

# Read the CSV file into a Pandas dataframe
# (pd.read_csv requires a path or buffer; calling it without one raises TypeError)
df = pd.read_csv(DATA_PATH)

### Finding the Top Contributors ###

In [None]:
# Total number of posts per contributor, keeping only contributors whose total is positive
grouped_df = (
    df.groupby('Top Contributors')['Posts']
    .sum()
    .loc[lambda totals: totals > 0]
)
# Rank the contributors from the highest post total to the lowest
sorted_df = grouped_df.sort_values(ascending=False)

In [None]:
# Per-contributor post totals as a plain array for matplotlib
data = sorted_df.values

# Draw a single boxplot of the post totals; the lone x tick carries no
# information, so its label is blanked out
fig, ax = plt.subplots()
ax.boxplot(data)
ax.set(
    xlabel='Top Contributors',
    ylabel='Number of Posts',
    xticklabels=[],
)

# Render the figure
plt.show()

In [None]:
# Same boxplot, restricted to contributors with a post total below 4
# (larger totals are treated as outliers and left out)

data_filtered = sorted_df[sorted_df.lt(4)].values

# Draw the boxplot of the remaining totals; the lone x tick label is blanked out
fig, ax = plt.subplots()
ax.boxplot(data_filtered)
ax.set(
    xlabel='Top Contributors',
    ylabel='Number of Posts',
    xticklabels=[],
)

# Render the figure
plt.show()

### Computing the Churn Rate ###

In [None]:
def compute_index(lst1, lst2):
    """Return the fraction of ``lst1`` whose items also appear in ``lst2``.

    Both inputs are reduced to sets before intersecting, so duplicates count
    once in the numerator while the denominator is the raw length of ``lst1``.

    Note that iterating a pandas Series yields its *values*, not its index
    labels; pass the labels explicitly (e.g. ``series.index.tolist()``) when
    the labels are what should be compared.

    Args:
        lst1: Sized iterable of hashable items (e.g. names) for the period
            being scored.
        lst2: Sized iterable of hashable items for the comparison period;
            must have the same length as ``lst1``.

    Returns:
        float: ``len(set(lst1) & set(lst2)) / len(lst1)``, or ``0.0`` when
        both inputs are empty.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(lst1) == len(lst2), 'compute_index: inputs must have the same length'
    if len(lst1) == 0:
        # Two empty inputs pass the length check; avoid ZeroDivisionError.
        return 0.0
    shared = set(lst1) & set(lst2)
    return len(shared) / len(lst1)

In [None]:
# Define a mapping from month abbreviations to month numbers
month_mapping = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# Number of top contributors compared between consecutive months
TOP_N = 5

# Split each 'Date' string at the first '/' into a month part and a year part
date_parts = df['Date'].str.split('/', n=1, expand=True)

# Convert the month abbreviation to a month number
# (an abbreviation missing from month_mapping raises KeyError)
df['month'] = date_parts[0].apply(lambda abbr: month_mapping[abbr])

# Convert the year to a numeric column
df['year'] = pd.to_numeric(date_parts[1])

# Total posts per (month, contributor).
# NOTE(review): the year is not part of the grouping, so the same month from
# different years would be merged -- confirm the data covers a single year.
grouped_df = df.groupby(['month', 'Top Contributors'])['Posts'].sum()

# Names (not post totals) of the TOP_N contributors for every month in the data
top_names_by_month = {
    month: group.nlargest(TOP_N).index.get_level_values('Top Contributors').tolist()
    for month, group in grouped_df.groupby(level='month')
}

# Compare each month's top contributors with those of the preceding month
for month, top_contributors in top_names_by_month.items():
    prev_top_contributors = top_names_by_month.get(month - 1)
    if prev_top_contributors is None:
        # First month, or no data for the preceding month: nothing to compare against
        continue

    # compute_index asserts that both lists have the same length, so a month
    # with fewer than TOP_N contributors will raise here
    index = compute_index(top_contributors, prev_top_contributors)
    print(f'Index for month {month}: {index}')