In [1]:
import pandas as pd
import os

# Locations of the processed CSV inputs, relative to the working directory.
processed_match_details_csv = '../data/processed/match_details_cleaned.csv'
processed_entries_csv = '../data/processed/entries.csv'

# Fail fast when an input is missing. The bare exit() helper is only injected
# by the `site` module / IPython and, in plain Python, terminates with status 0;
# raising SystemExit(1) is always available and reports the failure.
for file in [processed_match_details_csv, processed_entries_csv]:
    if not os.path.exists(file):
        print(f"{file} not found.")
        raise SystemExit(1)

# Load the data into pandas DataFrames
processed_match_details_df = pd.read_csv(processed_match_details_csv)
processed_entries_df = pd.read_csv(processed_entries_csv)

# 'game_creation' is read as epoch milliseconds; the result is a tz-naive
# datetime column expressed in UTC.
processed_match_details_df['game_creation_converted'] = pd.to_datetime(
    processed_match_details_df['game_creation'], unit='ms'
)

# Whole days elapsed since each match was created. "Now" is taken in UTC and
# then stripped of its tz marker so it is comparable with the naive-UTC column;
# a plain Timestamp.now() is local wall-clock time and would skew the
# difference by the machine's UTC offset.
today = pd.Timestamp.now(tz='UTC').tz_localize(None)
processed_match_details_df['days_diff'] = (
    today - processed_match_details_df['game_creation_converted']
).dt.days
# Inactivity buckets keyed by their inclusive upper bound in days, ascending.
# Anything above the last bound is treated as lost.
_TIME_GROUP_UPPER_BOUNDS = (
    (7, 'Active Players (1-7 days)'),
    (14, 'Slightly Inactive (8-14 days)'),
    (28, 'Moderately Inactive (15-28 days)'),
    (40, 'Highly Inactive (29-40 days)'),
    (60, 'At Risk of Churn (41-60 days)'),
)


def assign_time_group(days):
    """Return the inactivity bucket label for a number of elapsed days.

    The buckets are contiguous: a value belongs to the first bucket whose
    upper bound it does not exceed. Non-integer values that fall between two
    integer ranges (e.g. 7.5) therefore land in the next bucket instead of
    dropping through to the "Lost" label.

    Args:
        days: Days elapsed since the last recorded match (int or float).

    Returns:
        One of the six bucket labels. Values of 7 or less (including zero and
        negatives) map to 'Active Players (1-7 days)'; values above 60 map to
        'Beyond 60 days - Lost'. NaN compares false against every bound and
        so also yields 'Beyond 60 days - Lost'.
    """
    for upper_bound, label in _TIME_GROUP_UPPER_BOUNDS:
        if days <= upper_bound:
            return label
    return 'Beyond 60 days - Lost'

# Label every match row with the inactivity bucket for its days_diff value.
processed_match_details_df['time_group'] = processed_match_details_df['days_diff'].map(assign_time_group)

# Left-join the entries table on summoner_id so each match row carries the
# columns of its matching entry (rank among them); match rows without an
# entry are kept with missing values.
combined_processed_data = processed_match_details_df.merge(
    processed_entries_df,
    how='left',
    on='summoner_id',
)

# Row count for every (rank, time_group) pair. This counts merged rows, not
# distinct summoners, and groupby's default dropna=True omits rows whose rank
# is missing.
rank_bucket_sizes = combined_processed_data.groupby(['rank', 'time_group']).size()
grouped_processed_results = rank_bucket_sizes.reset_index(name='count')

# Show the table so under-filled groups can be spotted.
print(grouped_processed_results)


   rank                        time_group  count
0     I         Active Players (1-7 days)    100
1     I     At Risk of Churn (41-60 days)    100
2     I             Beyond 60 days - Lost    100
3     I      Highly Inactive (29-40 days)    100
4     I  Moderately Inactive (15-28 days)    100
5     I     Slightly Inactive (8-14 days)    100
6    II         Active Players (1-7 days)    100
7    II     At Risk of Churn (41-60 days)    100
8    II             Beyond 60 days - Lost    100
9    II      Highly Inactive (29-40 days)     97
10   II  Moderately Inactive (15-28 days)    100
11   II     Slightly Inactive (8-14 days)    100
12  III         Active Players (1-7 days)    100
13  III     At Risk of Churn (41-60 days)    100
14  III             Beyond 60 days - Lost    100
15  III      Highly Inactive (29-40 days)    100
16  III  Moderately Inactive (15-28 days)    100
17  III     Slightly Inactive (8-14 days)    100
18   IV         Active Players (1-7 days)    100
19   IV     At Risk 