In [7]:
import pandas as pd
import os
import random

# Locations of the CSV exports this cell reads
summoner_details_csv = '../data/mass_fetch_8K_accounts/summoner_details.csv'
match_details_csv = '../data/mass_fetch_8K_accounts/match_details.csv'
entries_csv = '../data/mass_fetch_8K_accounts/entries.csv'
match_ids_csv = '../data/mass_fetch_8K_accounts/match_ids.csv'

# Fail fast and name every missing input. Raising (rather than calling the
# interactive-only exit() helper) stops the run with a clear error and does
# not depend on the `site` module having injected exit() into builtins.
missing_files = [
    path
    for path in (summoner_details_csv, match_details_csv, entries_csv, match_ids_csv)
    if not os.path.exists(path)
]
if missing_files:
    raise FileNotFoundError(f"{', '.join(missing_files)} not found.")

# Load the data into pandas DataFrames
summoner_details_df = pd.read_csv(summoner_details_csv)
match_details_df = pd.read_csv(match_details_csv)
entries_df = pd.read_csv(entries_csv)
match_ids_df = pd.read_csv(match_ids_csv)

# Interpret 'game_creation' as milliseconds since the Unix epoch. The result
# is a timezone-naive timestamp expressed in UTC.
match_details_df['game_creation_converted'] = pd.to_datetime(match_details_df['game_creation'], unit='ms')

# Age of each match in whole days. "Now" must be naive UTC as well: a naive
# local-time timestamp would skew every difference by the machine's UTC offset
# and can push matches across a day boundary.
today = pd.Timestamp.now(tz='UTC').tz_localize(None)
match_details_df['days_diff'] = (today - match_details_df['game_creation_converted']).dt.days
# Define the groups based on days_diff
def assign_time_group(days):
    """Return the activity-bucket label for a match that is *days* days old.

    Buckets are defined by inclusive upper bounds checked in ascending order,
    so every value lands in exactly one bucket. Fractional values between two
    integer bounds (e.g. 7.5) fall into the next bucket instead of slipping
    through to the final one. Anything above 60 is reported as lost.

    NaN compares false against every bound and therefore also yields
    'Beyond 60 days - Lost'.
    """
    upper_bounds = (
        (7, 'Active Players (1-7 days)'),
        (14, 'Slightly Inactive (8-14 days)'),
        (28, 'Moderately Inactive (15-28 days)'),
        (40, 'Highly Inactive (29-40 days)'),
        (60, 'At Risk of Churn (41-60 days)'),
    )
    for upper, label in upper_bounds:
        if days <= upper:
            return label
    return 'Beyond 60 days - Lost'

# Label every match row with the activity bucket for its age in days
match_details_df['time_group'] = match_details_df['days_diff'].apply(assign_time_group)

# Keep one row per summoner_id: the one with the largest game_creation value
newest_first = match_details_df.sort_values('game_creation', ascending=False)
cleaned_match_details_df = newest_first.drop_duplicates(subset='summoner_id', keep='first')

# Keep only the rows whose summoner_id also appears in summoner_details
target_summoners = summoner_details_df['summoner_id'].unique()
is_target_summoner = cleaned_match_details_df['summoner_id'].isin(target_summoners)
filtered_match_details = cleaned_match_details_df[is_target_summoner].copy()

# Attach the entries columns (including rank); summoners without an entry are kept
combined_data = filtered_match_details.merge(entries_df, on='summoner_id', how='left')

# Distinct match_ids per puuid, then the puuids that reach at least 30 of them
match_count_per_puuid = (
    match_ids_df
    .groupby('puuid')['match_id']
    .nunique()
    .reset_index(name='match_count')
)
has_30_plus_matches = match_count_per_puuid['match_count'] >= 30
puuids_with_30_plus_matches = match_count_per_puuid[has_30_plus_matches]

# Summoner details restricted to those puuids
filtered_summoner_details = summoner_details_df.merge(puuids_with_30_plus_matches, on='puuid', how='inner')

# Restrict the combined data to the summoners that passed the match-count filter
qualifying_summoner_ids = filtered_summoner_details['summoner_id']
combined_data_filtered = combined_data[combined_data['summoner_id'].isin(qualifying_summoner_ids)]

# Select at most `accounts_per_group` accounts for every (rank, time_group) pair;
# groups with fewer distinct accounts contribute all of them.
accounts_per_group = 100
selected_summoners = []

for rank in combined_data_filtered['rank'].unique():
    for group in combined_data_filtered['time_group'].unique():
        summoners_in_group = combined_data_filtered[(combined_data_filtered['rank'] == rank) & (combined_data_filtered['time_group'] == group)]
        # Decide on the number of distinct IDs, not the number of rows: the
        # sample is drawn from the distinct IDs, and random.sample raises
        # ValueError when asked for more items than the population holds.
        # NOTE(review): rows can repeat a summoner_id if entries_df has more
        # than one row per summoner -- confirm against the entries data.
        unique_ids = list(summoners_in_group['summoner_id'].unique())
        if len(unique_ids) > accounts_per_group:
            selected_summoners.extend(random.sample(unique_ids, accounts_per_group))
        else:
            selected_summoners.extend(unique_ids)

# Ensure the list of selected summoners is unique
selected_summoners = list(set(selected_summoners))

# Check group sizes and duplicates

# Number of rows per (time_group, rank) pair in the filtered data
grouped_counts = combined_data_filtered.groupby(['time_group', 'rank']).size().reset_index(name='count')
print(f"Group sizes before filtering for {accounts_per_group} accounts per group:")
print(grouped_counts)

# Groups that cannot supply a full quota. The threshold comes from
# accounts_per_group so this report stays in step with the sampling step.
print(f"\nGroups with fewer than {accounts_per_group} accounts:")
print(grouped_counts[grouped_counts['count'] < accounts_per_group])

# Check for duplicate summoner_ids in the final selected accounts.
# NOTE: selected_summoners has already been passed through set(), so this
# check is expected to report 0; it is kept as a sanity check.
selected_summoners_df = pd.DataFrame(selected_summoners, columns=['summoner_id'])
duplicate_summoners = selected_summoners_df[selected_summoners_df.duplicated()]

print(f"\nDuplicate summoner_ids: {len(duplicate_summoners)}")
if not duplicate_summoners.empty:
    print("List of duplicate summoner_ids:")
    print(duplicate_summoners)

# Display the total number of selected summoners
print(f"\nTotal selected summoners after filtering: {len(selected_summoners)}")


Group sizes before filtering for 100 accounts per group:
                          time_group rank  count
0          Active Players (1-7 days)    I   1035
1          Active Players (1-7 days)   II   1175
2          Active Players (1-7 days)  III   1191
3          Active Players (1-7 days)   IV   1172
4      At Risk of Churn (41-60 days)    I    124
5      At Risk of Churn (41-60 days)   II    127
6      At Risk of Churn (41-60 days)  III    147
7      At Risk of Churn (41-60 days)   IV    130
8              Beyond 60 days - Lost    I    218
9              Beyond 60 days - Lost   II    149
10             Beyond 60 days - Lost  III    127
11             Beyond 60 days - Lost   IV    146
12      Highly Inactive (29-40 days)    I    105
13      Highly Inactive (29-40 days)   II     90
14      Highly Inactive (29-40 days)  III     95
15      Highly Inactive (29-40 days)   IV     91
16  Moderately Inactive (15-28 days)    I    233
17  Moderately Inactive (15-28 days)   II    191
18  Moderate