In [1]:
import pandas as pd
import os
import random

# Locations of the raw CSV exports, relative to the current working directory
summoner_details_csv = '../data/raw/summoner_details.csv'
match_details_csv = '../data/raw/match_details.csv'
entries_csv = '../data/raw/entries.csv'
match_ids_csv = '../data/raw/match_ids.csv'

# Verify every input up front and report all missing files at once.
# An exception is raised instead of calling exit(): exit() is an interactive
# helper that shuts down a notebook kernel and returns status 0 in a script,
# which hides the failure from whatever launched this code.
missing_files = [
    path
    for path in (summoner_details_csv, match_details_csv, entries_csv, match_ids_csv)
    if not os.path.exists(path)
]
for path in missing_files:
    print(f"{path} not found.")
if missing_files:
    raise FileNotFoundError(f"Missing input file(s): {', '.join(missing_files)}")

# Load the data into pandas DataFrames
summoner_details_df = pd.read_csv(summoner_details_csv)
match_details_df = pd.read_csv(match_details_csv)
entries_df = pd.read_csv(entries_csv)
match_ids_df = pd.read_csv(match_ids_csv)

# Keep only the match rows whose summoner_id appears in summoner_details.
# .copy() makes the result independent so new columns can be added safely.
target_summoners = summoner_details_df['summoner_id'].unique()
filtered_match_details = match_details_df[match_details_df['summoner_id'].isin(target_summoners)].copy()

# Convert game_creation (epoch milliseconds) to naive datetimes; epoch-based
# conversion produces UTC wall-clock values.
filtered_match_details['game_creation_converted'] = pd.to_datetime(filtered_match_details['game_creation'], unit='ms')

# Age of each match in whole days. "Now" is taken in UTC and then made naive
# so it is on the same clock as game_creation_converted; a plain
# Timestamp.now() is local time and would skew ages by the UTC offset.
today = pd.Timestamp.now(tz='UTC').tz_localize(None)
filtered_match_details['days_diff'] = (today - filtered_match_details['game_creation_converted']).dt.days

# Define the groups based on days_diff
def assign_time_group(days):
    """Return the inactivity bucket label for an age expressed in days.

    Buckets are contiguous and defined only by their inclusive upper bound,
    so every value lands in exactly one bucket -- including fractional values
    such as 7.5, which belong to the next bucket up rather than falling
    through to the final one.

    Args:
        days: Number of days (int or float). Values of 7 or less, including
            zero and negatives, are labelled as active.

    Returns:
        The label string of the matching bucket. Values above 60, and NaN
        (for which every comparison is false), yield
        'Beyond 60 days - Lost'.
    """
    # (inclusive upper bound, label) pairs in ascending order; the first
    # bound that is >= days wins.
    buckets = (
        (7, 'Active Players (1-7 days)'),
        (14, 'Slightly Inactive (8-14 days)'),
        (28, 'Moderately Inactive (15-28 days)'),
        (40, 'Highly Inactive (29-40 days)'),
        (60, 'At Risk of Churn (41-60 days)'),
    )
    for upper_bound, label in buckets:
        if days <= upper_bound:
            return label
    return 'Beyond 60 days - Lost'

# Label every match row with the inactivity bucket for its age in days
filtered_match_details['time_group'] = filtered_match_details['days_diff'].apply(assign_time_group)

# Left-join the entries table to attach rank information to each match row.
# The result keeps one row per match (times any matching entries rows), so a
# summoner_id normally appears many times in combined_data.
combined_data = pd.merge(filtered_match_details, entries_df, on='summoner_id', how='left')

# Row counts per (time_group, rank) combination
grouped_results = combined_data.groupby(['time_group', 'rank']).size().reset_index(name='count')

# Select up to 100 distinct accounts for every (rank, time_group) combination
accounts_per_group = 100
selected_summoners = []

# NOTE(review): time_group is assigned per match, so one summoner can fall
# into several groups; if grouping by most recent match is intended, reduce
# days_diff per summoner before bucketing -- confirm with the author.
time_groups = filtered_match_details['time_group'].unique()  # loop-invariant

for rank in entries_df['rank'].unique():
    for group in time_groups:
        in_group = (combined_data['rank'] == rank) & (combined_data['time_group'] == group)
        # De-duplicate before counting and sampling: working on the raw
        # per-match rows would let one account fill several of the 100 slots
        # and would apply the cap to match rows instead of accounts.
        candidate_ids = combined_data.loc[in_group, 'summoner_id'].unique().tolist()
        if len(candidate_ids) > accounts_per_group:
            selected_summoners += random.sample(candidate_ids, accounts_per_group)
        else:
            selected_summoners += candidate_ids

# Restrict each table to the selected summoners; match_ids is keyed by puuid,
# so it is filtered through the puuids of the retained summoner_details rows.
filtered_entries = entries_df[entries_df['summoner_id'].isin(selected_summoners)]
filtered_summoner_details = summoner_details_df[summoner_details_df['summoner_id'].isin(selected_summoners)]
filtered_match_ids = match_ids_df[match_ids_df['puuid'].isin(filtered_summoner_details['puuid'])]

# Create the processed folder; exist_ok avoids a separate existence check
processed_folder = '../data/processed'
os.makedirs(processed_folder, exist_ok=True)

# Save the filtered data into the processed folder
filtered_entries.to_csv(f'{processed_folder}/entries.csv', index=False)
filtered_summoner_details.to_csv(f'{processed_folder}/summoner_details.csv', index=False)
filtered_match_ids.to_csv(f'{processed_folder}/match_ids.csv', index=False)

print(f"Filtered data saved in {processed_folder}")


Filtered data saved in ../data/processed
