In [7]:
import pandas as pd
import os

# Define file paths for your CSV data
match_ids_csv = '../data/raw/match_ids.csv'
summoner_details_csv = '../data/raw/summoner_details.csv'
entries_csv = '../data/raw/entries.csv'

# Ensure all CSV files exist
for file in [match_ids_csv, summoner_details_csv, entries_csv]:
    if not os.path.exists(file):
        print(f"{file} not found.")
        exit()

# Load the data into pandas DataFrames
match_ids_df = pd.read_csv(match_ids_csv)
summoner_details_df = pd.read_csv(summoner_details_csv)
entries_df = pd.read_csv(entries_csv)

# Group match_ids by puuid to count the number of matches per account
matches_per_puuid = match_ids_df.groupby('puuid')['match_id'].nunique().reset_index(name='match_count')

# Merge with summoner_details to get summoner_id
matches_per_puuid = pd.merge(matches_per_puuid, summoner_details_df[['puuid', 'summoner_id']], on='puuid', how='left')

# Merge with entries to get the rank and tier
matches_per_puuid = pd.merge(matches_per_puuid, entries_df[['summoner_id', 'rank', 'tier']], on='summoner_id', how='left')

# Group by rank, tier, and number of matches
grouped_matches = matches_per_puuid.groupby(['rank', 'tier', 'match_count']).size().reset_index(name='count')

# Display the grouped results
print(grouped_matches)

# Optionally, save the grouped results to a CSV file
grouped_matches.to_csv('../data/processed/grouped_matches.csv', index=False)


   rank  tier  match_count  count
0    II  GOLD           22      1
1    II  GOLD           25      1
2    II  GOLD           31      1
3    II  GOLD           47      1
4    II  GOLD           59     20
5   III  GOLD           14      1
6   III  GOLD           18      1
7   III  GOLD           23      1
8   III  GOLD           33      1
9   III  GOLD           34      1
10  III  GOLD           36      1
11  III  GOLD           38      1
12  III  GOLD           39      1
13  III  GOLD           52      1
14  III  GOLD           53      1
15  III  GOLD           59     24
16   IV  GOLD            9      2
17   IV  GOLD           22      1
18   IV  GOLD           59     26


In [8]:
# Filter the accounts that have exactly 59 matches
filtered_accounts = matches_per_puuid[matches_per_puuid['match_count'] == 59]

# Display the filtered results
print(filtered_accounts)

# Optionally, save the filtered results to a CSV file for further use
filtered_accounts.to_csv('../data/processed/filtered_accounts_with_59_matches.csv', index=False)


                                                puuid  match_count  \
2   2L5tVwrIogGyB7WcXKM5wSvqjO-M2MXlNkEprtMmqlR5cU...           59   
3   2qkoaHrz1dWku8VAgEjblYaprBEqgewXzamCf1QdqyqjcM...           59   
4   3jgt0zNs-YzYH703f6emUUj6K8vkeupG_8ZU1DuGsv0y8_...           59   
5   4K8FJPOjTSTa9UlEU6WvflI04u5Hk20K0-qEz0bQbePIEK...           59   
6   5IMEoc21qtIT7_uiKbtq8UdyEPL9LC7eRdMuMY-_w1qora...           59   
..                                                ...          ...   
82  ws72IxAdAs2WU0M6K7nCO8Yek9E3SEJHI0NFjSYLYzcT0h...           59   
83  xA1JKB2ujC5fpJS5Y6S_tqEq6y4tU-VA2zcNtuzUTRpFTw...           59   
84  xM-QykIXsWAJte5zqS357lnxtjqTATM3ixfS1kgTs5dHS9...           59   
85  yX-c1obh0Ax4uu-vbQQMSFjOH-rXykTin890_INSN4zHFv...           59   
86  zjVtYCLC5GkgrSq_05S0OB870najP6rd772MhlUmCgQtEH...           59   

                                         summoner_id rank  tier  
2   9lAqeClj-g6evDTejZLYCy08QbqOD8sF4wn4fW6pB1xCA5XZ   IV  GOLD  
3       gYBnbZg_XiGfW6bdIm1

In [9]:
import pandas as pd

# Define file paths for your CSV data
entries_csv = '../data/raw/entries.csv'
match_ids_csv = '../data/raw/match_ids.csv'
summoner_details_csv = '../data/raw/summoner_details.csv'

# Load the data into pandas DataFrames
entries_df = pd.read_csv(entries_csv)
match_ids_df = pd.read_csv(match_ids_csv)
summoner_details_df = pd.read_csv(summoner_details_csv)

# Get the summoner_ids of accounts that have exactly 59 matches
puuids_with_59_matches = match_ids_df.groupby('puuid')['match_id'].count()
puuids_with_59_matches = puuids_with_59_matches[puuids_with_59_matches == 59].index

# Get the corresponding summoner_ids from summoner_details.csv
summoner_ids_with_59_matches = summoner_details_df[summoner_details_df['puuid'].isin(puuids_with_59_matches)]['summoner_id']

# Filter the entries.csv for those summoner_ids
filtered_entries = entries_df[entries_df['summoner_id'].isin(summoner_ids_with_59_matches)]

# Display the filtered entries DataFrame
print(filtered_entries)

# Optionally, save the filtered entries to a new CSV file
filtered_entries.to_csv('../data/processed/filtered_entries_with_59_matches.csv', index=False)


                               league_id       queue_type  tier rank  \
0   71e2cc57-96f0-4c32-a9a6-a247d322eaf5  RANKED_SOLO_5x5  GOLD   II   
2   75bb654e-0516-4de6-aa10-bf48bc378f8f  RANKED_SOLO_5x5  GOLD   II   
3   3becfc0f-c27e-4311-95ac-94d09932fd01  RANKED_SOLO_5x5  GOLD   II   
4   2610e86e-1f23-411f-8cc7-82856ebf934a  RANKED_SOLO_5x5  GOLD   II   
6   34bd5cf8-47da-44d3-8d73-a40eaddf54cc  RANKED_SOLO_5x5  GOLD   II   
..                                   ...              ...   ...  ...   
80  b33f1937-be10-4488-8786-fd19ff68a8e5  RANKED_SOLO_5x5  GOLD   IV   
81  6b4558e6-bf86-4ab3-b6a6-16eb2a52c44b  RANKED_SOLO_5x5  GOLD   IV   
82  47f8c8c4-2470-4d5e-9239-8d4bbd6a7527  RANKED_SOLO_5x5  GOLD   IV   
83  df46043c-5c85-4c98-aba1-08dd6d2c8de8  RANKED_SOLO_5x5  GOLD   IV   
84  0526c5c5-f50f-47c6-b420-caf3b3e92bd1  RANKED_SOLO_5x5  GOLD   IV   

                                         summoner_id  league_points  wins  \
0   27e3ppC2YX1YRomMgqPJv3cUmkARZsKL4d-yzlNE0IdIVhqJ      

In [10]:
import pandas as pd
import os

# Define file paths for your CSV data
entries_csv = '../data/raw/entries.csv'
match_ids_csv = '../data/raw/match_ids.csv'
summoner_details_csv = '../data/raw/summoner_details.csv'
processed_entries_csv = '../data/processed/entries.csv'  # Processed file to append to

# Load the data into pandas DataFrames
entries_df = pd.read_csv(entries_csv)
match_ids_df = pd.read_csv(match_ids_csv)
summoner_details_df = pd.read_csv(summoner_details_csv)

# Get the summoner_ids of accounts that have exactly 59 matches
puuids_with_59_matches = match_ids_df.groupby('puuid')['match_id'].count()
puuids_with_59_matches = puuids_with_59_matches[puuids_with_59_matches == 59].index

# Get the corresponding summoner_ids from summoner_details.csv
summoner_ids_with_59_matches = summoner_details_df[summoner_details_df['puuid'].isin(puuids_with_59_matches)]['summoner_id']

# Filter the entries.csv for those summoner_ids
filtered_entries = entries_df[entries_df['summoner_id'].isin(summoner_ids_with_59_matches)]

# Load the existing processed entries CSV file
if os.path.exists(processed_entries_csv):
    processed_entries_df = pd.read_csv(processed_entries_csv)
else:
    processed_entries_df = pd.DataFrame(columns=entries_df.columns)  # Create an empty DataFrame if file doesn't exist

# Append the new filtered entries to the existing processed entries
updated_entries_df = pd.concat([processed_entries_df, filtered_entries]).drop_duplicates(subset='summoner_id')

# Save the updated entries DataFrame back to the processed folder
updated_entries_df.to_csv(processed_entries_csv, index=False)

print(f"Appended new accounts to {processed_entries_csv}")


Appended new accounts to ../data/processed/entries.csv


In [11]:
import pandas as pd
import os

# Define file paths for your CSV data
entries_csv = '../data/raw/entries.csv'
match_ids_csv = '../data/raw/match_ids.csv'
summoner_details_csv = '../data/raw/summoner_details.csv'
processed_entries_csv = '../data/processed/entries.csv'  # Processed file to append to

# Load the data into pandas DataFrames
entries_df = pd.read_csv(entries_csv)
match_ids_df = pd.read_csv(match_ids_csv)
summoner_details_df = pd.read_csv(summoner_details_csv)
processed_entries_df = pd.read_csv(processed_entries_csv)

# Step 1: Find the groups that need more accounts
current_counts = processed_entries_df.groupby(['rank', 'time_group']).size().reset_index(name='count')
needed_groups = current_counts[(current_counts['count'] < 100)]

# Step 2: Get the summoner_ids of accounts that have exactly 59 matches
puuids_with_59_matches = match_ids_df.groupby('puuid')['match_id'].count()
puuids_with_59_matches = puuids_with_59_matches[puuids_with_59_matches == 59].index

# Get the corresponding summoner_ids from summoner_details.csv
summoner_ids_with_59_matches = summoner_details_df[summoner_details_df['puuid'].isin(puuids_with_59_matches)]['summoner_id']

# Step 3: Filter the entries.csv for those summoner_ids
filtered_entries = entries_df[entries_df['summoner_id'].isin(summoner_ids_with_59_matches)]

# Step 4: Check how many accounts are missing in each group and append only those
for _, row in needed_groups.iterrows():
    rank = row['rank']
    time_group = row['time_group']
    current_count = row['count']
    missing_count = 100 - current_count
    
    # Filter accounts in the required rank and time_group
    candidates = filtered_entries[(filtered_entries['rank'] == rank) & (filtered_entries['time_group'] == time_group)]
    
    # Append only the number of missing accounts
    if len(candidates) > missing_count:
        selected_accounts = candidates.sample(n=missing_count, random_state=42)  # Randomly select the needed accounts
    else:
        selected_accounts = candidates
    
    # Append the selected accounts to the processed entries
    processed_entries_df = pd.concat([processed_entries_df, selected_accounts]).drop_duplicates(subset='summoner_id')

# Step 5: Save the updated entries DataFrame back to the processed folder
processed_entries_df.to_csv(processed_entries_csv, index=False)

print(f"Appended the necessary accounts to {processed_entries_csv}.")


KeyError: 'time_group'

In [12]:
import pandas as pd
import os

# Define file paths for your CSV data
processed_entries_csv = '../data/processed/entries.csv'
raw_entries_csv = '../data/raw/entries.csv'

# Load the data into pandas DataFrames
processed_entries_df = pd.read_csv(processed_entries_csv)
raw_entries_df = pd.read_csv(raw_entries_csv)

# Step 1: Find the groups that have fewer than 100 accounts in the processed file
current_counts = processed_entries_df.groupby(['rank']).size().reset_index(name='count')
print("Current group counts in the processed file:")
print(current_counts)

# The missing counts based on the initial results:
missing_counts = {
    ('II', 'GOLD'): 10,  # Needs 10 more accounts to reach 100
    ('III', 'GOLD'): 5,  # Needs 5 more accounts to reach 100
    ('IV', 'GOLD'): 9   # Needs 9 more accounts to reach 100
}

# Step 2: Filter raw entries based on the missing accounts
new_entries_to_add = pd.DataFrame()

for (rank, tier), missing_count in missing_counts.items():
    potential_accounts = raw_entries_df[(raw_entries_df['rank'] == rank) & (raw_entries_df['tier'] == tier)]
    
    # Check if there are enough accounts to add
    if len(potential_accounts) >= missing_count:
        selected_accounts = potential_accounts.sample(n=missing_count, random_state=1)
    else:
        selected_accounts = potential_accounts  # Add all if not enough
    
    new_entries_to_add = pd.concat([new_entries_to_add, selected_accounts])

# Step 3: Append the new accounts to the processed entries DataFrame
final_entries_df = pd.concat([processed_entries_df, new_entries_to_add]).drop_duplicates(subset='summoner_id')

# Step 4: Save the updated entries back to the processed folder
final_entries_df.to_csv(processed_entries_csv, index=False)

print(f"Updated entries saved to {processed_entries_csv}")


Current group counts in the processed file:
  rank  count
0    I    600
1   II    590
2  III    595
3   IV    591
Updated entries saved to ../data/processed/entries.csv


In [1]:
import pandas as pd
import os

# Define the file path to the processed match_ids.csv
match_ids_csv = '../data/processed/match_ids.csv'

# Ensure the match_ids CSV file exists
if not os.path.exists(match_ids_csv):
    print(f"{match_ids_csv} not found.")
    exit()

# Load the match_ids CSV into a DataFrame
match_ids_df = pd.read_csv(match_ids_csv)

# Count the number of match_ids per PUUID
matches_per_puuid = match_ids_df.groupby('puuid')['match_id'].nunique().reset_index(name='match_count')

# Display the result
print(matches_per_puuid)

# Optionally, save the result to a CSV file
# matches_per_puuid.to_csv('../data/processed/matches_per_puuid.csv', index=False)


                                                  puuid  match_count
0     -0DVCIv9JiSfm4oB2zIv19NhZH9IWMGk3-n4m6IkxRqBfV...           59
1     -1JCcBaL9-zN4y676N7Qm4LDtDq33pNXpfk6pUv33NOGiE...           43
2     -1i77tYvBS-0lVcpChDXjWBrd5liH16qJvxKRyNrg51B9S...           59
3     -21tZOzPs2yiWOXSLvyZavpds9nVBIKQczbHXdKt3XwsQt...           59
4     -2AUTpYzPKKru6-ZE6zFF5VEZFt03pRizZ_1uFYO32mrsT...           59
...                                                 ...          ...
2371  zqBhjeKnIiiiNEnQzotZ9sIVy18PAgT_hApQnB_TKeDP4v...           59
2372  zu0O-Imedl0ONaFOw_HHoCZmIrZgF3mqCkVkwBjjMPmYyH...           59
2373  zvFS1lmM0if3Kiss1utrPQNhBke-ryc3b5rjy-Ocfe_prx...           59
2374  zxmJuKE0eCnrgCwpg0Q8uQMWbmezhuk6zkw3sTEe_TR9Jk...           59
2375  zyMJb5l_r6pBPvYVNkwmaptbveI-uZZrNkmpI1QVrppQgK...           59

[2376 rows x 2 columns]


In [2]:
import pandas as pd
import os

# Define the file path to the processed match_ids.csv
match_ids_csv = '../data/processed/match_ids.csv'

# Ensure the match_ids CSV file exists
if not os.path.exists(match_ids_csv):
    print(f"{match_ids_csv} not found.")
    exit()

# Load the match_ids CSV into a DataFrame
match_ids_df = pd.read_csv(match_ids_csv)

# Count the number of match_ids per PUUID
matches_per_puuid = match_ids_df.groupby('puuid')['match_id'].nunique().reset_index(name='match_count')

# Define bins for grouping the number of matches
bins = [1, 10, 20, 30, 40, 50, 59, 60]  # Adjust these ranges as needed
labels = ['1-10', '11-20', '21-30', '31-40', '41-50', '51-59', '60+']

# Assign each account to a group based on the number of matches
matches_per_puuid['group'] = pd.cut(matches_per_puuid['match_count'], bins=bins, labels=labels, right=False)

# Count the number of PUUIDs in each group
grouped_counts = matches_per_puuid.groupby('group')['puuid'].count().reset_index(name='puuid_count')

# Display the grouped result
print(grouped_counts)

# Optionally, save the grouped result to a CSV file
# grouped_counts.to_csv('../data/processed/grouped_matches_per_puuid.csv', index=False)


   group  puuid_count
0   1-10            0
1  11-20            0
2  21-30            0
3  31-40           84
4  41-50           83
5  51-59           66
6    60+         2143


  grouped_counts = matches_per_puuid.groupby('group')['puuid'].count().reset_index(name='puuid_count')
