# Parsing League of Legends Match History
This notebook parses match history data from an HTML file and writes it to a structured JSON file for analysis. EDA and VD are then applied to the data, and the relationship between different aspects of the data and the win rate (WR) is examined. Finally, a logistic regression model is fitted and evaluated with cross-validation, with the data set divided into 5 sets. The coefficients of the logistic regression are supplied as well, so that one can calculate the expected WR or outcome of a hypothetical future game.

In [1]:
import json
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score, confusion_matrix
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup


In [2]:
# Input and output file paths (plain relative paths, resolved against the
# current working directory when opened)
# HTML page that parse_match_history reads
html_file = 'dsa210_data.html'
# JSON file that parse_match_history writes the parsed matches to
output_file = 'match_history.json'

In [3]:
def parse_match_history(html_path, output_path):
    """Parse the match-history table of an HTML file into a JSON file.

    The first ``<table class="recentGamesTable">`` is located and every row of
    it that has enough cells becomes one dictionary with the keys
    ``champion``, ``outcome``, ``duration``, ``role``, ``side``, ``kills``,
    ``deaths``, ``assists``, ``kda``, ``cs``, ``kp`` and ``game_age``.
    The resulting list is written to ``output_path`` as indented JSON.

    Missing optional markup (champion image, KDA container, CS / KP spans)
    falls back to a default value instead of raising.

    Args:
        html_path: Path of the HTML file to read (opened as UTF-8).
        output_path: Path of the JSON file to write (opened as UTF-8).

    Returns:
        None. When the table cannot be found a message is printed and no
        file is written.
    """
    # Highest cell index read from a row (cells[8] holds the "time ago" text).
    last_cell_index = 8

    with open(html_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    match_table = soup.find('table', {'class': 'recentGamesTable'})
    if match_table is None:
        print("Match history table not found.")
        return

    # A table may carry its rows directly, without an explicit <tbody>;
    # in that case read the rows from the table element itself.
    table_body = match_table.find('tbody')
    if table_body is None:
        table_body = match_table
    rows = table_body.find_all('tr', recursive=False)

    match_data = []
    for row in rows:
        cells = row.find_all('td', recursive=False)

        # Every index used below must exist; shorter rows are skipped rather
        # than failing with IndexError part-way through the table.
        if len(cells) <= last_cell_index:
            continue

        # Champion, outcome, and duration
        champion = _champion_name(cells[0])
        outcome = 'Victory' if 'Victory' in cells[2].get_text(strip=True) else 'Defeat'
        duration = cells[3].get_text(strip=True)

        # Role and side
        role, side = assign_role_and_side_to_summoner(row)

        # KDA: deaths == 0 would divide by zero, so kills + assists is used as-is
        kda_container = cells[4].find('div', {'class': 'kdaContainer'})
        kills = int(_span_text(kda_container, 'kills', '0'))
        deaths = int(_span_text(kda_container, 'deaths', '0'))
        assists = int(_span_text(kda_container, 'assists', '0'))
        kda = (kills + assists) / deaths if deaths else kills + assists

        # CS and KP (kept as the raw strings found in the page)
        cs = _span_text(cells[5], 'number', "0")
        kp = _span_text(cells[5], 'killParticipation', "0%")

        # Time ago
        time_ago = parse_game_age(cells[last_cell_index].get_text(strip=True))

        match_data.append({
            'champion': champion,
            'outcome': outcome,
            'duration': duration,
            'role': role,
            'side': side,
            'kills': kills,
            'deaths': deaths,
            'assists': assists,
            'kda': kda,
            'cs': cs,
            'kp': kp,
            'game_age': time_ago.strftime('%Y-%m-%dT%H:%M:%S')
        })

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(match_data, json_file, indent=4)


def _span_text(container, css_class, default):
    """Return the stripped text of the first <span class=css_class> in container, or default."""
    if container is None:
        return default
    span = container.find('span', {'class': css_class})
    return span.get_text(strip=True) if span is not None else default


def _champion_name(cell):
    """Return the alt text of the image inside the cell's first link, or "Unknown"."""
    link = cell.find('a')
    image = link.find('img') if link is not None else None
    if image is None:
        return "Unknown"
    return image.get('alt', "Unknown")

def assign_role_and_side_to_summoner(row):
    """Return the ``(role, side)`` pair for a match row.

    Placeholder implementation: ``row`` is not inspected and every match is
    reported as role ``"TOP"`` on side ``"blue"``.
    """
    role = "TOP"
    side = "blue"
    return role, side

def parse_game_age(time_ago_text):
    """Return the timestamp to record for a match.

    Placeholder implementation: ``time_ago_text`` is ignored and the current
    local time is returned.
    """
    current_time = datetime.now()
    return current_time


In [4]:
# Run the parser: read the HTML page and write the structured JSON file
parse_match_history(html_path=html_file, output_path=output_file)

In [5]:
# Path of the JSON match history that calculate_win_rates loads
# (same file name as the parser's output_file)
input_file = 'match_history.json'

def calculate_win_rates(input_path, min_duration_minutes=10, top_n=10):
    """Compute overall and per-champion win rates from a JSON match history.

    Matches without a ``duration`` key, or whose duration is not strictly
    longer than ``min_duration_minutes``, are treated as remakes and ignored.

    Args:
        input_path: Path of the JSON file holding a list of match dicts with
            at least the keys ``champion``, ``outcome`` and ``duration``.
        min_duration_minutes: Matches must last longer than this many minutes
            to be counted. Defaults to 10.
        top_n: Maximum number of most-played champions to report.
            Defaults to 10.

    Returns:
        A tuple ``(champions, win_rates, overall_win_rate, match_counts,
        total_matches)`` where the three lists are aligned and ordered by
        games played (descending), win rates are fractions (wins / games),
        and ``overall_win_rate`` is 0 when no match is counted.
    """
    with open(input_path, 'r', encoding='utf-8') as json_file:
        all_matches = json.load(json_file)

    # Filter out remakes (matches that did not last long enough)
    counted_matches = [
        match for match in all_matches
        if 'duration' in match
        and convert_duration_to_minutes(match['duration']) > min_duration_minutes
    ]

    # Overall win rate
    total_matches = len(counted_matches)
    total_wins = sum(1 for match in counted_matches if match['outcome'] == 'Victory')
    overall_win_rate = total_wins / total_matches if total_matches > 0 else 0

    # Wins and games per champion, in order of first appearance
    champion_records = {}
    for match in counted_matches:
        record = champion_records.setdefault(match['champion'], {'wins': 0, 'total': 0})
        record['total'] += 1
        if match['outcome'] == 'Victory':
            record['wins'] += 1

    # Most-played champions first; sorted() is stable, so champions with the
    # same number of games keep their order of first appearance.
    champions = sorted(
        champion_records,
        key=lambda name: champion_records[name]['total'],
        reverse=True,
    )[:top_n]

    # Data for plotting, aligned with `champions`
    win_rates = [
        champion_records[name]['wins'] / champion_records[name]['total']
        for name in champions
    ]
    match_counts = [champion_records[name]['total'] for name in champions]

    return champions, win_rates, overall_win_rate, match_counts, total_matches

def convert_duration_to_minutes(duration):
    """Convert a match duration such as 'XXmin XXs' to total minutes.

    Either part may be absent: '25min' gives 25.0 and '45s' gives 0.75.
    A part whose number is missing counts as 0.

    Args:
        duration: Duration text, e.g. '25min 30s'.

    Returns:
        The duration in minutes as a float.

    Raises:
        ValueError: If a present minutes or seconds part is not an integer.
    """
    minutes_text, separator, remainder = duration.partition('min')
    if not separator:
        # No 'min' marker: the whole text can only describe seconds.
        minutes_text, remainder = '', duration

    # Only read seconds when an 's' marker follows the minutes part.
    seconds_text = remainder.partition('s')[0] if 's' in remainder else ''

    minutes_text = minutes_text.strip()
    seconds_text = seconds_text.strip()
    minutes = int(minutes_text) if minutes_text else 0
    seconds = int(seconds_text) if seconds_text else 0
    return minutes + seconds / 60

def plot_champion_win_rates(champions, win_rates, overall_win_rate, match_counts, total_matches):
    """Plot games played (bars) and win rate (line) for a list of champions.

    Args:
        champions: Champion names, one per bar.
        win_rates: Win rate of each champion as a fraction in [0, 1],
            aligned with ``champions``.
        overall_win_rate: Overall win rate as a fraction in [0, 1]; drawn as
            a dashed vertical reference line.
        match_counts: Games played per champion, aligned with ``champions``.
        total_matches: Total number of games, shown in a caption below the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # The win-rate axis and legend are labelled in percent, so convert the
    # incoming fractions; otherwise a 55% win rate would be drawn at "0.55 %".
    win_rates_pct = [rate * 100 for rate in win_rates]
    overall_win_rate_pct = overall_win_rate * 100

    # Set up the plot with dual axes
    fig, ax1 = plt.subplots(figsize=(12, 7))

    # Horizontal bars for the number of games played (pastel blue)
    ax1.barh(champions, match_counts, color='#a3c9f1', label='Games Played', align='center')

    # Second x-axis on top, sharing the champion y-axis, for the win rate line
    ax2 = ax1.twiny()
    ax2.plot(win_rates_pct, champions, color='#1E90FF', marker='o', label='Win Rate (%)', linestyle='-',
             linewidth=2)
    ax2.set_xlim(0, 100)  # Win rates in percent

    # Add labels and titles
    ax1.set_xlabel('Games Played', fontsize=12)
    ax2.set_xlabel('Win Rate (%)', fontsize=12)
    ax1.set_ylabel('Champion', fontsize=12)
    ax1.set_title('Top 10 Most Played Champions: Win Rates vs Games Played', fontsize=14)

    # Overall win rate as a vertical reference line (light pastel gray)
    ax2.axvline(x=overall_win_rate_pct, color='#d1d7d8', linestyle='--',
                label=f'Overall Win Rate ({overall_win_rate_pct:.2f}%)')

    # Caption with the total number of games, placed below the axes
    ax1.text(0.5, -0.12, f'Total Games Played: {total_matches}', ha='center', va='center', transform=ax1.transAxes,
             fontsize=12)

    # Display the plot
    ax1.legend(loc='upper left', fontsize=10)
    ax2.legend(loc='upper right', fontsize=10)
    plt.tight_layout()
    plt.show()

# Compute the per-champion statistics, then draw the chart from them
(
    champions,
    win_rates,
    overall_win_rate,
    match_counts,
    total_matches,
) = calculate_win_rates(input_file)

plot_champion_win_rates(
    champions=champions,
    win_rates=win_rates,
    overall_win_rate=overall_win_rate,
    match_counts=match_counts,
    total_matches=total_matches,
)
