# Parsing League of Legends Match History
This notebook parses match history data from an HTML file and writes it to a structured JSON file for analysis. EDA and VD are then applied to the data, examining how different aspects of a match relate to the win rate (WR). Finally, a logistic regression model is fitted and evaluated with 5-fold cross-validation. The coefficients of the logistic regression are reported as well, so that one can calculate the expected WR or outcome of a hypothetical future game.

In [1]:
import json
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score, confusion_matrix
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup


In [2]:
# Input and output file paths
html_file = 'dsa210_data.html'
output_file = 'match_history.json'

In [3]:
def parse_match_history(html_path, output_path):
    """Extract per-match records from the HTML match-history table and save them as JSON.

    The table is located by its ``recentGamesTable`` class. Every direct ``<tr>`` row
    that has enough ``<td>`` cells becomes one dictionary with the keys ``champion``,
    ``outcome``, ``duration``, ``role``, ``side``, ``kills``, ``deaths``, ``assists``,
    ``kda``, ``cs``, ``kp`` and ``game_age``.

    Args:
        html_path: Path of the HTML file to read (opened as UTF-8).
        output_path: Path of the JSON file to write (opened as UTF-8, overwritten).

    Returns:
        None. When the table is not found a message is printed and nothing is written.
    """
    # The highest cell index read below is 8 (the "time ago" cell), so a usable
    # row needs at least 9 cells; shorter rows are skipped instead of raising.
    required_cells = 9

    def text_of(container, css_class, default):
        """Stripped text of the first <span class=css_class> in ``container``, else ``default``."""
        node = container.find('span', {'class': css_class}) if container is not None else None
        return node.get_text(strip=True) if node is not None else default

    # Load the HTML content
    with open(html_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Locate the match history table
    match_table = soup.find('table', {'class': 'recentGamesTable'})
    if match_table is None:
        print("Match history table not found.")
        return

    # Rows normally live in <tbody>; html.parser does not insert one, so fall
    # back to the table's own direct rows when the markup has no <tbody>.
    table_body = match_table.find('tbody')
    row_parent = table_body if table_body is not None else match_table
    rows = row_parent.find_all('tr', recursive=False)

    # Data structure to store match details
    match_data = []

    for row in rows:
        cells = row.find_all('td', recursive=False)
        if len(cells) < required_cells:
            continue

        # Champion name comes from the alt text of the image inside the first cell's link
        link = cells[0].find('a')
        image = link.find('img') if link is not None else None
        champion = image.get('alt', "Unknown") if image is not None else "Unknown"

        outcome = 'Victory' if 'Victory' in cells[2].get_text(strip=True) else 'Defeat'
        duration = cells[3].get_text(strip=True)

        # Role and side
        role, side = assign_role_and_side_to_summoner(row)

        # KDA: every missing piece defaults to 0
        kda_container = cells[4].find('div', {'class': 'kdaContainer'})
        kills = int(text_of(kda_container, 'kills', '0'))
        deaths = int(text_of(kda_container, 'deaths', '0'))
        assists = int(text_of(kda_container, 'assists', '0'))
        # With zero deaths the ratio is undefined; kills + assists is used instead
        kda = (kills + assists) / deaths if deaths else kills + assists

        # CS and KP are kept as the raw strings found in the page
        cs = text_of(cells[5], 'number', "0")
        kp = text_of(cells[5], 'killParticipation', "0%")

        # Time ago
        time_ago = parse_game_age(cells[8].get_text(strip=True))

        match_data.append({
            'champion': champion,
            'outcome': outcome,
            'duration': duration,
            'role': role,
            'side': side,
            'kills': kills,
            'deaths': deaths,
            'assists': assists,
            'kda': kda,
            'cs': cs,
            'kp': kp,
            'game_age': time_ago.strftime('%Y-%m-%dT%H:%M:%S')
        })

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(match_data, json_file, indent=4)

def assign_role_and_side_to_summoner(row):
    """Return the ``(role, side)`` pair for a match row.

    Placeholder implementation: ``row`` is not inspected and every row is
    reported as role "TOP" on the "blue" side.
    """
    placeholder_role = "TOP"
    placeholder_side = "blue"
    return placeholder_role, placeholder_side

def parse_game_age(time_ago_text):
    """Return the timestamp associated with a match's "time ago" text.

    Placeholder implementation: ``time_ago_text`` is ignored and the current
    local time is returned.
    """
    current_time = datetime.now()
    return current_time


In [4]:
parse_match_history(html_file, output_file)

In [5]:
# Load the match history data from the JSON file
input_file = 'match_history.json'

def calculate_win_rates(input_path):
    """Compute overall and per-champion win rates from the match-history JSON.

    Matches without a 'duration' key, or lasting 10 minutes or less (remakes),
    are ignored.

    Args:
        input_path: Path of the JSON file holding the list of match records.

    Returns:
        A tuple ``(champions, win_rates, overall_win_rate, match_counts, total_matches)``
        where the three lists describe the (at most) 10 most played champions,
        ordered by games played, descending.
    """
    with open(input_path, 'r', encoding='utf-8') as json_file:
        loaded = json.load(json_file)

    # Keep only real games (longer than 10 minutes)
    played = [
        entry for entry in loaded
        if 'duration' in entry and convert_duration_to_minutes(entry['duration']) > 10
    ]

    # Overall win rate
    total_matches = len(played)
    total_wins = len([entry for entry in played if entry['outcome'] == 'Victory'])
    if total_matches > 0:
        overall_win_rate = total_wins / total_matches
    else:
        overall_win_rate = 0

    # Wins and games per champion, in order of first appearance
    tally = {}
    for entry in played:
        record = tally.setdefault(entry['champion'], {'wins': 0, 'total': 0})
        record['total'] += 1
        if entry['outcome'] == 'Victory':
            record['wins'] += 1

    # Most played first; the sort is stable, so ties keep first-appearance order
    by_games_played = sorted(tally, key=lambda name: tally[name]['total'], reverse=True)
    top_names = by_games_played[:10]

    champions = list(top_names)
    win_rates = [tally[name]['wins'] / tally[name]['total'] for name in top_names]
    match_counts = [tally[name]['total'] for name in top_names]

    return champions, win_rates, overall_win_rate, match_counts, total_matches

def convert_duration_to_minutes(duration):
    """Convert a match duration such as '29min 37s' to fractional minutes.

    Handles 'XXmin XXs', minutes-only 'XXmin' and seconds-only 'XXs' strings;
    a string with neither unit yields 0.

    Args:
        duration: Duration text as stored in the match records.

    Returns:
        Minutes plus seconds / 60.
    """
    minutes, seconds = 0, 0
    remainder = duration
    if 'min' in duration:
        # Text before the first 'min' is the minute count; what follows may hold the seconds
        minute_text, _, remainder = duration.partition('min')
        minutes = int(minute_text.strip())
    if 's' in remainder:
        # Only the part after 'min' is searched, so a seconds-only value ('45s') works too
        seconds = int(remainder.split('s')[0].strip())
    return minutes + seconds / 60

def plot_champion_win_rates(champions, win_rates, overall_win_rate, match_counts, total_matches):
    """Plot games played (bars) and win rate (line) for the given champions.

    Args:
        champions: Champion names, one per bar.
        win_rates: Win rate of each champion as a fraction between 0 and 1.
        overall_win_rate: Win rate across all matches (fraction), drawn as a dashed line.
        match_counts: Games played with each champion.
        total_matches: Total number of matches, shown below the chart.
    """
    # Set up the plot with dual axes
    fig, ax1 = plt.subplots(figsize=(12, 7))

    # Horizontal bars for the number of games played (pastel blue)
    ax1.barh(champions, match_counts, color='#a3c9f1', label='Games Played', align='center')

    # Secondary x axis on top carries the win rate line (bright blue)
    ax2 = ax1.twiny()
    ax2.plot(win_rates, champions, color='#1E90FF', marker='o', label='Win Rate (%)', linestyle='-',
             linewidth=2)
    ax2.set_xlim(0, 1)  # Win rates are fractions from 0 to 1
    # Show the fractional ticks as percentages so they agree with the '(%)' axis
    # label and with the overall-win-rate legend entry, which is already in percent.
    ax2.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{value:.0%}'))

    # Add labels and titles
    ax1.set_xlabel('Games Played', fontsize=12)
    ax2.set_xlabel('Win Rate (%)', fontsize=12)
    ax1.set_ylabel('Champion', fontsize=12)
    ax1.set_title('Top 10 Most Played Champions: Win Rates vs Games Played', fontsize=14)

    # Plot the overall win rate as a vertical line (light pastel gray)
    ax2.axvline(x=overall_win_rate, color='#d1d7d8', linestyle='--',
                label=f'Overall Win Rate ({overall_win_rate * 100:.2f}%)')

    # Display the total number of games played, positioned in axes coordinates
    ax1.text(0.5, -0.12, f'Total Games Played: {total_matches}', ha='center', va='center', transform=ax1.transAxes,
             fontsize=12)

    # Each axis keeps its own legend
    ax1.legend(loc='upper left', fontsize=10)
    ax2.legend(loc='upper right', fontsize=10)
    plt.tight_layout()
    plt.show()

# Execute the analysis and plotting
champions, win_rates, overall_win_rate, match_counts, total_matches = calculate_win_rates(input_file)
plot_champion_win_rates(champions, win_rates, overall_win_rate, match_counts, total_matches)


![WR_CHAMP_PLOT](wr_champ.png)

In [7]:
def calculate_win_rates(input_path):
    """Compute win rates for champions played exactly 1, 2 or 3 times.

    Matches without a 'duration' key, or lasting 10 minutes or less (remakes),
    are ignored.

    Args:
        input_path: Path of the JSON file holding the list of match records.

    Returns:
        A tuple ``(win_rate_1, win_rate_2, win_rate_3, cumulative_wr, overall_win_rate)``.
        ``cumulative_wr`` holds three running win rates, obtained by adding the
        1-game, then 2-game, then 3-game champions; ``overall_win_rate`` covers
        every champion.
    """
    with open(input_path, 'r', encoding='utf-8') as json_file:
        loaded = json.load(json_file)

    # Drop remakes
    matches = [
        game for game in loaded
        if 'duration' in game and convert_duration_to_minutes(game['duration']) > 10
    ]

    # Wins and games per champion
    per_champion = {}
    for game in matches:
        name = game['champion']
        result = game['outcome']
        entry = per_champion.setdefault(name, {'wins': 0, 'total': 0})
        entry['total'] += 1
        if result == 'Victory':
            entry['wins'] += 1

    # One dictionary of champions per games-played bucket (1, 2, 3)
    groups = [
        {name: entry for name, entry in per_champion.items() if entry['total'] == games_played}
        for games_played in (1, 2, 3)
    ]
    win_rate_1, win_rate_2, win_rate_3 = [calculate_win_rate_for_group(group) for group in groups]

    # Running win rate as each bucket is added
    cumulative_wr = []
    running_wins = 0
    running_games = 0
    for group in groups:
        running_wins += sum(entry['wins'] for entry in group.values())
        running_games += sum(entry['total'] for entry in group.values())
        cumulative_wr.append(running_wins / running_games if running_games > 0 else 0)

    # Win rate over all champions
    all_wins = sum(entry['wins'] for entry in per_champion.values())
    all_games = sum(entry['total'] for entry in per_champion.values())
    overall_win_rate = all_wins / all_games if all_games > 0 else 0

    return win_rate_1, win_rate_2, win_rate_3, cumulative_wr, overall_win_rate

def convert_duration_to_minutes(duration):
    """Convert a match duration such as '29min 37s' to fractional minutes.

    Handles 'XXmin XXs', minutes-only 'XXmin' and seconds-only 'XXs' strings;
    a string with neither unit yields 0.

    Args:
        duration: Duration text as stored in the match records.

    Returns:
        Minutes plus seconds / 60.
    """
    minutes, seconds = 0, 0
    remainder = duration
    if 'min' in duration:
        # Text before the first 'min' is the minute count; what follows may hold the seconds
        minute_text, _, remainder = duration.partition('min')
        minutes = int(minute_text.strip())
    if 's' in remainder:
        # Only the part after 'min' is searched, so a seconds-only value ('45s') works too
        seconds = int(remainder.split('s')[0].strip())
    return minutes + seconds / 60

def calculate_win_rate_for_group(champions):
    """Return the pooled win rate of a group of champions.

    Args:
        champions: Mapping of champion name to a dict with 'wins' and 'total'.

    Returns:
        Total wins divided by total games across the group, or 0 when the
        group has no games.
    """
    wins = 0
    games = 0
    for record in champions.values():
        wins += record['wins']
        games += record['total']
    if games > 0:
        return wins / games
    return 0

def plot_win_rate_bars(win_rate_1, win_rate_2, win_rate_3, cumulative_wr, overall_win_rate):
    """Bar chart of the win rate for champions played 1, 2 or 3 times.

    Bars are drawn in category order (1, 2, 3 games) so each bar sits above its
    own tick label; the colour encodes the rank of the win rate (lightest green
    for the lowest, darkest for the highest).

    Args:
        win_rate_1: Win rate (fraction) of champions played exactly once.
        win_rate_2: Win rate (fraction) of champions played exactly twice.
        win_rate_3: Win rate (fraction) of champions played exactly three times.
        cumulative_wr: Accepted for interface compatibility; not plotted.
        overall_win_rate: Win rate across all champions, drawn as a horizontal line.
    """
    # Data for the bar plot
    categories = ['1 Game Played', '2 Games Played', '3 Games Played']
    win_rates = [win_rate_1, win_rate_2, win_rate_3]
    positions = [1, 2, 3]

    # Pastel green tones, light to dark
    green_tones = ['#A8D5BA', '#6BBF71', '#2D9A5F']

    # Assign tones by rank without moving the bars: sorting the values
    # themselves would detach each bar from the category label under it.
    ascending = sorted(range(len(win_rates)), key=lambda index: win_rates[index])
    bar_colors = [None] * len(win_rates)
    for rank, index in enumerate(ascending):
        bar_colors[index] = green_tones[rank]

    # Plotting the bar graph with thinner bars
    plt.figure(figsize=(8, 5))
    bars = plt.bar(positions, win_rates, color=bar_colors, width=0.4)

    # Adding the exact win rate on top of each bar
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.02, f'{yval:.2f}', ha='center', va='bottom', fontsize=12)

    # Plot the overall win rate for all champions as a straight black line
    plt.axhline(y=overall_win_rate, color='black', linestyle='-', label=f'Overall Win Rate: {overall_win_rate:.2f}', linewidth=2)

    # Adding labels and title
    plt.xticks(positions, categories)
    plt.xlabel('Number of Games Played')
    plt.ylabel('Average Win Rate')
    plt.title('Average Win Rate for Champions Played 1, 2, or 3 Times')

    # Display the plot
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

# Execute the analysis and plotting
win_rate_1, win_rate_2, win_rate_3, cumulative_wr, overall_win_rate = calculate_win_rates(input_file)
plot_win_rate_bars(win_rate_1, win_rate_2, win_rate_3, cumulative_wr, overall_win_rate)


![WR_CHAMP_PLOT2](wr_champ2.png)

In [9]:
# Load the parsed match history from match_history.json
# directly into a DataFrame
match_data_df = pd.read_json('match_history.json')

# Function to convert match duration from 'XXmin XXs' to total minutes
def convert_duration_to_minutes(duration):
    """Convert a match duration such as '29min 37s' to fractional minutes.

    Handles 'XXmin XXs', minutes-only 'XXmin' and seconds-only 'XXs' strings;
    a string with neither unit yields 0.

    Args:
        duration: Duration text as stored in the match records.

    Returns:
        Minutes plus seconds / 60.
    """
    minutes, seconds = 0, 0
    remainder = duration
    if 'min' in duration:
        # Text before the first 'min' is the minute count; what follows may hold the seconds
        minute_text, _, remainder = duration.partition('min')
        minutes = int(minute_text.strip())
    if 's' in remainder:
        # Only the part after 'min' is searched, so a seconds-only value ('45s') works too
        seconds = int(remainder.split('s')[0].strip())
    return minutes + seconds / 60

# Apply the duration conversion to the 'duration' column
match_data_df['duration_minutes'] = match_data_df['duration'].apply(convert_duration_to_minutes)

# Filter out games with duration between 0 and 10 minutes (remakes).
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
match_data_df = match_data_df[match_data_df['duration_minutes'] > 10].copy()

# Convert outcome to binary (1 for Victory, 0 for Defeat)
match_data_df['outcome_binary'] = (match_data_df['outcome'] == 'Victory').astype(int)

# Set the size of the plot
plt.figure(figsize=(10, 6))

# Create scatter plot with regression line
sns.regplot(x='duration_minutes', y='outcome_binary', data=match_data_df,
            scatter_kws={'s': 10, 'alpha': 0.7}, # Customize scatter points
            line_kws={"color": "blue", "alpha": 0.7, "label": 'Regression Line'})  # Customize regression line

# Calculate the regression line (slope and intercept) for the annotation below
slope, intercept, r_value, p_value, std_err = stats.linregress(match_data_df['duration_minutes'], match_data_df['outcome_binary'])

# Show the slope with extra precision so a small coefficient is not rounded to zero
equation = f'Y = {slope:.5f}X + {intercept:.2f}'

# Add annotations to clarify the scatter and regression line (axes coordinates)
plt.text(0.1, 0.9, 'Each point represents a match', fontsize=12, color='black', transform=plt.gca().transAxes)
plt.text(0.1, 0.85, 'The x-axis shows match duration in minutes', fontsize=12, color='black', transform=plt.gca().transAxes)
plt.text(0.1, 0.8, 'The y-axis shows win rate (1 = Victory, 0 = Defeat)', fontsize=12, color='black', transform=plt.gca().transAxes)
plt.text(0.1, 0.75, 'The regression line indicates the relationship between match duration and win rate', fontsize=12, color='black', transform=plt.gca().transAxes)

# Add regression equation text to the plot
plt.text(0.1, 0.7, f'Regression Equation: {equation}', fontsize=14, color='blue', transform=plt.gca().transAxes)

# Display the legend for the regression line
plt.legend(loc='lower right', fontsize=12)

# Set plot title and labels
plt.title('Match Duration vs Win Rate (Excluding Remakes)', fontsize=16)
plt.xlabel('Match Duration (minutes)', fontsize=14)
plt.ylabel('Win Rate (1 = Victory, 0 = Defeat)', fontsize=14)

# Adjust layout to avoid clipping
plt.tight_layout()

# Show the plot
plt.show()


![WR_DURATION_PLOT](wr_duration.png)

In [11]:
# Load match data from JSON file
input_file = 'match_history.json'

def load_match_data(input_path):
    """Read the UTF-8 JSON file at ``input_path`` and return its decoded content."""
    with open(input_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def parse_duration(duration_str):
    """Convert a duration string (e.g., '29min 37s') to fractional minutes.

    The seconds are included, so a '10min 30s' game counts as longer than
    10 minutes, in agreement with ``convert_duration_to_minutes`` used by the
    earlier cells. Minutes-only ('29min') and seconds-only ('45s') strings are
    accepted; a string with neither unit yields 0.
    """
    minutes, seconds = 0, 0
    remainder = duration_str
    if 'min' in duration_str:
        # Text before the first 'min' is the minute count; what follows may hold the seconds
        minute_text, _, remainder = duration_str.partition('min')
        minutes = int(minute_text.strip())
    if 's' in remainder:
        seconds = int(remainder.split('s')[0].strip())
    return minutes + seconds / 60

def prepare_data_for_plot(match_data):
    """Collect KDA values and binary outcomes for the KDA box plot.

    Matches whose 'duration' parses to 10 minutes or less are skipped; matches
    without a 'duration' key are kept.

    Args:
        match_data: Iterable of match dictionaries with 'kda' and 'outcome' keys.

    Returns:
        ``(kda_values, win_rates)``: two parallel lists, where ``win_rates``
        holds 1 for a 'Victory' and 0 otherwise.
    """
    kda_values, win_rates = [], []

    for game in match_data:
        is_remake = 'duration' in game and parse_duration(game['duration']) <= 10
        if not is_remake:
            kda_values.append(game['kda'])
            win_rates.append(int(game['outcome'] == 'Victory'))  # 1 for win, 0 for loss

    return kda_values, win_rates

def plot_box_plot(kda_values, win_rates):
    """Draw a box plot of KDA split by match outcome.

    Args:
        kda_values: KDA value of each match.
        win_rates: Parallel list of outcomes, 1 for a victory and anything else
            for a defeat.
    """
    # Create a DataFrame for plotting, with the outcome as a category label
    data = pd.DataFrame({
        'KDA': kda_values,
        'Win Rate': ['Victory' if rate == 1 else 'Defeat' for rate in win_rates]
    })

    # Set up the boxplot using Seaborn
    plt.figure(figsize=(10, 6))
    # Pin the box order: without it the order (and therefore which outcome gets
    # the red end of the red-to-green palette) follows whichever outcome happens
    # to come first in the data.
    sns.boxplot(x='Win Rate', y='KDA', data=data, order=['Defeat', 'Victory'], palette='RdYlGn')

    # Add title and labels
    plt.title('Box Plot of KDA vs Win Rate (Victory vs Defeat)', fontsize=14)
    plt.xlabel('Win Rate', fontsize=12)
    plt.ylabel('KDA (Kills + Assists) / Deaths', fontsize=12)

    # Show the plot
    plt.tight_layout()
    plt.show()

# Load match data
match_data = load_match_data(input_file)

# Prepare KDA values and win/loss outcomes
kda_values, win_rates = prepare_data_for_plot(match_data)

# Plot the box plot
plot_box_plot(kda_values, win_rates)


![WR_KDA_PLOT](wr_kda.png)

In [13]:
# Load the match history data from the JSON file
input_file = 'match_history.json'

def calculate_role_win_rates(input_path):
    """Compute the win rate and number of games for each role.

    Matches without a 'duration' key, or whose parsed duration is 10 or less,
    are ignored.

    Args:
        input_path: Path of the JSON file holding the list of match records.

    Returns:
        ``(roles, win_rates, game_counts)``: three parallel lists ordered by
        win rate, highest first.
    """
    with open(input_path, 'r', encoding='utf-8') as json_file:
        loaded = json.load(json_file)

    # Drop remakes
    games = [
        game for game in loaded
        if 'duration' in game and parse_duration(game['duration']) > 10
    ]

    # Wins and games per role, in order of first appearance
    tally = {}
    for game in games:
        role = game['role']
        outcome = game['outcome']
        counts = tally.setdefault(role, {'wins': 0, 'total': 0})
        counts['total'] += 1
        if outcome == 'Victory':
            counts['wins'] += 1

    rates = {role: counts['wins'] / counts['total'] for role, counts in tally.items()}

    # Best win rate first; the sort is stable, so ties keep first-appearance order
    ranked_roles = sorted(rates, key=lambda role: rates[role], reverse=True)

    roles = list(ranked_roles)
    win_rates = [rates[role] for role in ranked_roles]
    game_counts = [tally[role]['total'] for role in ranked_roles]

    return roles, win_rates, game_counts

def parse_duration(duration_str):
    """Convert a duration string (e.g., '29min 37s') to fractional minutes.

    The seconds are included, so a '10min 30s' game counts as longer than
    10 minutes, in agreement with ``convert_duration_to_minutes`` used by the
    earlier cells. Minutes-only ('29min') and seconds-only ('45s') strings are
    accepted; a string with neither unit yields 0.
    """
    minutes, seconds = 0, 0
    remainder = duration_str
    if 'min' in duration_str:
        # Text before the first 'min' is the minute count; what follows may hold the seconds
        minute_text, _, remainder = duration_str.partition('min')
        minutes = int(minute_text.strip())
    if 's' in remainder:
        seconds = int(remainder.split('s')[0].strip())
    return minutes + seconds / 60

def plot_role_win_rates(roles, win_rates, game_counts):
    """Bar chart of win rate per role, annotated with the games played.

    Args:
        roles: Role names, one per bar.
        win_rates: Win rate of each role (fraction).
        game_counts: Games played in each role, same order as ``roles``.
    """
    # Shades of blue, light to dark
    blue_shades = ['#A6C8FF', '#7FA9E6', '#4F8BD7', '#2E6BB1', '#1F4A8F']

    fig, ax = plt.subplots(figsize=(10, 6))
    role_bars = ax.bar(roles, win_rates, color=blue_shades, edgecolor='black')

    # Win rate printed just above each bar
    for rect in role_bars:
        top = rect.get_height()
        center_x = rect.get_x() + rect.get_width() / 2
        ax.text(center_x, top + 0.02, f'{top:.2f}', ha='center', va='bottom', fontsize=10)

    # Games played printed below the baseline of each bar
    for position, rect in enumerate(role_bars):
        center_x = rect.get_x() + rect.get_width() / 2
        ax.text(center_x, -0.1, f'Games: {game_counts[position]}', ha='center', va='top', fontsize=10, color='black')

    # Title and axis labels
    ax.set_title('Win Rate by Role (Excluding Remakes)', fontsize=14)
    ax.set_xlabel('Role', fontsize=12)
    ax.set_ylabel('Win Rate (%)', fontsize=12)

    # Rotate the role names so they do not overlap
    ax.set_xticklabels(roles, rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

# Execute the analysis and plotting
roles, win_rates, game_counts = calculate_role_win_rates(input_file)
plot_role_win_rates(roles, win_rates, game_counts)


![WR_ROLE_PLOT](wr_role.png)

In [15]:
# Load the match history data from the JSON file
input_file = 'match_history.json'

def parse_duration(duration_str):
    """Convert a duration string (e.g., '29min 37s') to fractional minutes.

    The seconds are included, so a '10min 30s' game counts as longer than
    10 minutes, in agreement with ``convert_duration_to_minutes`` used by the
    earlier cells. Minutes-only ('29min') and seconds-only ('45s') strings are
    accepted; a string with neither unit yields 0.
    """
    minutes, seconds = 0, 0
    remainder = duration_str
    if 'min' in duration_str:
        # Text before the first 'min' is the minute count; what follows may hold the seconds
        minute_text, _, remainder = duration_str.partition('min')
        minutes = int(minute_text.strip())
    if 's' in remainder:
        seconds = int(remainder.split('s')[0].strip())
    return minutes + seconds / 60

def load_and_prepare_data(input_path):
    """Load the match history and return the average KDA and win rate per role.

    Matches without a 'duration' key, or whose parsed duration is 10 or less,
    are ignored.

    Args:
        input_path: Path of the JSON file holding the list of match records.

    Returns:
        A DataFrame with the columns 'Role', 'avg_kda' and 'avg_wr', one row per role.
    """
    with open(input_path, 'r', encoding='utf-8') as json_file:
        loaded = json.load(json_file)

    # Drop remakes
    kept = [
        game for game in loaded
        if 'duration' in game and parse_duration(game['duration']) > 10
    ]

    # One record per match; the outcome is encoded as 1 (Victory) or 0
    records = [
        {
            'Role': game['role'],
            'KDA': game['kda'],
            'Win Rate': 1 if game['outcome'] == 'Victory' else 0
        }
        for game in kept
    ]
    frame = pd.DataFrame(records)

    # Mean KDA and mean outcome (i.e. win rate) per role
    per_role = frame.groupby('Role').agg(
        avg_kda=('KDA', 'mean'),
        avg_wr=('Win Rate', 'mean')
    )
    return per_role.reset_index()

def plot_role_kda_vs_wr(role_stats):
    """Scatter plot of average win rate against average KDA, one point per role.

    Args:
        role_stats: DataFrame with the columns 'Role', 'avg_kda' and 'avg_wr'.
    """
    # Set up the plot
    plt.figure(figsize=(10, 6))

    # Use Seaborn to create a scatter plot, coloured by role
    sns.scatterplot(x='avg_kda', y='avg_wr', data=role_stats, hue='Role', palette='coolwarm', s=100)

    # Add labels and title. KDA is the (Kills + Assists) / Deaths ratio stored
    # in the match records, not a sum of the three counts.
    plt.xlabel('Average KDA ((Kills + Assists) / Deaths)', fontsize=12)
    plt.ylabel('Average Win Rate', fontsize=12)
    plt.title('Relationship between KDA and Win Rate by Role', fontsize=14)

    # Show the plot
    plt.tight_layout()
    plt.show()

# Load and prepare data
role_stats = load_and_prepare_data(input_file)

# Print the DataFrame for reference
print(role_stats)

# Plot the relationship between KDA and WR for each role
plot_role_kda_vs_wr(role_stats)


![WR_ROLE_BASED_KDA_PLOT](wr_rolebasedkda.png)

In [17]:
# Load the match history data from the JSON file
input_file = 'match_history.json'

def parse_duration(duration_str):
    """Convert a duration string (e.g., '29min 37s') to fractional minutes.

    The seconds are included, so a '10min 30s' game counts as longer than
    10 minutes, in agreement with ``convert_duration_to_minutes`` used by the
    earlier cells. Minutes-only ('29min') and seconds-only ('45s') strings are
    accepted; a string with neither unit yields 0.
    """
    minutes, seconds = 0, 0
    remainder = duration_str
    if 'min' in duration_str:
        # Text before the first 'min' is the minute count; what follows may hold the seconds
        minute_text, _, remainder = duration_str.partition('min')
        minutes = int(minute_text.strip())
    if 's' in remainder:
        seconds = int(remainder.split('s')[0].strip())
    return minutes + seconds / 60

def parse_kp(kp_str):
    """Convert a KP percentage string (e.g., '70%') to a fraction (0.70).

    An empty or missing value yields 0.
    """
    if not kp_str:
        return 0
    return float(kp_str.strip('%')) / 100

def load_and_prepare_data(input_path):
    """Load the match history and return the average KP and win rate per role.

    Matches without a 'duration' key, or whose parsed duration is 10 or less,
    are ignored.

    Args:
        input_path: Path of the JSON file holding the list of match records.

    Returns:
        A DataFrame with the columns 'Role', 'avg_kp' and 'avg_wr', one row per role.
    """
    with open(input_path, 'r', encoding='utf-8') as json_file:
        loaded = json.load(json_file)

    # Drop remakes
    kept = [
        game for game in loaded
        if 'duration' in game and parse_duration(game['duration']) > 10
    ]

    # One record per match: KP as a fraction, outcome as 1 (Victory) or 0
    records = []
    for game in kept:
        role = game['role']
        kp_text = game['kp']  # stored as a string like '70%'
        outcome = game['outcome']
        records.append({
            'Role': role,
            'KP': parse_kp(kp_text),
            'Win Rate': 1 if outcome == 'Victory' else 0
        })
    frame = pd.DataFrame(records)

    # Mean KP and mean outcome (i.e. win rate) per role
    per_role = frame.groupby('Role').agg(
        avg_kp=('KP', 'mean'),
        avg_wr=('Win Rate', 'mean')
    )
    return per_role.reset_index()

def plot_role_kp_vs_wr(role_stats):
    """Scatter plot of average win rate against average kill participation per role.

    Args:
        role_stats: DataFrame with the columns 'Role', 'avg_kp' and 'avg_wr'.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # One point per role, coloured by role
    sns.scatterplot(data=role_stats, x='avg_kp', y='avg_wr', hue='Role', palette='coolwarm', s=100, ax=ax)

    ax.set_xlabel('Average KP (Kill Participation)', fontsize=12)
    ax.set_ylabel('Average Win Rate', fontsize=12)
    ax.set_title('Relationship between KP and Win Rate by Role', fontsize=14)

    plt.tight_layout()
    plt.show()

# Load and prepare data
role_stats = load_and_prepare_data(input_file)

# Print the DataFrame for reference
print(role_stats)

# Plot the relationship between KP and WR for each role
plot_role_kp_vs_wr(role_stats)


![WR_ROLE_BASED_KP_PLOT](wr_rolebasedkp.png)

In [19]:
def load_and_preprocess_data(input_file):
    """Build the feature matrix and target vector for the win-prediction model.

    Matches without a 'duration' key, or lasting 10 minutes or less (remakes),
    are ignored. Each remaining match yields one feature row
    ``[kp, kda, duration, TOP, JGL, MID, ADC, SUP]`` where ``kp`` is a fraction,
    ``duration`` is in fractional minutes and the last five entries one-hot
    encode the role (all zeros for a role outside that list).

    Args:
        input_file: Path of the JSON file holding the list of match records.

    Returns:
        ``(X, y)`` as NumPy arrays; ``y`` holds 1 for a 'Victory' and 0 otherwise.
    """
    with open(input_file, 'r', encoding='utf-8') as json_file:
        match_data = json.load(json_file)

    def parse_duration(duration_str):
        """Return the duration in fractional minutes ('29min 30s' -> 29.5)."""
        minutes, seconds = 0, 0
        remainder = duration_str
        if 'min' in duration_str:
            minute_text, _, remainder = duration_str.partition('min')
            minutes = int(minute_text.strip())
        if 's' in remainder:
            seconds = int(remainder.split('s')[0].strip())
        # Keeping the seconds makes the remake cut-off agree with the
        # fractional-minute filter used by the earlier cells.
        return minutes + seconds / 60

    X = []
    y = []

    roles = ['TOP', 'JGL', 'MID', 'ADC', 'SUP']
    for match in match_data:
        if 'duration' not in match:
            continue
        # Parsed once and reused both for the remake filter and as a feature
        duration = parse_duration(match['duration'])
        if duration <= 10:
            continue

        kp = float(match['kp'].strip('%')) / 100  # Convert KP to a fraction
        kda = match['kda']
        role = match['role']
        outcome = 1 if match['outcome'] == 'Victory' else 0

        role_encoding = [1 if role == r else 0 for r in roles]

        features = [kp, kda, duration] + role_encoding
        X.append(features)
        y.append(outcome)

    return np.array(X), np.array(y)

input_file = 'match_history.json'
X, y = load_and_preprocess_data(input_file)

# Standardize the features, then fit a logistic regression on them
model = Pipeline([
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(max_iter=1000))
])

# 5-fold cross-validation: per-fold accuracy, hard predictions and win probabilities
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
cv_predictions = cross_val_predict(model, X, y, cv=5, method='predict')
cv_probabilities = cross_val_predict(model, X, y, cv=5, method='predict_proba')[:, 1]

accuracy = np.mean(cv_scores)
# NOTE: computed on hard 0/1 predictions, so this MSE is the misclassification rate
mse = mean_squared_error(y, cv_predictions)
auc = roc_auc_score(y, cv_probabilities)

print(f'Cross-Validation Accuracy: {accuracy:.2f}')
print(f'Cross-Validation MSE: {mse:.2f}')
print(f'Cross-Validation AUC: {auc:.2f}')

# Accuracy of each fold against the mean
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cv_scores) + 1), cv_scores, marker='o', label='Accuracy per Fold', color='blue')
plt.axhline(y=accuracy, color='red', linestyle='--', label='Mean Accuracy')
plt.title('Cross-Validation Accuracy Across Folds')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Calibration of the cross-validated probabilities
fraction_of_positives, mean_predicted_value = calibration_curve(y, cv_probabilities, n_bins=10)

plt.figure(figsize=(10, 6))
plt.plot(mean_predicted_value, fraction_of_positives, marker='o', label='Calibration Curve', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', label='Perfect Calibration', color='red')
plt.title('Calibration Curve')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.legend()
plt.show()

# Confusion matrix of the cross-validated predictions
cm = confusion_matrix(y, cv_predictions)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Defeat', 'Victory'], yticklabels=['Defeat', 'Victory'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()

# Refit on all data to report the coefficients
model.fit(X, y)
scaler = model.named_steps['scaler']
log_reg = model.named_steps['log_reg']
# These apply to the standardized features (z = (x - mean) / scale)
intercept = log_reg.intercept_[0]
coefficients = log_reg.coef_[0]

# Convert back to raw feature units so the printed formula can be evaluated
# with plain KP / KDA / Duration / role values:
#   b0 + sum(c_i * (x_i - mean_i) / scale_i)
#     = (b0 - sum(c_i * mean_i / scale_i)) + sum((c_i / scale_i) * x_i)
# StandardScaler sets scale_ to 1 for constant features, so the division is safe.
raw_coefficients = coefficients / scaler.scale_
raw_intercept = intercept - np.sum(coefficients * scaler.mean_ / scaler.scale_)

roles = ['TOP', 'JGL', 'MID', 'ADC', 'SUP']
feature_names = ['KP', 'KDA', 'Duration'] + roles

# The bar chart keeps the standardized coefficients: they share one scale and
# can therefore be compared with each other.
plt.figure(figsize=(10, 6))
plt.bar(feature_names, coefficients, color='skyblue', edgecolor='black')
plt.axhline(0, color='red', linestyle='--', linewidth=1)
plt.title('Logistic Regression Coefficients')
plt.xlabel('Features')
plt.ylabel('Coefficient Value')
plt.xticks(rotation=45)
plt.tight_layout()

for i, coef in enumerate(coefficients):
    plt.text(i, coef, f'{coef:.2f}', ha='center', va='bottom' if coef > 0 else 'top')
plt.show()

# Formula in raw feature units; four decimals so small per-minute effects stay visible
print("Logistic Regression Formula:")
formula = f"Log-odds = {raw_intercept:.4f} + " + " + ".join(
    [f"{coef:.4f}*{name}" for coef, name in zip(raw_coefficients, feature_names)]
)
print(formula)

![REGRESSION_PLOT1](regression_model1.png)

![REGRESSION_PLOT2](regression_model2.png)

![REGRESSION_PLOT3](regression_model3.png)

![REGRESSION_PLOT4](regression_model4.png)