# Installations and Imports

In [1]:
!pip install openai==0.28



In [2]:
import ast
import csv
import json
import os
import random
import re
import time
from io import BytesIO

import gdown
import openai
import pandas as pd
import requests
from tqdm import tqdm

# Data Collection

The raw match data is stored in our Google Drive as public .xls and .xlsx files. The code below downloads them and merges them into a single .csv file. (There is no need to run this again: the merged CSV file is already available through the Google Drive link used in the next section.)

In [None]:
# Google Drive file IDs of the source workbooks, one ID per file.
# Each ID is inserted into a `https://drive.google.com/uc?id=<ID>` download URL by the loop below.
file_ids = [
    '1k84aAAxrEQak7fgXXT1_ug2idu_B9a-m',
    '17cq-EPUq-c7mqqmDzouNsZ2jeIVeH04b',
    '1B5wEALt69dv3M3yFlgPjSvI5rK-eukzQ',
    '1PVFv4QPGNED3J01QXg-eyhAUNb1wZxoe',
    '1d8ozmri6Q5pURnAcCBfPDoT7G0ivNiKw',
    '1KL32QDe5YBEiG0WX2PuGowqr3yVzDbUe',
    '1RqdrLE1NuUjqCTSEVA4N_t252Wxp7nYZ',
    '1P_biFh4nCdCgS-2pEJI3KRlVEHAtWY61',
    '1HqkuYpiKP97QpMVnroPDoZfRPPviNBRM',
    '1fHEVlftkaAha4B1tTaeTeQ2KdKaIElHa',
    '11mxza_ukPVqP-2n7yJ6JqgzPA-48knAr',
    '1pv63r_Lr0dSJY8R6hdHFvROBAqeWi8aa',
    '1BiNdh8p1Zf6SsrYRfvJN4b4p2OHhYxpk',
    '18FgZMmRbYPsMmvdNUWLfKtgggfZrHWvM',
    '13_CQeYTdQDahocTXt4wNu6iMYtDX8maS',
    '1ohTWOm0dVqB26ppHGmNKyUFr6_lhU_Yi',
    '1rf7jY55s7ndpjxYFpqKiwv8AhXngoFoF',
    '1aNfVC27OKfDneov3Ushvf8Qe-wu2JGKW',
    '1s50rwFLVPdjXqjOSVuRQz1bzvsmbQ2o8',
    '1MgruYiUcxfHQQM5nbLHdhgz_NmZtVeBu',
    '1hQ-2WkY5H1IWn351b74rmA5pghlUFDVS',
    '19dfdFDHbe4DTYdfphG4QigvvXVRZ2bdT',
    '1Dtl5CzqenuLw247HbSD4yYXWTfmo7nH-'
]

# Download every workbook listed in `file_ids` and merge them into one CSV.

# Abort a stalled download instead of letting the cell hang indefinitely.
REQUEST_TIMEOUT_S = 60

# Parsed workbooks are collected here and concatenated once at the end;
# concatenating inside the loop re-copies all previous rows on every iteration.
frames = []

for file_id in tqdm(file_ids):
    # Construct the direct download URL
    download_url = f'https://drive.google.com/uc?id={file_id}'

    # Download the file; a network failure skips this file rather than killing the whole run
    try:
        response = requests.get(download_url, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as e:
        print(f"Failed to download file with ID {file_id}: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to download file with ID {file_id}")
        continue

    try:
        # Let pandas pick the Excel engine first
        df = pd.read_excel(BytesIO(response.content))
    except ValueError:
        try:
            # If that fails, retry explicitly with the xlrd engine (older .xls files)
            df = pd.read_excel(BytesIO(response.content), engine='xlrd')
        except Exception as e:
            print(f"Error reading file with ID {file_id}: {e}")
            continue

    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was read
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Export the combined DataFrame to CSV
all_data.to_csv('combined_tennis_matches.csv', index=False)


# Data Preprocessing

Load the merged CSV file into a pandas DataFrame (the file is downloaded from the Google Drive link below).

In [8]:
# Fetch the merged CSV from Google Drive into the working directory
file_id = '1nlzwPoTlosd4qGW9mA1by05UK6xT06n-'
url = f'https://drive.google.com/uc?id={file_id}'
output_file = 'combined_tennis_matches.csv'
gdown.download(url, output_file, quiet=False)

# Load it into a DataFrame
df = pd.read_csv(output_file, low_memory=False)

# Columns that must be numeric; values that cannot be parsed become NaN
numeric_cols = ['ATP', 'Best of', 'WRank', 'LRank', 'WPts', 'LPts',
                'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5',
                'Wsets', 'Lsets', 'B365W', 'B365L', 'PSW', 'PSL',
                'MaxW', 'MaxL', 'AvgW', 'AvgL']

df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


Downloading...
From: https://drive.google.com/uc?id=1nlzwPoTlosd4qGW9mA1by05UK6xT06n-
To: /content/combined_tennis_matches.csv
100%|██████████| 482M/482M [00:04<00:00, 100MB/s]


In [9]:
# Sanity check: number of rows, then the available columns
print(df.shape[0])
print(df.columns)

830760
Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL', 'EXW', 'EXL', 'LBW', 'LBL', 'SJW', 'SJL', 'UBW', 'UBL',
       'pl1_flag', 'pl1_year_pro', 'pl1_weight', 'pl1_height', 'pl1_hand',
       'pl2_flag', 'pl2_year_pro', 'pl2_weight', 'pl2_height', 'pl2_hand',
       'Summary'],
      dtype='object')


## Build samples to fine-tune the LLM (ChatCompletion format for gpt-3.5-turbo, Completion format for davinci)

In [11]:
# RUN FOR TURBO

def create_chat_format(row, rng=None):
    """Turn one match record into a (context, question, answer) chat sample.

    The two players are presented in random order so that the position of a
    name in the prompt does not reveal who won.

    Args:
        row: Mapping-like match record (e.g. a DataFrame row) providing the
            keys 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
            'B365W', 'B365L', 'Date', 'Tournament' and 'Surface'.
        rng: Optional object with a ``choice(seq)`` method (for example a
            seeded ``random.Random``) used to pick the player order, which
            makes the output reproducible. Defaults to the global ``random``
            module.

    Returns:
        list: ``[context, user_question, assistant_answer]`` as strings.
    """
    chooser = random if rng is None else rng

    # Keys holding name / rank / points / odds for each side of the match
    winner_keys = ('Winner', 'WRank', 'WPts', 'B365W')
    loser_keys = ('Loser', 'LRank', 'LPts', 'B365L')

    # Randomly choose whether to list winner or loser first
    if chooser.choice([True, False]):
        first_keys, second_keys = winner_keys, loser_keys
    else:
        first_keys, second_keys = loser_keys, winner_keys

    first_player, first_player_rank, first_player_pts, first_player_odds = (row[key] for key in first_keys)
    second_player, second_player_rank, second_player_pts, second_player_odds = (row[key] for key in second_keys)

    # System message setting the context
    context = (f"Discuss the following tennis match: {first_player} (Rank: {first_player_rank}, Points: {first_player_pts}, Odds: {first_player_odds}) "
                f"vs {second_player} (Rank: {second_player_rank}, Points: {second_player_pts}, Odds: {second_player_odds}) "
                f"on {row['Date']} during the {row['Tournament']} on a {row['Surface']} surface.")

    # User question
    user_question =  f"Who won the match between {first_player} and {second_player} on {row['Date']}?"

    # The answer always names the real winner, whatever the presentation order
    assistant_answer = f"{row['Winner']} won the match against {row['Loser']}"

    return [context, user_question, assistant_answer]

# One {"context", "question", "answer"} record per match
chat_data_list = []

for _, match_row in tqdm(df.iterrows(), total=len(df)):
    context_text, question_text, answer_text = create_chat_format(match_row)
    chat_data_list.append({"context": context_text, "question": question_text, "answer": answer_text})

# Collect the records into a DataFrame and preview the first five
chat_data = pd.DataFrame(chat_data_list)

print(chat_data.iloc[:5])


100%|██████████| 830760/830760 [02:31<00:00, 5467.55it/s]


In [None]:
# RUN FOR DAVINCI

import random

def create_summary_and_qa(row, rng=None):
    """Build a match summary plus a prediction question/answer pair for one match.

    The question lists the two players in random order, each with their own
    rank, points and odds, so consumers must not assume that a given position
    in the prompt belongs to the winner or to the loser.

    Args:
        row: Mapping-like match record (e.g. a DataFrame row) providing
            'Date', 'Round', 'Tournament', 'Series', 'Surface', 'Location',
            'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts', 'B365W',
            'B365L' and the set scores 'W1'..'W5' / 'L1'..'L5'.
        rng: Optional object with a ``choice(seq)`` method (for example a
            seeded ``random.Random``) used to pick the player order.
            Defaults to the global ``random`` module.

    Returns:
        tuple: ``(summary, question, answer)`` where ``answer`` is the
        winner's name.
    """
    chooser = random if rng is None else rng

    def format_set_score(w, l):
        # A set that was not played has NaN scores and contributes nothing
        if pd.isna(w) or pd.isna(l):
            return ''
        return f"{int(w)}-{int(l)}, "

    # Creating the score string, e.g. "6-4, 6-3"
    score_str = ''.join([format_set_score(row[f'W{i}'], row[f'L{i}']) for i in range(1, 6)])
    score_str = score_str.rstrip(', ')

    # Lower odds mark the favourite. Missing (NaN) odds fail both comparisons
    # and keep the original default wording.
    if row['B365W'] == row['B365L']:
        expectation = 'closely contested'
    elif row['B365W'] > row['B365L']:
        expectation = 'expected to be in favor of the loser'
    else:
        expectation = 'expected to be in favor of the winner'

    # Create summary
    summary = (f"On {row['Date']}, in the {row['Round']} of {row['Tournament']} which is part of the {row['Series']} series, "
               f"{row['Winner']} defeated {row['Loser']} with a score of {score_str} on {row['Surface']} surface. "
               f"The match was {expectation} "
               f"with betting odds of {row['B365W']} for {row['Winner']} and {row['B365L']} for {row['Loser']}.")

    def create_question(row):
        # Pick the presentation order ONCE and use it for every field, so each
        # name is always paired with that player's own rank, points and odds.
        if chooser.choice([True, False]):
            first_name, first_rank, first_pts, first_odds = row['Winner'], row['WRank'], row['WPts'], row['B365W']
            second_name, second_rank, second_pts, second_odds = row['Loser'], row['LRank'], row['LPts'], row['B365L']
        else:
            first_name, first_rank, first_pts, first_odds = row['Loser'], row['LRank'], row['LPts'], row['B365L']
            second_name, second_rank, second_pts, second_odds = row['Winner'], row['WRank'], row['WPts'], row['B365W']

        # Assemble statistics for both players
        stats = (f"Player {first_name} (Rank: {first_rank}, Points: {first_pts}) "
            f"is playing Player {second_name} (Rank: {second_rank}, Points: {second_pts}). "
            f"The match will take place on a {row['Surface']} surface at the {row['Tournament']}, "
            f"{row['Location']}. The betting odds are {first_odds} for {first_name} and {second_odds} for {second_name}.")

        # Create the question
        return f"On {row['Date']}, {stats} Who do you think will win this match?"

    question = create_question(row)
    # The answer is the winner's name
    answer = row['Winner']

    return summary, question, answer

# Generate a (summary, question, answer) triple for every match, with a progress bar
qa_triples = [
    create_summary_and_qa(match_row)
    for _, match_row in tqdm(df.iterrows(), total=len(df))
]

# Split the triples into one list per output column
summaries = [triple[0] for triple in qa_triples]
questions = [triple[1] for triple in qa_triples]
answers = [triple[2] for triple in qa_triples]

# Attach the generated text to the match DataFrame
df['Summary'] = summaries
df['Question'] = questions
df['Answer'] = answers


# GPT API

If the training set is too large, use only a fraction of the data, then split it into train and test sets.

In [None]:
# Set the API key.
# The key is read from the OPENAI_API_KEY environment variable so that it is never
# saved inside the notebook; set it before starting the kernel (or with `%env`).
# An unset variable leaves the key empty, as the original placeholder did.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [None]:
# Sampling configuration: share of the data used now, train share of that sample, shared seed
SAMPLE_FRACTION = 0.091
TRAIN_FRACTION = 0.8
SPLIT_SEED = 25

# Use a fraction of data; everything not sampled is kept aside
df_use_now = chat_data.sample(frac=SAMPLE_FRACTION, random_state=SPLIT_SEED)
df_hold_for_later = chat_data.drop(index=df_use_now.index)

# Split the sample into train/test
train_df = df_use_now.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
test_df = df_use_now.drop(index=train_df.index)

In [None]:

# Save the train and test samples to jsonl format for API
def save_to_jsonl_chat_format(df, file_path):
    """Write `df` to `file_path` as JSON Lines, one chat conversation per row.

    Each line is ``{"messages": [...]}`` holding a system, a user and an
    assistant message taken from the row's 'context', 'question' and 'answer'
    columns respectively.
    """
    # (chat role, source column) in the order the messages are emitted
    role_columns = (("system", "context"), ("user", "question"), ("assistant", "answer"))

    with open(file_path, 'w', encoding='utf-8') as file:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            conversation = {
                "messages": [
                    {"role": role, "content": row[column]}
                    for role, column in role_columns
                ]
            }
            file.write(json.dumps(conversation) + '\n')

save_to_jsonl_chat_format(train_df, 'train_data.jsonl')
save_to_jsonl_chat_format(test_df, 'test_data.jsonl')


In [None]:
# Upload training data for finetuning.
# The `with` block closes the file handle once the upload call returns.
with open("train_data.jsonl", "rb") as training_file:
    upload_response = openai.File.create(
        file=training_file,
        purpose="fine-tune"
    )

# Store the file ID for later use
train_file_id = upload_response['id']

In [None]:
# gpt-3.5-turbo
# Start a fine-tuning job on the uploaded training file and keep its job ID.

fine_tune_response = openai.FineTuningJob.create(
    training_file=train_file_id,  # Use the ID of the training file
    model="gpt-3.5-turbo"  # Specify the model
)

# Get the fine-tuning job ID
turbo_fine_tune_job_id = fine_tune_response['id']

In [None]:
# Davinci
# NOTE(review): this reuses `train_file_id`, i.e. the chat-format ("messages") file uploaded above.
# Presumably the legacy FineTune endpoint expects prompt/completion records instead — confirm before running.

fine_tune_response = openai.FineTune.create(
    training_file=train_file_id,  # Use the ID of the training file
    model='davinci',  # Choose from 'ada', 'babbage', 'curie', or 'davinci'
    n_epochs=4,  # Number of training epochs
    # Include other parameters as needed
)

# Get the fine-tuning job ID
davinci_fine_tune_job_id = fine_tune_response['id']

In [None]:
# OPTIONAL - to check the status of the fine-tuning job every 10 seconds

fine_tune_job_id = turbo_fine_tune_job_id # or davinci_fine_tune_job_id

POLL_INTERVAL_S = 10
# States after which the job will not change any more; without "cancelled"
# a cancelled job would keep this loop running forever.
TERMINAL_STATUSES = {"succeeded", "failed", "cancelled"}

def check_fine_tune_status(job_id):
    """Return the current status string of a fine-tuning job.

    A job must be looked up through the same resource that created it: jobs
    started with ``openai.FineTuningJob`` (gpt-3.5-turbo) have ids beginning
    with "ftjob-", every other id is treated as a legacy ``openai.FineTune``
    job (davinci).
    """
    resource = openai.FineTuningJob if str(job_id).startswith("ftjob-") else openai.FineTune
    status_response = resource.retrieve(id=job_id)
    return status_response['status']

while True:
    status = check_fine_tune_status(fine_tune_job_id)
    print(f"Fine-tuning job status: {status}")

    # Stop polling once the job has reached a final state
    if status in TERMINAL_STATUSES:
        break

    time.sleep(POLL_INTERVAL_S)

print("Fine-tuning job completed or failed.")

In [None]:
# Fetch the full job record from the resource that created the job:
# ids starting with "ftjob-" belong to openai.FineTuningJob, all others to the legacy openai.FineTune.
job_resource = openai.FineTuningJob if str(fine_tune_job_id).startswith("ftjob-") else openai.FineTune
job_details = job_resource.retrieve(id=fine_tune_job_id)

print(job_details)

# Testing

In [None]:
# FOR DAVINCI FORMAT

# Read the held-out conversations back from disk
test_df = pd.read_json('test_data.jsonl', lines=True)

responses = []

for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    # Send only the prompt side of the conversation: drop the reference assistant answer
    messages = []
    for message in row['messages']:
        if message['role'] != 'assistant':
            messages.append(message)
    print(messages)

    response = openai.ChatCompletion.create(
        model="ft:gpt-3.5-turbo-0613:personal::8SzDtWEd",  # Model name
        messages=messages,
        max_tokens=50  # Length of response
    )
    responses.append(response.choices[0].message['content'].strip())
    print(response.choices[0].message['content'].strip())

    # Pause between requests to avoid rate limiting
    time.sleep(1)

# Store the replies next to their prompts
test_df['ModelResponse'] = responses


In [None]:
# Inspect the test set together with the ModelResponse column added in the previous cell.
print(test_df)

In [None]:
# FOR GPT-3.5-TURBO
test_df = pd.read_json('test_data.jsonl', lines=True)  # Assuming the file is in JSON Lines format

responses = []

for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    raw_messages = row['messages']

    if isinstance(raw_messages, str):
        # Only a string still needs parsing. Try it as real JSON first and fall back to the
        # quote-swapping workaround for Python-style (single-quoted) text.
        try:
            messages_list = json.loads(raw_messages)
        except json.JSONDecodeError:
            try:
                messages_list = json.loads(raw_messages.replace("'", '"'))
            except json.JSONDecodeError as e:
                print(f"Failed to decode JSON for row {index}: {e}")
                # Keep one entry per row so the responses stay aligned with test_df
                responses.append(None)
                continue
    else:
        # pd.read_json has already turned each JSON array into a list of dicts
        messages_list = raw_messages

    # Extracting system and user messages to form the prompt
    system_content = next(m for m in messages_list if m['role'] == 'system')['content']
    user_content = next(m for m in messages_list if m['role'] == 'user')['content']
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content}
    ]
    print(messages)

    # Generating a response from the model using chat completions endpoint
    response = openai.ChatCompletion.create(
        model="ft:gpt-3.5-turbo-0613:personal::8TJfgWFZ",
        messages=messages,
        max_tokens=50  # Adjust as needed
    )
    reply = response.choices[0].message['content'].strip()
    print(reply)
    responses.append(reply)

# Add responses to the test DataFrame as a new column.
# Rebinding test_df to the bare list would discard the prompts and break test_df.to_csv(...) later on.
test_df['ModelResponse'] = responses


In [None]:
# Show the list of model replies collected by the test loop above.
print(responses)

# Accuracy calculations

Note: We sometimes had trouble calculating the test accuracy automatically because of formatting issues, so accuracy was calculated manually on a smaller random test set of 361 matches.

In [None]:
# Load the CSV files
test_df = pd.read_csv('test_data_chat_format.csv')
model_df = pd.read_csv('gpt_model_with_responses.csv')

# Ensure both DataFrames have the same number of rows
if len(test_df) != len(model_df):
    print("The DataFrames have different lengths. Make sure both CSV files have the same number of rows.")
elif len(test_df) == 0:
    # Avoid dividing by zero below
    print("The DataFrames are empty; accuracy cannot be computed.")
else:
    correct_count = 0

    for i in range(len(test_df)):
        # Extract assistant's response (last message) from test_df.
        # SECURITY: this previously used eval(), which executes whatever code the CSV cell contains;
        # ast.literal_eval only accepts Python literals such as the stored list of dicts.
        assistant_response = ast.literal_eval(test_df.loc[i, 'messages'])[-1]['content']

        # Extract model's response from model_df
        model_response = model_df.loc[i, 'ModelResponse']

        # A missing response is read back as NaN (a float without .strip()); count it as wrong
        if not isinstance(model_response, str):
            continue

        # Compare responses, ignoring case and surrounding whitespace
        if assistant_response.strip().lower() == model_response.strip().lower():
            correct_count += 1

    # Calculate accuracy
    accuracy = correct_count / len(test_df)
    print(f"Accuracy: {accuracy:.2f} ({correct_count} out of {len(test_df)})")


In [None]:
# Function to load the stored messages
def load_json(messages):
    """Parse a stored conversation into a list of message dicts.

    Accepts real JSON (double-quoted) as well as the Python-literal form with
    single quotes. A value that is not a string is returned unchanged.

    Raises:
        ValueError or SyntaxError: if the text is neither valid JSON nor a
            valid Python literal.
    """
    if not isinstance(messages, str):
        return messages
    try:
        return json.loads(messages)
    except json.JSONDecodeError:
        # Blindly swapping quote characters corrupts any text containing an apostrophe
        # (e.g. "O'Connell"), so parse the Python-literal form properly instead.
        return ast.literal_eval(messages)

# Function to get the winner from the assistant's messages
def get_assistant_winner(messages):
    """Return the first word of the first assistant message, or None if there is none."""
    for message in load_json(messages):
        if message['role'] == 'assistant':
            # The assistant answer starts with the winner's name; take its first word
            return message['content'].split(' ')[0]
    return None

# Function to check if the model's response is correct
def is_correct(model_response, assistant_winner):
    """Return True when the winner token occurs in the model's response.

    A missing winner, or a response that is not a string (e.g. NaN read from
    a CSV), counts as incorrect.
    """
    if not assistant_winner or not isinstance(model_response, str):
        return False
    return assistant_winner in model_response

# Load the responses to score
df = pd.read_csv('gpt_with_explanations_responses.csv')

# Tally the rows in which the reference winner appears in the model's response
correct_predictions = sum(
    int(is_correct(scored_row['ModelResponse'], get_assistant_winner(scored_row['messages'])))
    for _, scored_row in df.iterrows()
)

# Share of correct rows
accuracy = correct_predictions / len(df)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
import pandas as pd

# Load the CSV files
test_df = pd.read_csv('test_data_chat_format.csv')
model_df = pd.read_csv('gpt_model_with_responses.csv')

# Ensure both DataFrames have the same number of rows
if len(test_df) != len(model_df):
    print("The DataFrames have different lengths. Make sure both CSV files have the same number of rows.")
elif len(test_df) == 0:
    # Avoid dividing by zero below
    print("The DataFrames are empty; accuracy cannot be computed.")
else:
    correct_count = 0

    for i in range(len(test_df)):
        # Extract assistant's response (last message).
        # SECURITY: this previously used eval(), which executes whatever code the CSV cell contains;
        # ast.literal_eval only accepts Python literals such as the stored list of dicts.
        assistant_response = ast.literal_eval(test_df.loc[i, 'messages'])[-1]['content']
        model_response = model_df.loc[i, 'ModelResponse']

        # A missing response is read back as NaN (a float without .split()); count it as wrong
        if not isinstance(model_response, str):
            continue

        # First word of each text, assumed to be the winner's name.
        # Blank text yields no words, so guard the [0] lookup.
        assistant_words = assistant_response.split()
        model_words = model_response.split()
        if not assistant_words or not model_words:
            continue

        # Compare winners, ignoring case
        if assistant_words[0].lower() == model_words[0].lower():
            correct_count += 1

    # Calculate accuracy
    accuracy = correct_count / len(test_df)
    print(f"Accuracy: {accuracy:.2f} ({correct_count} out of {len(test_df)})")


In [None]:
def calculate_accuracy(df):
    """Score responses by how often each player's name is mentioned.

    A row is correct when the winner's name (column 'completion') occurs more
    often in 'ModelResponse' than the opponent's name. The opponent is
    whichever player named in 'prompt' is not the winner, so the result does
    not depend on the order in which the prompt lists the players.

    Args:
        df: DataFrame with 'prompt', 'completion' and 'ModelResponse' columns.

    Returns:
        float: correct rows divided by all rows (rows whose opponent cannot
        be extracted count as incorrect); 0.0 for an empty DataFrame.
    """
    if len(df) == 0:
        return 0.0

    correct = 0
    for index, row in df.iterrows():
        winner_name = str(row['completion']).strip()

        # Every "Player <name> (" occurrence in the prompt
        player_names = [name.strip() for name in re.findall(r"Player ([\w\s.'\-]+?) \(", str(row['prompt']))]
        opponents = [name for name in player_names if name != winner_name]
        if not opponents:
            continue  # Skip this row if the loser's name cannot be extracted
        loser_name = opponents[0]

        # Count occurrences of winner's and loser's name in the ModelResponse
        response_text = str(row['ModelResponse'])
        winner_count = response_text.count(row['completion'])
        loser_count = response_text.count(loser_name)

        # Classify as correct if the winner's name appears more times than the loser's
        if winner_count > loser_count:
            correct += 1

    return correct / len(df)

# NOTE(review): `test_df` needs 'prompt', 'completion' and 'ModelResponse' columns here —
# confirm which file it was loaded from before running this cell.
accuracy = calculate_accuracy(test_df)
print(f"Model accuracy: {accuracy * 100:.2f}%")


In [None]:
# Regex pattern to match betting odds following a player's name
betting_pattern = r'(\b\w+\s\w+\b)\s*(\d+\.\d+)'

# Function to calculate accuracy based on regex matches
def calculate_accuracy(df):
    """Score responses by the betting odds they quote for each player.

    A row is correct when 'ModelResponse' quotes odds for both players and
    the winner's odds (column 'completion') are lower than the opponent's.
    The opponent is whichever player named in 'prompt' is not the winner.

    Args:
        df: DataFrame with 'prompt', 'completion' and 'ModelResponse' columns.

    Returns:
        float: correct rows divided by all rows; 0.0 for an empty DataFrame.
    """
    if len(df) == 0:
        return 0.0

    correct = 0
    for index, row in df.iterrows():
        # Find all matches of player name and betting odds
        matches = re.findall(betting_pattern, str(row['ModelResponse']))
        # Create a dictionary of player name to their associated odds
        player_odds = {name: float(odds) for name, odds in matches}

        winner = row['completion']

        # Every "Player <name> (" occurrence in the prompt
        player_names = [name.strip() for name in re.findall(r"Player ([\w\s.'\-]+?) \(", str(row['prompt']))]
        opponents = [name for name in player_names if name != str(winner).strip()]
        if not opponents:
            continue  # Skip this row if the loser's name cannot be extracted
        loser = opponents[0]

        # Check if both winner and loser are in the player_odds and if winner's odds are lower
        if winner in player_odds and loser in player_odds and player_odds[winner] < player_odds[loser]:
            correct += 1

    return correct / len(df)

# NOTE(review): `test_df` needs 'prompt', 'completion' and 'ModelResponse' columns here —
# confirm which file it was loaded from before running this cell.
accuracy = calculate_accuracy(test_df)
print(f"Model accuracy: {accuracy * 100:.2f}%")


In [None]:
# Function to evaluate accuracy
def calculate_accuracy(df):
    """Score responses that consist of exactly one player's name.

    A row is correct when 'ModelResponse' equals the winner ('completion'),
    incorrect when it equals the opponent, and ignored otherwise. The
    opponent is whichever player named in 'prompt' is not the winner.
    Comparisons ignore surrounding whitespace.

    Args:
        df: DataFrame with 'prompt', 'completion' and 'ModelResponse' columns.

    Returns:
        float: correct / (correct + incorrect), or 0.0 when no row could be
        classified. The number of correct rows is also printed.
    """
    correct = 0
    incorrect = 0
    for index, row in df.iterrows():
        winner_name = str(row['completion']).strip()

        # Every "Player <name> (" occurrence in the prompt
        player_names = [name.strip() for name in re.findall(r"Player ([\w\s.'\-]+?) \(", str(row['prompt']))]
        opponents = [name for name in player_names if name != winner_name]
        if not opponents:
            continue  # Skip this row if the loser's name cannot be extracted
        loser_name = opponents[0]

        answer = str(row['ModelResponse']).strip()
        if answer == winner_name:
            correct += 1
        elif answer == loser_name:
            incorrect += 1
    print(correct)

    # No classifiable row at all would otherwise divide by zero
    decided = correct + incorrect
    return correct / decided if decided else 0.0

# NOTE(review): `test_df` needs 'prompt', 'completion' and 'ModelResponse' columns here —
# confirm which file it was loaded from before running this cell.
accuracy = calculate_accuracy(test_df)
print(f"Model accuracy: {accuracy * 100:.2f}%")


# Other helpful functions for data/file conversions

In [None]:
# convert list to csv

def list_to_csv(strings, filename):
    """Write `strings` to `filename` as a one-column CSV, one value per row.

    Any existing file at `filename` is overwritten.
    """
    # newline='' lets the csv module control line endings itself
    with open(filename, 'w', newline='') as out_file:
        csv.writer(out_file).writerows([value] for value in strings)

# Convert the list to a CSV file
# (`responses` is the list of model replies built by one of the test loops above)
list_to_csv(responses, 'output.csv')


In [None]:
# convert df to csv
# Writes test_df without its index column.
# NOTE(review): test_df must still be a DataFrame at this point — confirm it was not rebound to a list earlier.

test_df.to_csv('gpt_responses_with_explanations2.csv', index=False)


In [None]:
# convert jsonl to csv

# Read one JSON object per line; blank lines (such as a trailing empty line) are skipped
# because json.loads cannot parse them. UTF-8 is given explicitly so the result does not
# depend on the platform's default encoding.
with open('filename.jsonl', 'r', encoding='utf-8') as json_file:
    json_list = [json.loads(json_str) for json_str in json_file if json_str.strip()]

# Column names come from the first record; an empty input gives an empty header
headers = list(json_list[0].keys()) if json_list else []

# Write the data to a CSV file
with open('filename.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=headers)
    writer.writeheader()
    writer.writerows(json_list)


In [None]:
# Extract the 'content' of 'assistant' from the string
# which is in a JSON format
# Note: this rebinds `df`, replacing whatever DataFrame the name referred to before this cell.

df = pd.read_csv('train_explanation.csv')

def extract_assistant_content(s):
    """Return the 'content' of every assistant message in a stored conversation.

    Args:
        s: Conversation as text, either real JSON or the Python-literal form
            with single quotes (a list of ``{'role': ..., 'content': ...}``
            dicts).

    Returns:
        list: Assistant message contents in order; an empty list when `s` is
        not a string or cannot be parsed.
    """
    # An empty CSV cell is read back as NaN, not as a string
    if not isinstance(s, str):
        return []

    try:
        conversations = json.loads(s)
    except json.JSONDecodeError:
        # Swapping ' for " corrupts any text containing an apostrophe (e.g. "O'Connell"),
        # which silently dropped those rows; parse the Python-literal form properly instead.
        try:
            conversations = ast.literal_eval(s)
        except (ValueError, SyntaxError):
            return []  # Return an empty list if the text cannot be decoded

    # Filter out the content where 'role' is 'assistant'
    return [conv['content'] for conv in conversations if conv['role'] == 'assistant']

# Run the extractor on the first column of every row and flatten the results into one list
assistant_contents = [
    content
    for record in df.itertuples(index=False)
    for content in extract_assistant_content(record[0])
]

assistant_contents


To fine-tune the GPT model with explanations, we had GPT-4 generate an explanation for every match outcome in our training data, based on its existing knowledge about the players (e.g. 'Rublev A. won the match against Berankis R. due to his powerful forehand and high-ranking experience.'). Once we had the explanations in an array (`explanations`), we could insert them into the training dataset for fine-tuning.


In [None]:
# Update csv with explanations

# CSV file that is rewritten in place by update_csv_with_explanations below
csv_file_path = 'train_explanation.csv'
explanations = [] # insert with explanations (from GPT 4 for training)


def update_csv_with_explanations(csv_path, explanations_list):
    """Overwrite the 'content' column of a CSV file with the given explanations.

    Row i receives ``explanations_list[i]``; rows beyond the end of the list
    are left unchanged. The file is rewritten in place.

    Args:
        csv_path: Path of the CSV file to update.
        explanations_list: Explanation strings in row order.
    """
    # Read the CSV file, remembering the header so it survives even without data rows
    with open(csv_path, mode='r', encoding='utf-8', newline='') as file:
        csv_reader = csv.DictReader(file)
        rows = list(csv_reader)
        fieldnames = list(csv_reader.fieldnames or [])

    # A completely empty file has no header to write back; leave it untouched
    if not fieldnames:
        return

    # Update the content with explanations. zip stops at the shorter sequence, and it reads
    # the explanations_list argument rather than a global variable.
    updated_any = False
    for row, explanation in zip(rows, explanations_list):
        row['content'] = explanation
        updated_any = True

    # When the file had no 'content' column yet, the new values need a header entry
    if updated_any and 'content' not in fieldnames:
        fieldnames.append('content')

    # Write the updated content back to the CSV
    with open(csv_path, mode='w', encoding='utf-8', newline='') as file:
        csv_writer = csv.DictWriter(file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(rows)

# Call the function with the path to your CSV file and the explanations array
# (this rewrites the file at csv_file_path in place)
update_csv_with_explanations(csv_file_path, explanations)
