# RAG with ChatGPT
Retrieval Augmented Generation using FAISS indexing

In [59]:
#!pip install faiss-cpu

In [60]:
#!pip install openai

In [61]:
#!pip install numpy==1.24.3
#!pip install pandas

In [62]:
#!pip install --upgrade bottleneck

In [63]:
import pandas as pd

In [64]:
import faiss
import numpy as np

In [65]:
#import openai
from openai import OpenAI
client = OpenAI()

In [66]:
# List of files for multiple years
files = {
    2021: '/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/individual/2021.csv',
    2022: '/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/individual/2022.csv',
    2023: '/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/individual/2023.csv',
    2024: '/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/individual/2024.csv'
}

In [67]:
# Function to create tagged summaries
def create_tagged_summary(row):
    return (
        f"Season: {row['season']}, City: {row['city']}, Date: {row['date']}, "
        f"Match Type: {row['match_type']}, Player of the Match: {row['player_of_match']}, "
        f"Venue: {row['venue']}, Team1: {row['team1']}, Team2: {row['team2']}, "
        f"Toss Winner: {row['toss_winner']}, Toss Decision: {row['toss_decision']}, "
        f"Winner: {row['winner']}, Result: {row['result']} by {row['result_margin']} runs/wickets, "
        f"Super Over: {row['super_over']}, Umpires: {row['umpire1']} and {row['umpire2']}."
    )

In [68]:
# Process each file
processed_data = {}
for year, file_path in files.items():
    # Load the CSV file
    data = pd.read_csv(file_path)
    # Generate tagged summaries
    data['summary'] = data.apply(create_tagged_summary, axis=1)
    processed_data[year] = data

In [69]:
processed_data[2021].head(1)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,...,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2,summary
0,1254058.0,2021.0,Chennai,09/04/2021,League,HV Patel,"MA Chidambaram Stadium, Chepauk, Chennai",Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,...,Royal Challengers Bangalore,wickets,2.0,160.0,20.0,N,,KN Ananthapadmanabhan,Nitin Menon,"Season: 2021.0, City: Chennai, Date: 09/04/202..."


In [70]:
# Function to generate embeddings using OpenAI client.embeddings.create
def get_embedding(text, model="text-embedding-ada-002"):
    response = client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding

In [71]:
test_text = "How many matches did CSK lose?"
test_embedding = get_embedding(test_text)
print(test_embedding)

[-0.0013754806714132428, -0.0032734812702983618, 0.03143583983182907, 0.020249679684638977, -0.053130995482206345, 0.014454755932092667, -0.005664701573550701, -0.0016815047711133957, -0.013647372834384441, -0.024312637746334076, 0.020666394382715225, 0.018322380259633064, -0.018595848232507706, 0.0028128172270953655, -0.010782466270029545, 0.0014943091664463282, 0.00723388884216547, -0.015613740310072899, 0.018179133534431458, -0.013647372834384441, -0.035550884902477264, 0.007800359278917313, -0.016251834109425545, -0.0010792233515530825, -0.00661858543753624, 0.010782466270029545, 0.01418128702789545, -0.0070645990781486034, 0.016603436321020126, -0.027919815853238106, 0.011205690912902355, -0.0077417585998773575, -0.02851884253323078, 0.009089566767215729, -0.04523947834968567, -0.004248525947332382, -0.0008871443569660187, 0.009336991235613823, 0.03088890202343464, -0.006533940322697163, 0.007748269941657782, 0.005114509258419275, -0.023505255579948425, 0.014741246588528156, -0.01

In [72]:
# Process each year's data
for year, data in processed_data.items():
    print(f"Processing embeddings for {year}...")
    
    # Generate embeddings for each match summary
    embeddings = [get_embedding(summary) for summary in data['summary']]
    
    # Save embeddings to FAISS
    embedding_dim = len(embeddings[0])
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(np.array(embeddings))
    faiss.write_index(index, f"/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/processed/{year}_embeddings.index")
    
    # Save the processed data with summaries
    data.to_csv(f"/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/processed/{year}_processed.csv", index=False)

Processing embeddings for 2021...


KeyboardInterrupt: 

# Above is the configuration
Below is the runtime code for queries

In [101]:
# Function to query matches from multiple years
def query_matches(query, team=None, years=None, top_k=35):
    query_embedding = get_embedding(query)
    results = []

    # Filter years if specified
    if years:
        selected_files = {year: file for year, file in files.items() if year in years}
    else:
        selected_files = files

    for year, file_path in selected_files.items():
        print(f"Searching matches from {year}...")
        
        # Load the FAISS index and data
        index = faiss.read_index(f"/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/processed/{year}_embeddings.index")
        data = pd.read_csv(f"/Users/cg1123/Desktop/New Technologies/Gen AI/RAG/data/processed/{year}_processed.csv")
        
        # Search the index
        distances, indices = index.search(np.array([query_embedding]), k=top_k)
        retrieved_matches = data.iloc[indices[0]]

        # Filter by team if specified
        if team:
            retrieved_matches = retrieved_matches[
                (retrieved_matches['team1'] == team) | (retrieved_matches['team2'] == team)
            ]

        # Add results to the list
        for _, row in retrieved_matches.iterrows():
            results.append({
                "year": year,
                "date": row['date'],
                "match_type": row['match_type'],
                "venue": row['venue'],
                "team1": row['team1'],
                "team2": row['team2'],
                "winner": row['winner'],
                "result": f"{row['result']} by {row['result_margin']} runs/wickets"
            })

    return results

In [102]:
# Function to format context for ChatGPT
def generate_chatgpt_response(query, results):
    # Format retrieved matches into a readable context
    context = "\n".join([
        f"Year: {match['year']}, Date: {match['date']}, Match_Type: {match['match_type']}, Venue: {match['venue']}, "
        f"{match['team1']} vs {match['team2']}, Winner: {match['winner']}, Result: {match['result']}."
        for match in results
    ])
    
    # Prompt to ChatGPT
    prompt = (
        f"The following information is relevant to the query:\n{context}\n\n"
        f"Now, answer the question: {query}"
    )
    
    # Call ChatGPT to generate the response
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an IPL analyst."},
            {"role": "user", "content": prompt}
        ]
    )
    
    return response.choices[0].message.content


In [113]:

# Example query
query = "Summarise the second SRH vs LSG match in 2024 for me"
results = query_matches(query,  years=[2024]) #team="Sunrisers Hyderabad",

# Display results
for match in results:
    print(f"Year: {match['year']}, Date: {match['date']}, Venue: {match['venue']}, "
          f"{match['team1']} vs {match['team2']}, Winner: {match['winner']}, Result: {match['result']}")


Searching matches from 2024...
Year: 2024, Date: 02/05/2024, Venue: Rajiv Gandhi International Stadium, Uppal, Hyderabad, Sunrisers Hyderabad vs Rajasthan Royals, Winner: Sunrisers Hyderabad, Result: runs by 1 runs/wickets
Year: 2024, Date: 08/05/2024, Venue: Rajiv Gandhi International Stadium, Uppal, Hyderabad, Lucknow Super Giants vs Sunrisers Hyderabad, Winner: Sunrisers Hyderabad, Result: wickets by 10 runs/wickets
Year: 2024, Date: 19/05/2024, Venue: Rajiv Gandhi International Stadium, Uppal, Hyderabad, Punjab Kings vs Sunrisers Hyderabad, Winner: Sunrisers Hyderabad, Result: wickets by 4 runs/wickets
Year: 2024, Date: 05/04/2024, Venue: Rajiv Gandhi International Stadium, Uppal, Hyderabad, Chennai Super Kings vs Sunrisers Hyderabad, Winner: Sunrisers Hyderabad, Result: wickets by 6 runs/wickets
Year: 2024, Date: 24/03/2024, Venue: Sawai Mansingh Stadium, Jaipur, Rajasthan Royals vs Lucknow Super Giants, Winner: Rajasthan Royals, Result: runs by 20 runs/wickets
Year: 2024, Date: 2

In [114]:
# Generate and display ChatGPT's response
if results:
    response = generate_chatgpt_response(query, results)
    print(response)
else:
    print("No relevant matches found.")

The second match between Sunrisers Hyderabad (SRH) and Lucknow Super Giants (LSG) in the year 2024 took place on 08/05/2024. The match type was a league match and it was held at the Rajiv Gandhi International Stadium in Uppal, Hyderabad. Sunrisers Hyderabad emerged as the winner of the match by chasing the target set by Lucknow Super Giants. They won by a margin of 10 wickets.


In [115]:
query = ""
while(query!="done"):
    query = input("please type your query followed by enter(type \"done\" if you are finished):\n")
    
    if(query!="done"):
        q_team = None
        q_years = None
        
        q_team_raw = input("Enter team you would like to focus on (optional): ")
        if(len(q_team_raw)>0):
            q_team = q_team_raw
        
        q_years_raw = input("Enter specific years from 2021-2024, separated by comma: (optional)")
        if(len(q_years_raw)>0):
            q_years = [int(year) for year in q_years_raw.split(",")]
            
        #print(q_team,q_years)
        results = query_matches(query, team=q_team, years=q_years)
        print("Retrieved data for query:\n")
        for match in results:
            print(f"Year: {match['year']}, Date: {match['date']}, Match_Type: {match['match_type']}, Venue: {match['venue']}, "f"{match['team1']} vs {match['team2']}, Winner: {match['winner']}, Result: {match['result']}")
        if results:
            response = generate_chatgpt_response(query, results)
            print(response)
        else:
            print("No relevant matches found. Cannot perform RAG.")  

please type your query followed by enter(type "done" if you are finished):
What is the greatest margin of victory in IPL 2024?
Enter team you would like to focus on (optional): 
Enter specific years from 2021-2024, separated by comma: (optional)2024
Searching matches from 2024...
Retrieved data for query:

Year: 2024, Date: 14/05/2024, Match_Type: League, Venue: Arun Jaitley Stadium, Delhi, Delhi Capitals vs Lucknow Super Giants, Winner: Delhi Capitals, Result: runs by 19 runs/wickets
Year: 2024, Date: 03/05/2024, Match_Type: League, Venue: Wankhede Stadium, Mumbai, Kolkata Knight Riders vs Mumbai Indians, Winner: Kolkata Knight Riders, Result: runs by 24 runs/wickets
Year: 2024, Date: 27/04/2024, Match_Type: League, Venue: Arun Jaitley Stadium, Delhi, Delhi Capitals vs Mumbai Indians, Winner: Delhi Capitals, Result: runs by 10 runs/wickets
Year: 2024, Date: 22/04/2024, Match_Type: League, Venue: Sawai Mansingh Stadium, Jaipur, Mumbai Indians vs Rajasthan Royals, Winner: Rajasthan Roya

As an analyst, I can not definitively say which team performed the worst through the data provided as it doesn't cover all the matches played by each team, and also factors like total points, net run rate, and other performance metrics are not included. However, I can provide individual match outcomes. For
please type your query followed by enter(type "done" if you are finished):
done
