In [7]:
import pandas as pd

# Read the two raw tables (match-level and ball-by-ball) from the working directory
matches, deliveries = map(pd.read_csv, ("matches.csv", "deliveries.csv"))

# Sanity check: report each table's dimensions, then its leading rows
matches_preview = matches.head()
deliveries_preview = deliveries.head()

print("Matches dataset shape:", matches.shape)
print("Deliveries dataset shape:", deliveries.shape)
print("\nMatches sample:\n", matches_preview)
print("\nDeliveries sample:\n", deliveries_preview)


Matches dataset shape: (756, 18)
Deliveries dataset shape: (179078, 21)

Matches sample:
    id  season       city        date                        team1  \
0   1    2017  Hyderabad  2017-04-05          Sunrisers Hyderabad   
1   2    2017       Pune  2017-04-06               Mumbai Indians   
2   3    2017     Rajkot  2017-04-07                Gujarat Lions   
3   4    2017     Indore  2017-04-08       Rising Pune Supergiant   
4   5    2017  Bangalore  2017-04-08  Royal Challengers Bangalore   

                         team2                  toss_winner toss_decision  \
0  Royal Challengers Bangalore  Royal Challengers Bangalore         field   
1       Rising Pune Supergiant       Rising Pune Supergiant         field   
2        Kolkata Knight Riders        Kolkata Knight Riders         field   
3              Kings XI Punjab              Kings XI Punjab         field   
4             Delhi Daredevils  Royal Challengers Bangalore           bat   

   result  dl_applied           

In [8]:
# --- Matches Summary ---
# Headline counts taken from the match-level table.
total_matches = len(matches)
seasons = matches['season'].nunique()
# Count distinct team names across BOTH team columns: a side that only ever
# appears as `team2` would be missed if only `team1` were inspected.
teams = pd.concat([matches['team1'], matches['team2']]).nunique()

# value_counts() skips missing winners, so rows without a winner are not ranked.
top_winning_teams = matches['winner'].value_counts().head(5)

print(f"Total Matches: {total_matches}")
print(f"Total Seasons: {seasons}")
print(f"Unique Teams: {teams}")
print("\nTop 5 Winning Teams:\n", top_winning_teams)

# --- Deliveries Summary ---
# Totals taken from the ball-by-ball table.
total_runs = deliveries['total_runs'].sum()
# A delivery counts as a wicket whenever `player_dismissed` is populated.
total_wickets = int(deliveries['player_dismissed'].notna().sum())

top_batsmen = (
    deliveries.groupby('batsman')['batsman_runs']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)
# count() tallies non-null `player_dismissed` values per bowler.
# NOTE(review): this credits the bowler with every dismissal on their delivery,
# whatever the mode of dismissal - confirm that is the intended definition.
top_bowlers = (
    deliveries.groupby('bowler')['player_dismissed']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

print(f"\nTotal Runs Scored: {total_runs}")
print(f"Total Wickets Taken: {total_wickets}")
print("\nTop 5 Batsmen (Runs):\n", top_batsmen)
print("\nTop 5 Bowlers (Wickets):\n", top_bowlers)


Total Matches: 756
Total Seasons: 12
Unique Teams: 15

Top 5 Winning Teams:
 winner
Mumbai Indians                 109
Chennai Super Kings            100
Kolkata Knight Riders           92
Royal Challengers Bangalore     84
Kings XI Punjab                 82
Name: count, dtype: int64

Total Runs Scored: 235290
Total Wickets Taken: 8834

Top 5 Batsmen (Runs):
 batsman
V Kohli      5434
SK Raina     5415
RG Sharma    4914
DA Warner    4741
S Dhawan     4632
Name: batsman_runs, dtype: int64

Top 5 Bowlers (Wickets):
 bowler
SL Malinga         188
DJ Bravo           168
A Mishra           165
Harbhajan Singh    161
PP Chawla          156
Name: player_dismissed, dtype: int64


In [9]:
# Keep only the leading rows of each table so the extracts are small enough for an LLM
matches_subset = matches.iloc[:20]
deliveries_subset = deliveries.iloc[:100]

# Persist each extract next to the notebook, without the index column
for subset_frame, subset_path in (
    (matches_subset, "matches_subset.csv"),
    (deliveries_subset, "deliveries_subset.csv"),
):
    subset_frame.to_csv(subset_path, index=False)

print("Subset datasets saved: matches_subset.csv & deliveries_subset.csv")

Subset datasets saved: matches_subset.csv & deliveries_subset.csv


In [11]:
import os

# Create the output directory up front (no error if it is already there)
os.makedirs("prompts", exist_ok=True)

# Question sets: direct look-up questions first, then open-ended reasoning ones
factual_prompts = """
1. How many total matches were played in the dataset?
2. Which team has the most wins overall?
3. Who is the top run scorer in IPL history based on this dataset?
4. Who has taken the most wickets in IPL history?
5. What is the total number of runs scored in all matches?
6. How many unique teams have participated?
"""

reasoning_prompts = """
1. Does winning the toss increase the chances of winning the match? Support with data.
2. Which player is more impactful: the top run scorer or the top wicket taker?
3. If you were a coach and wanted to win 3 more matches next season, should you focus on batting or bowling?
4. Which batsman has the highest number of sixes, and what does that indicate about their playing style?
5. What is the best winning strategy based on margin of victory (runs vs wickets)?
6. If your team wants to improve by 10% in win rate, should you invest more in power hitters or death over bowlers? Use the dataset trends to support your recommendation.
"""

# Each set goes to its own text file, with surrounding blank lines trimmed
for prompt_path, prompt_text in (
    ("prompts/factual_questions.txt", factual_prompts),
    ("prompts/reasoning_questions.txt", reasoning_prompts),
):
    with open(prompt_path, "w") as handle:
        handle.write(prompt_text.strip())

print("Prompt files created in 'prompts/' folder.")


Prompt files created in 'prompts/' folder.


In [16]:
import pandas as pd

import os

# --- Load Data ---
# Re-read the raw CSVs so this cell does not rely on frames left in the kernel
# by previously executed cells.
matches = pd.read_csv("matches.csv")
deliveries = pd.read_csv("deliveries.csv")

# Deliveries on which a batsman was dismissed; shared by the wicket metrics below.
dismissals = deliveries[deliveries['player_dismissed'].notna()]

# --- 1. Toss Impact ---
total_matches = matches.shape[0]
toss_winner_won = matches[matches['toss_winner'] == matches['winner']].shape[0]
# Percentage over every row in `matches`; a row whose winner is missing compares
# unequal and therefore counts as "toss winner did not win".
toss_impact = (toss_winner_won / total_matches) * 100

# --- 2. Top Run Scorer vs Top Wicket Taker ---
batsman_runs = deliveries.groupby('batsman')['batsman_runs'].sum().sort_values(ascending=False)
top_batsman = batsman_runs.index[0]
top_batsman_runs = batsman_runs.iloc[0]

bowler_wickets = (
    dismissals.groupby('bowler')['player_dismissed']
    .count()
    .sort_values(ascending=False)
)
top_bowler = bowler_wickets.index[0]
top_bowler_wickets = bowler_wickets.iloc[0]

# --- 3. Batting vs Bowling Focus (averages) ---
avg_runs_per_match = deliveries.groupby('match_id')['total_runs'].sum().mean()
# Count dismissals per match on the UNFILTERED frame so that a match with zero
# dismissals contributes a 0 to the mean instead of silently dropping out of it.
avg_wickets_per_match = deliveries.groupby('match_id')['player_dismissed'].count().mean()

# --- 4. Batsman with most sixes ---
sixes = deliveries[deliveries['batsman_runs'] == 6].groupby('batsman').size().sort_values(ascending=False)
top_six_batsman = sixes.index[0]
top_sixes = sixes.iloc[0]

# --- 5. Winning Strategy (Runs vs Wickets) ---
# A positive margin in the respective column marks how the match was won.
by_runs = matches[matches['win_by_runs'] > 0].shape[0]
by_wickets = matches[matches['win_by_wickets'] > 0].shape[0]

# --- Generate Context Summary ---
context_summary = f"""
--- IPL Dataset Insights ---
1. Toss Impact:
   - Toss winner won {toss_impact:.2f}% of matches.

2. Top Player Impact:
   - Top run scorer: {top_batsman} ({top_batsman_runs} runs)
   - Top wicket taker: {top_bowler} ({top_bowler_wickets} wickets)

3. Match Averages:
   - Average runs per match: {avg_runs_per_match:.2f}
   - Average wickets per match: {avg_wickets_per_match:.2f}

4. Six-Hitters:
   - Batsman with most sixes: {top_six_batsman} ({top_sixes} sixes)

5. Winning Strategy Trends:
   - Matches won by runs: {by_runs}
   - Matches won by wickets: {by_wickets}
"""

# Save to file
# The output folder must exist before open(..., "w"); create it here so the cell
# works on a fresh kernel regardless of which other cells have run.
os.makedirs("results", exist_ok=True)
with open("results/context_summary.txt", "w", encoding="utf-8") as f:
    f.write(context_summary)

print("Context summary generated and saved to results/context_summary.txt")
print(context_summary)


Context summary generated and saved to results/context_summary.txt

--- IPL Dataset Insights ---
1. Toss Impact:
   - Toss winner won 51.98% of matches.

2. Top Player Impact:
   - Top run scorer: V Kohli (5434 runs)
   - Top wicket taker: SL Malinga (188 wickets)

3. Match Averages:
   - Average runs per match: 311.23
   - Average wickets per match: 11.69

4. Six-Hitters:
   - Batsman with most sixes: CH Gayle (327 sixes)

5. Winning Strategy Trends:
   - Matches won by runs: 337
   - Matches won by wickets: 406



In [13]:
import os
import matplotlib.pyplot as plt

# --- Output folder for the charts (created if missing) ---
os.makedirs("results", exist_ok=True)


def _save_bar_chart(series, title, xlabel, ylabel, path, **bar_kwargs):
    """Draw `series` as an 8x5 bar chart, label it, write it to `path`, and close it.

    Extra keyword arguments are forwarded to the pandas bar plotter.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    series.plot(kind='bar', ax=ax, **bar_kwargs)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(path)
    # Release the figure so it is not kept alive (or rendered inline) after saving
    plt.close(fig)


# --- Five highest run totals per batsman ---
runs_by_batsman = deliveries.groupby('batsman')['batsman_runs'].sum()
top_batsmen = runs_by_batsman.sort_values(ascending=False).head(5)
_save_bar_chart(top_batsmen, "Top 5 Run Scorers", "Batsman", "Runs", "results/top_batsmen.png")

# --- Five highest dismissal counts per bowler ---
dismissal_rows = deliveries[deliveries['player_dismissed'].notna()]
wickets_by_bowler = dismissal_rows.groupby('bowler')['player_dismissed'].count()
top_bowlers = wickets_by_bowler.sort_values(ascending=False).head(5)
_save_bar_chart(
    top_bowlers, "Top 5 Wicket Takers", "Bowler", "Wickets",
    "results/top_bowlers.png", color='orange',
)

print("Visualizations saved in results/")


Visualizations saved in results/
