In [1]:
import pandas as pd
from io import StringIO
from pathlib import Path

# Build the "complex questions" LLM prompt: header + dataset as CSV + questions,
# and write it to prompts/complex_questions_round2.txt.

# paths
data_path = Path("data/basketball_2023_clean.csv")
out_path = Path("prompts/complex_questions_round2.txt")
out_path.parent.mkdir(parents=True, exist_ok=True)

# load your cleaned dataset
df = pd.read_csv(data_path)

# reorder columns if needed (only keeps those that exist)
cols_order = ["Player","GP-GS","PTS","AVG","Rebounds","Assists","Steals","Blocks","FG%","3PT%","FT%"]
cols_present = [c for c in cols_order if c in df.columns]
df = df[cols_present]

# convert to CSV text (no index)
# pandas terminates rows with os.linesep by default, i.e. "\r\n" on Windows.
# Writing that through a text-mode file handle would turn each row ending into
# "\r\r\n", which reads back as a blank line after every row. Normalize to "\n"
# and let the file handle do the platform translation exactly once.
dataset_text = df.to_csv(index=False).replace("\r\n", "\n")

prompt_header = """You are a basketball data analyst. Use ONLY the dataset provided below to answer.
Rules:
- If multiple players tie for a metric, list all of them in the answer.
- Do not guess. If a calculation cannot be made exactly, say “insufficient data.”
- GP is the first number before the hyphen in GP-GS.
- When calculating averages from totals, round to 2 decimal places.
- Show your reasoning briefly in 1-2 sentences per answer.

DATASET (CSV):
"""

questions = """
QUESTIONS:
1) Which player contributed the most total points per minute played?  
2) Who has the highest free throw accuracy among players with at least 50 free throw attempts?  
3) Which player has the highest total combined (Points + Rebounds + Assists) per game?  
4) Which player is the most efficient rebounder (total rebounds divided by games played)?  
5) If we define "impact score" as (Points + Rebounds + Assists + Steals + Blocks), who leads?  
6) Which 3 players would you recommend as starters based on overall contribution (impact score) and shooting efficiency (FG%)?  
7) If we remove players with fewer than 10 games played, who has the highest steals per game ratio?  
8) Identify the player with the largest gap between FG% and 3PT% (absolute value difference).  
9) Which player has the highest points-per-game above the team average points-per-game?  

Format:
- Give each answer as “Answer: <content>”
- Then give 1-2 sentences of reasoning.
- Use ONLY the dataset above, no outside data.
"""

# write file
with open(out_path, "w", encoding="utf-8") as f:
    f.write(prompt_header)
    # Blank line after the last CSV row so "QUESTIONS:" cannot be read as
    # another data row.
    f.write(dataset_text.strip() + "\n\n")
    f.write(questions.strip() + "\n")

print(f"✅ Wrote: {out_path.resolve()}")


✅ Wrote: D:\Python_ws\RA-Syracuse\Task_05_Descriptive_Stats\prompts\complex_questions_round2.txt


In [1]:
import pandas as pd
from pathlib import Path

# Build the "simple questions" LLM prompt (instructions + dataset as CSV +
# questions) and write it to prompts/simple_questions_round2.txt.

# paths
data_path = Path("data/basketball_2023_clean.csv")
out_path = Path("prompts/simple_questions_round2.txt")
out_path.parent.mkdir(parents=True, exist_ok=True)

# load your cleaned dataset
df = pd.read_csv(data_path)

# keep/arrange columns (only those that exist)
cols_order = ["Player","GP-GS","PTS","AVG","Rebounds","Assists","Steals","Blocks","FG%","3PT%","FT%"]
df = df[[c for c in cols_order if c in df.columns]]

# pandas terminates rows with os.linesep by default, i.e. "\r\n" on Windows.
# The prompt is written through a text-mode handle below, which would expand
# that to "\r\r\n" and leave a blank line after every CSV row. Normalize to
# "\n" so newline translation happens exactly once, at write time.
dataset_text = df.to_csv(index=False).replace("\r\n", "\n")

prompt_text = f"""You are a data analyst. Answer ONLY using the dataset I paste below. If a value is not present, say “insufficient data.”
Assumptions:
- The column GP is the number before the hyphen in “GP-GS”.
- Use exact names from the table in your answers.
- For each answer, provide an “Answer:” line and a one-sentence reason.

DATASET (CSV):
{dataset_text.strip()}

QUESTIONS:
1) How many games did the team play this season? (Use the left number of GP-GS.)
2) List the top 3 scorers by total PTS (highest to lowest).
3) Who has the highest AVG (points per game)?
4) Who has the best FG%?
5) Who has the best 3PT%?
6) Who has the most Rebounds?
7) Who has the most Assists?
8) Who has the most Steals?

Formatting:
- For each, give “Answer: <content>”
- Do not include any data not present in the table.
"""

with open(out_path, "w", encoding="utf-8") as f:
    f.write(prompt_text)

print(f"✅ Wrote: {out_path.resolve()}")


✅ Wrote: D:\Python_ws\RA-Syracuse\Task_05_Descriptive_Stats\prompts\simple_questions_round2.txt


In [2]:
from pathlib import Path

# Print the stored LLM prompt, make sure the (initially empty) response files
# exist, and tell the user where to paste the model's answers.

# 1) View the full LLM prompt (copy everything that prints)
p = Path("data/llm_prompt_round2.txt")
prompt_text = p.read_text(encoding="utf-8")
print(prompt_text)

# 2) Create result files if they don't exist (you will paste the LLM answers into these)
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

simple_out = results_dir / "llm_responses_simple_round2.txt"
complex_out = results_dir / "llm_responses_complex_round2.txt"

# Question files as written by the prompt-building cells. Keeping them as
# paths here stops the instructions below from pointing at files that do not
# exist (the messages previously said "*_prompts_round2.txt").
simple_questions_path = Path("prompts/simple_questions_round2.txt")
complex_questions_path = Path("prompts/complex_questions_round2.txt")

for f in (simple_out, complex_out):
    # Only create missing files; never truncate answers that were already pasted.
    if not f.exists():
        f.write_text("", encoding="utf-8")

print("\n✅ Now copy ALL of the printed prompt above and paste it into ChatGPT (or Claude/Copilot).")
print(f"   Ask the simple questions (from {simple_questions_path.as_posix()}) and paste the answers into:")
print(f"   {simple_out.resolve()}")
print(f"   Then ask the complex questions (from {complex_questions_path.as_posix()}) and paste the answers into:")
print(f"   {complex_out.resolve()}")



You are given the Syracuse Men's Basketball 2023-24 season player statistics.
The table includes both raw stats and derived metrics from previous analysis.

Dataset:
Player,GP-GS,PTS,AVG,Rebounds,Assists,Steals,Blocks,FG%,3PT%,FT%,GP,Rebounds_per_game,Impact,FG_3PT_gap

"Mintz, Judah",32-31,602,18.8,103,142,67,4,0.438,0.282,0.765,32,3.21875,918,0.156

"Starling, J.J.",32-32,426,13.3,103,60,27,6,0.458,0.324,0.714,32,3.21875,622,0.134

"Bell, Chris",32-32,384,12.0,75,21,18,22,0.434,0.42,0.842,32,2.34375,520,0.014

"Copeland, Quadir",32-1,307,9.6,148,89,47,6,0.478,0.25,0.683,32,4.625,597,0.2279999999999999

"Brown, Maliq",32-18,305,9.5,230,59,71,29,0.698,0.389,0.721,32,7.1875,694,0.3089999999999999

"Williams, Benny",18-0,97,5.4,71,22,17,16,0.4,0.206,0.538,18,3.9444444444444446,223,0.194

"Taylor, Justin",32-32,159,5.0,127,31,23,8,0.352,0.301,0.645,32,3.96875,348,0.0509999999999999

"McLeod, Naheem",14-14,54,3.9,60,1,3,27,0.594,0.0,0.696,14,4.285714285714286,145,0.594

"Cuffe Jr., Kyle",