17. What is the average number of breaks of serve per match? 


بارگذاری داده power

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

project_root = Path.cwd().parent
processed_v2_dir = project_root / "data" / "processed" / "v2"
figures_dir = project_root / "reports" / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)

df_match = pd.read_parquet(processed_v2_dir / "match_clean_v2.parquet")
df_power = pd.read_parquet(processed_v2_dir / "power_clean_v2.parquet")

print("shape:", df_power.shape)
print(df_power.head())


shape: (469677, 5)
   match_id  set_num  game_num  value  break_occurred
0  11998445        1         1 -52.80           False
1  11998445        1         2  48.14           False
2  11998445        1         3 -51.62           False
3  11998445        1         4  10.00           False
4  11998445        1         5  26.60            True


بررسی ستون‌ها

In [3]:
print("columns:", df_power.columns.tolist())


columns: ['match_id', 'set_num', 'game_num', 'value', 'break_occurred']


شمارش بریک‌ها

In [4]:
# break_occurred = True
df_breaks = df_power[df_power["break_occurred"] == True]

breaks_per_match = (
    df_breaks.groupby("match_id")
    .size()
    .reset_index(name="num_breaks")
    .sort_values("num_breaks", ascending=False)
)

print("Top 10 matches by number of breaks:")
print(breaks_per_match.head(10))


Top 10 matches by number of breaks:
      match_id  num_breaks
9462  12175864          58
7742  12154228          57
976   12042193          57
3847  12088076          55
258   12018722          54
9763  12184920          54
8164  12158363          51
753   12039855          51
2655  12069348          51
2652  12069329          48


میانگین بریک‌ها

In [5]:
avg_breaks = breaks_per_match["num_breaks"].mean()
print("avg of breaks per match:", round(avg_breaks,2))


avg of breaks per match: 14.21


نمودار

In [None]:
project_root = Path.cwd().parent
fig_dir = project_root / "reports" / "figures"
fig_dir.mkdir(parents=True, exist_ok=True)

df_power["break_occurred"] = df_power["break_occurred"].astype(bool)
df_breaks = df_power[df_power["break_occurred"] == True]
breaks_per_match = df_breaks.groupby("match_id").size()

plt.figure(figsize=(10,5))
sns.histplot(breaks_per_match, bins=30, kde=True)
plt.xlabel("Number of Breaks")
plt.ylabel("Number of Matches")
plt.title("Distribution of Number of Breaks in Matches")
plt.savefig(fig_dir / "q17_breaks_distribution.png", dpi=300, bbox_inches="tight")
plt.close()


ذخیره نتیجه

In [None]:
import pandas as pd
from pathlib import Path

project_root = Path.cwd().parent

processed_v2_dir = project_root / "data" / "processed" / "v2"
answers_dir = project_root / "reports" / "answers"
answers_dir.mkdir(parents=True, exist_ok=True)

power_path = processed_v2_dir / "power_clean_v2.parquet"
if not power_path.exists():
    raise FileNotFoundError(f"{power_path} not found. Check processed v2 files.")

df_power = pd.read_parquet(power_path)

print("df_power shape:", df_power.shape)
print(df_power.columns.tolist()[:30])

if "break_occurred" not in df_power.columns:
    raise KeyError("Column 'break_occurred' not found in power table.")

df_power["break_occurred_bool"] = df_power["break_occurred"].astype(str).str.lower().map({
    "true": True, "false": False, "1": True, "0": False, "nan": False, "none": False
}).fillna(False)

breaks_per_match = (
    df_power.groupby("match_id")["break_occurred_bool"]
    .sum()
    .reset_index()
    .rename(columns={"break_occurred_bool": "breaks_count"})
)

power_rows_per_match = df_power.groupby("match_id").size().reset_index(name="power_rows")
breaks_per_match = breaks_per_match.merge(power_rows_per_match, on="match_id", how="left")

output_file = answers_dir / "q17_breaks_per_match.csv"
breaks_per_match.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Saved to {output_file}")
print(breaks_per_match.head())


df_power shape: (469677, 5)
['match_id', 'set_num', 'game_num', 'value', 'break_occurred']
Saved to c:\Users\mit\Desktop\Finaaal\reports\answers\q17_breaks_per_match.csv
   match_id  breaks_count  power_rows
0  11998445            18          66
1  11998446            12          34
2  11998447             8          32
3  11998448             6          20
4  11998449             3          17
