In [None]:
from pathlib import Path
import pandas as pd
import pyarrow.parquet as pq

NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

RAW_TENNIS_DIR = PROJECT_ROOT / "data" / "raw" / "tennis_data"

OUTPUT_RAW_DIR = PROJECT_ROOT / "data" / "raw"
OUTPUT_RAW_DIR.mkdir(parents=True, exist_ok=True)

wanted = [
    "player_id", "full_name", "name", "gender", "country", "height",
    "current_rank", "slug", "birthplace", "plays", "current_prize", "total_prize"
]

players_all = []

for day in RAW_TENNIS_DIR.iterdir():
    p = day / "data" / "raw" / "raw_match_parquet"
    if not p.exists():
        continue

    for f in p.glob("*_team_*.parquet"):
        schema_cols = set(pq.read_schema(f).names)
        cols = [c for c in wanted if c in schema_cols]

        if not cols:  # Skip if no wanted columns exist in the file
            continue

        df = pd.read_parquet(f, columns=cols)

        # If full_name is missing but "name" exists, use name as full_name
        if "full_name" not in df.columns and "name" in df.columns:
            df = df.rename(columns={"name": "full_name"})

        players_all.append(df)

players = pd.concat(players_all, ignore_index=True)

# Keep the most complete record per player (based on non-null count)
players["_nn"] = players.notna().sum(axis=1)
players_unique = (
    players.sort_values("_nn", ascending=False)
           .drop_duplicates(subset=["player_id"], keep="first")
           .drop(columns=["_nn"])
)

output_path = OUTPUT_RAW_DIR / "players.parquet"
players_unique.to_parquet(output_path, index=False)

print(f"Saved {players_unique.shape[0]} unique players to: {output_path}")

  players = pd.concat(players_all, ignore_index=True)


Saved 2644 unique players to: c:\Users\mit\Desktop\Finaaal\data\processed\players.parquet


In [3]:
from pathlib import Path
import pandas as pd

# مسیر از محل نوتبوک
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# مسیر فایل پردازش‌شده
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
players_path = PROCESSED_DIR / "players.parquet"

players = pd.read_parquet(players_path)

print("DataFrame Shape:", players.shape)
print("Columns:", players.columns.tolist())
print(players.head(10))

# To check columns


DataFrame Shape: (2644, 12)
Columns: ['player_id', 'full_name', 'name', 'gender', 'country', 'height', 'current_rank', 'slug', 'birthplace', 'plays', 'current_prize', 'total_prize']
   player_id               full_name                name gender  \
0      88992       Muller, Alexandre           Muller A.      M   
1     248846           Mayot, Harold            Mayot H.      M   
2     192013  Auger-Aliassime, Felix  Auger-Aliassime F.      M   
3     192862           Damas, Miguel            Damas M.      M   
4     269291         Forejtek, Jonas         Forejtek J.      M   
5     215205            Zhu, Michael              Zhu M.      M   
6      77223         Martinez, Pedro         Martínez P.      M   
7      63642   Kwiatkowski, Thai-Son      Kwiatkowski T.      M   
8     460898        Bervid, Victoria           Bervid V.      F   
9      52025             Wang, Yafan             Wang Y.      F   

          country  height  current_rank                   slug  \
0          Fra

To create matches_event.parquet 

In [None]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent

RAW_TENNIS_DIR = PROJECT_ROOT / "data" / "raw" / "tennis_data"

OUTPUT_RAW_DIR = PROJECT_ROOT / "data" / "raw"
OUTPUT_RAW_DIR.mkdir(parents=True, exist_ok=True)

event_files = list(RAW_TENNIS_DIR.glob("**/data/raw/raw_match_parquet/event_*.parquet"))
print("Number of event files:", len(event_files))

dfs = []

for f in event_files:
    try:
        df = pd.read_parquet(f, engine="pyarrow")
        dfs.append(df)
    except Exception as e:
        print(f"Error reading {f}: {e}")

if dfs:
    events_df = pd.concat(dfs, ignore_index=True)

    # Drop duplicates by match_id
    if "match_id" in events_df.columns:
        events_df = events_df.drop_duplicates(subset=["match_id"])

    out_file = OUTPUT_RAW_DIR / "matches_event.parquet"
    events_df.to_parquet(out_file, engine="pyarrow", index=False)

    print("File created:", out_file)
    print("Number of rows:", len(events_df))
    print("Columns:", list(events_df.columns))
    print(events_df.head(5))


Number of event files: 35053
File created: c:\Users\mit\Desktop\Finaaal\data\raw\matches_event.parquet
Number of rows: 16873
Columns: ['match_id', 'first_to_serve', 'home_team_seed', 'away_team_seed', 'custom_id', 'winner_code', 'default_period_count', 'start_datetime', 'match_slug', 'final_result_only']
   match_id first_to_serve home_team_seed away_team_seed  custom_id  \
0  11974053           None           None           None    NNfsQTf   
1  11974066           None           None           None    hTfsrpg   
2  11998445              1           None              3  nPBbsdgpc   
3  11998446              2           None           None   PfAsFyjc   
4  11998447              1              4           None    FQAsyUF   

  winner_code  default_period_count  start_datetime               match_slug  \
0           1                     3      1706878800  switzerland-netherlands   
1           2                     3      1706871600              ukraine-usa   
2           2              

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

df = pd.read_parquet(DATA_RAW / "players.parquet").copy()

# Ensure 'height' is numeric, convert invalid entries to NaN
h = pd.to_numeric(df.get("height"), errors="coerce")

# Normalize heights
height_m = np.where(
    (h > 1.3) & (h < 2.3),
    h,
    np.where(
        (h >= 135) & (h <= 225),
        h / 100.0,
        np.nan
    )
)

# Round and keep only heights in the plausible range (1.35m–2.25m) ===
height_m = np.round(height_m, 2)
valid_mask = (height_m >= 1.35) & (height_m <= 2.25)

# Keep only valid rows and update 'height' column ===
clean = df.loc[valid_mask].copy()
clean["height"] = height_m[valid_mask]

# Drop 'height_clean' column if present
if "height_clean" in clean.columns:
    clean = clean.drop(columns=["height_clean"])

output_path = DATA_PROCESSED / "players_clean_heights.parquet"
clean.to_parquet(output_path, index=False)

print("Total players in original dataset:", len(df))
print("Players with valid height (in meters):", len(clean))
print("Cleaned dataset saved to:", output_path)
print("Sample rows:")
print(clean.head(10))


Total players in original dataset: 2644
Players with valid height (in meters): 1349
Cleaned dataset saved to: c:\Users\mit\Desktop\Finaaal\data\processed\players_clean_heights.parquet
Sample rows:
   player_id               full_name                name gender  \
0      88992       Muller, Alexandre           Muller A.      M   
1     248846           Mayot, Harold            Mayot H.      M   
2     192013  Auger-Aliassime, Felix  Auger-Aliassime F.      M   
3     192862           Damas, Miguel            Damas M.      M   
4     269291         Forejtek, Jonas         Forejtek J.      M   
5     215205            Zhu, Michael              Zhu M.      M   
6      77223         Martinez, Pedro         Martínez P.      M   
7      63642   Kwiatkowski, Thai-Son      Kwiatkowski T.      M   
8     460898        Bervid, Victoria           Bervid V.      F   
9      52025             Wang, Yafan             Wang Y.      F   

          country  height  current_rank                   slug  \

Outliers Check

In [11]:
import pandas as pd
from pathlib import Path

# مسیرهای پروژه
PROJECT_ROOT = Path.cwd().parent  # پوشه اصلی پروژه
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

players = pd.read_parquet(DATA_PROCESSED / "players_clean_heights.parquet")

# بررسی داده‌های خارج از محدوده منطقی [1.35, 2.25] متر
outliers = players[(players["height"] < 1.35) | (players["height"] > 2.25)]
print("\nNumber of outliers outside [1.35, 2.25]:", len(outliers))



Number of outliers outside [1.35, 2.25]: 0


In [14]:
import pandas as pd
from pathlib import Path

# مسیر پروژه
PROJECT_ROOT = Path.cwd().parent
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

# بارگذاری فایل
players = pd.read_parquet(DATA_PROCESSED / "players_clean_heights.parquet")

# محاسبه درصد NaN برای هر ستون
nan_percent = (players.isna().sum() / len(players) * 100).round(2)

# نمایش نتیجه
print("Percentage of NaN values per column:")
print(nan_percent)


Percentage of NaN values per column:
player_id         0.00
full_name         0.00
name              0.00
gender            0.07
country           0.00
height            0.00
current_rank      0.67
slug              0.00
birthplace        0.44
plays            28.17
current_prize     0.59
total_prize       0.00
dtype: float64


این کد داده‌های خام شلوغ و پراکنده از ۶۰ روز فولدر رو برمی‌داره، بازی‌ها رو پیدا می‌کنه، بازیکن‌های هر بازی رو با اطلاعات تورنومنت و برنده ترکیب می‌کنه، و یک فایل تمیز و یکتا تحویل می‌ده.

In [None]:
from pathlib import Path
import pandas as pd
import glob
import os

PROJECT_ROOT = Path.cwd().parent
RAW_TENNIS_DIR = PROJECT_ROOT / "data" / "raw" / "tennis_data"
RAW_DIR = PROJECT_ROOT / "data" / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

def load_events():
    raw_event_file = RAW_DIR / "matches_event.parquet"
    desired_cols = ["match_id", "start_datetime", "winner_code", "tournament_id", "tournament_name"]

    if raw_event_file.exists():
        df = pd.read_parquet(raw_event_file)
        return df[[c for c in desired_cols if c in df.columns]]

    files = glob.glob(str(RAW_TENNIS_DIR / "*" / "data" / "raw" / "raw_match_parquet" / "event_*.parquet"))
    dfs = []
    for f in files:
        try:
            dfs.append(pd.read_parquet(f, engine="pyarrow"))
        except Exception as e:
            print("read error:", f, e)
    if not dfs:
        raise SystemExit("No event files found.")

    evt = pd.concat(dfs, ignore_index=True)
    keep = [c for c in desired_cols if c in evt.columns]
    return evt[keep].drop_duplicates(subset=["match_id"]) if "match_id" in evt.columns else evt.drop_duplicates()

def load_team(side: str):
    raw_team_file = RAW_DIR / f"matches_{side}_team.parquet"
    pattern = f"{side}_team*.parquet"

    if raw_team_file.exists():
        return pd.read_parquet(raw_team_file)

    files = glob.glob(str(RAW_TENNIS_DIR / "*" / "data" / "raw" / "raw_match_parquet" / pattern))
    dfs = []
    for f in files:
        try:
            dfs.append(pd.read_parquet(f, engine="pyarrow"))
        except Exception as e:
            print("read error:", f, e)
    if not dfs:
        raise SystemExit(f"No {side}_team files found.")

    df = pd.concat(dfs, ignore_index=True)
    if {"match_id", "player_id"}.issubset(df.columns):
        df = df.drop_duplicates(subset=["match_id", "player_id"])
    else:
        df = df.drop_duplicates()
    return df

events = load_events()
home = load_team("home")
away = load_team("away")

def norm_team(df, side_label):
    cols = [c for c in ["match_id", "player_id", "full_name", "name", "team_name"] if c in df.columns]
    out = df[cols].copy()
    if "full_name" not in out.columns and "name" in out.columns:
        out = out.rename(columns={"name": "full_name"})
    out["side"] = side_label
    return out

home_n = norm_team(home, "home")
away_n = norm_team(away, "away")

# Combine players from both sides
players_per_match = pd.concat([home_n, away_n], ignore_index=True)

# Merge with event info
res = players_per_match.merge(
    events,
    on="match_id",
    how="left",
    validate="many_to_one" if "match_id" in events.columns else "many_to_many"
)

# Mark winners where possible
if "winner_code" in res.columns:
    res["won"] = (res["side"] == res["winner_code"]).where(res["winner_code"].notna(), pd.NA)
else:
    res["won"] = pd.NA

priority = ["match_id", "player_id", "full_name", "side", "won", "start_datetime", "tournament_id", "tournament_name"]
ordered_cols = [c for c in priority if c in res.columns] + [c for c in res.columns if c not in priority]
res = res[ordered_cols]

output_file = RAW_DIR / "match_results_player.parquet"
res.to_parquet(output_file, engine="pyarrow", index=False)

print(f"Saved {output_file} | rows: {len(res)}")
print(res.head(5))


  df = pd.concat(dfs, ignore_index=True)
  df = pd.concat(dfs, ignore_index=True)


Saved c:\Users\mit\Desktop\Finaaal\data\raw\match_results_player.parquet | rows: 57825
   match_id  player_id            full_name  side    won  start_datetime  \
0  11998445   287803.0       Cazaux, Arthur  home  False      1706810400   
1  11998446    62790.0  Lestienne, Constant  home  False      1706800800   
2  11998447    64580.0         Ćorić, Borna  home  False      1706794200   
3  11998448   131442.0        Mmoh, Michael  home  False      1706707800   
4  11998449    22218.0        Paire, Benoit  home  False      1706725200   

           name  winner_code  
0     Cazaux A.          2.0  
1  Lestienne C.          2.0  
2      Ćorić B.          1.0  
3       Mmoh M.          1.0  
4      Paire B.          2.0  


In [None]:
from pathlib import Path
import glob
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
RAW_TENNIS_DIR = PROJECT_ROOT / "data" / "raw" / "tennis_data"
RAW_DIR = PROJECT_ROOT / "data" / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

stats_all_file = RAW_DIR / "statistics_all.parquet"

stat_files = glob.glob(
    str(RAW_TENNIS_DIR / "**" / "data" / "raw" / "raw_statistics_parquet" / "*.parquet"),
    recursive=True
)
print(f"Statistics files found: {len(stat_files)}")
if not stat_files:
    raise RuntimeError("No statistics files found.")

# Read and keep only important columns
use_cols = ["match_id", "period", "statistic_name", "home_stat", "away_stat"]
dfs = []
for f in stat_files:
    try:
        df = pd.read_parquet(f, columns=[c for c in use_cols if c != "period"])
        if "period" not in df.columns:
            df["period"] = pd.NA
        dfs.append(df)
    except Exception:
        continue

stats_all = pd.concat(dfs, ignore_index=True)

stats_all.to_parquet(stats_all_file, index=False)
print(f"Saved merged statistics to: {stats_all_file} | Rows: {len(stats_all):,}")
print(stats_all.head(5))


Statistics files found: 23291
Saved merged statistics to: c:\Users\mit\Desktop\Finaaal\data\raw\statistics_all.parquet | Rows: 1,358,234
   match_id      statistic_name     home_stat    away_stat period
0  11998445                aces            12            6    NaN
1  11998445       double_faults             2            7    NaN
2  11998445         first_serve  57/101 (56%)  53/90 (59%)    NaN
3  11998445        second_serve   42/44 (95%)  30/37 (81%)    NaN
4  11998445  first_serve_points   42/57 (74%)  39/53 (74%)    NaN


پر کردن وینر کد های خالی و ساخت یک  فایل پارکت جدید :

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

PROJECT_ROOT = Path.cwd().parent
stats_all_file = PROJECT_ROOT / "data" / "raw" / "statistics_all.parquet"
player_file = PROJECT_ROOT / "data" / "raw" / "match_results_player.parquet"
out_file = PROJECT_ROOT / "data" / "processed" / "match_results_player_filled.parquet"

stats = pd.read_parquet(stats_all_file)

stats = stats[stats["statistic_name"] == "total_won"].copy()

stats["_period_rank"] = np.where(stats["period"].fillna("") == "ALL", 0, 1)

# Keep one row per match_id (best period first)
stats = (
    stats
    .sort_values(["_period_rank"])
    .drop_duplicates(subset=["match_id"], keep="first")
)

# Clean & determine winner_code from stats
for c in ["home_stat", "away_stat"]:
    stats[c] = pd.to_numeric(stats[c], errors="coerce")

stats["winner_from_stats"] = np.where(
    stats["home_stat"] > stats["away_stat"], 1,
    np.where(stats["away_stat"] > stats["home_stat"], 2, np.nan)
)

winner_map = stats[["match_id", "winner_from_stats", "home_stat", "away_stat"]].rename(
    columns={"home_stat": "total_won_home", "away_stat": "total_won_away"}
)

# Load match_results_player.parquet and fill missing winner_code
players = pd.read_parquet(player_file)

before_null = players["winner_code"].isna().sum() if "winner_code" in players.columns else len(players)
if "winner_code" not in players.columns:
    players["winner_code"] = np.nan

# Merge and fill
merged = players.merge(winner_map, on="match_id", how="left")
mask_fill = merged["winner_code"].isna() & merged["winner_from_stats"].notna()
merged.loc[mask_fill, "winner_code"] = merged.loc[mask_fill, "winner_from_stats"]

after_null = merged["winner_code"].isna().sum()
filled_rows = int(before_null - after_null)


out_file.parent.mkdir(parents=True, exist_ok=True)
merged.to_parquet(out_file, index=False)

# Report
print(f"Saved: {out_file}")
print(f"Matches with total_won available: {winner_map['match_id'].nunique():,}")
print(f"Newly filled winner_code (from statistics): {filled_rows:,}")
print(f"Remaining NaN winner_code: {after_null:,}")


Saved: c:\Users\mit\Desktop\Finaaal\data\processed\match_results_player_filled.parquet
Matches with total_won available: 11,209
Newly filled winner_code (from statistics): 3,019
Remaining NaN winner_code: 4,636


In [24]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
TENIS_RAW = PROJECT_ROOT / "data" / "raw" / "tennis_data"
PLAYER_FILLED_PATH = PROJECT_ROOT / "data" / "processed" / "match_results_player_filled.parquet"

df_filled = pd.read_parquet(PLAYER_FILLED_PATH)

home_dfs = []
away_dfs = []

for date_dir in sorted(TENIS_RAW.iterdir()):
    if date_dir.is_dir() and "20240201" <= date_dir.name <= "20240331":
        raw_dir = date_dir / "data" / "raw" / "raw_match_parquet"
        if raw_dir.exists():
            for f in raw_dir.rglob("home_team_*.parquet"):
                df_home = pd.read_parquet(f).rename(columns={
                    "player_id": "home_player_id",
                    "full_name": "home_full_name"
                })
                home_dfs.append(df_home)
            for f in raw_dir.rglob("away_team_*.parquet"):
                df_away = pd.read_parquet(f).rename(columns={
                    "player_id": "away_player_id",
                    "full_name": "away_full_name"
                })
                away_dfs.append(df_away)

home_all = pd.concat(home_dfs, ignore_index=True)
away_all = pd.concat(away_dfs, ignore_index=True)

merged_home_away = pd.merge(home_all, away_all, on="match_id", suffixes=("_home", "_away"))
df_updated = pd.merge(df_filled, merged_home_away, on="match_id", how="left")

df_updated.to_parquet(PLAYER_FILLED_PATH, index=False)


  home_all = pd.concat(home_dfs, ignore_index=True)
  away_all = pd.concat(away_dfs, ignore_index=True)


In [25]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
INPUT_FILE = PROJECT_ROOT / "data" / "processed" / "match_results_player_filled.parquet"
OUTPUT_FILE = PROJECT_ROOT / "data" / "processed" / "match_results_with_winner.parquet"

df = pd.read_parquet(INPUT_FILE)

print(f"Total rows: {len(df)}")
print(f"Unique match_id: {df['match_id'].nunique()}")

df = df.drop_duplicates(subset="match_id").copy()

def get_winner(row):
    if row["winner_code"] == 1:
        return row["name_home"]
    if row["winner_code"] == 2:
        return row["name_away"]
    if pd.notna(row.get("total_won_home")) and pd.notna(row.get("total_won_away")):
        if row["total_won_home"] > row["total_won_away"]:
            return row["name_home"]
        if row["total_won_home"] < row["total_won_away"]:
            return row["name_away"]
    return None

df["winner_name"] = df.apply(get_winner, axis=1)

print(f"Matches without determined winner_name: {df['winner_name'].isna().sum()}")

df.to_parquet(OUTPUT_FILE, index=False)
print(f"File saved to: {OUTPUT_FILE}")


Total rows: 849506
Unique match_id: 16873
Matches without determined winner_name: 4478
File saved to: c:\Users\mit\Desktop\Finaaal\data\processed\match_results_with_winner.parquet


بازیکن با بیشترین برد :

In [19]:
import pandas as pd
from pathlib import Path

file_path = Path("../data/processed/match_results_with_winner.parquet")
df = pd.read_parquet(file_path)

# ساخت ستون slug برنده بر اساس winner_code
df["winner_slug"] = df.apply(
    lambda row: row["slug_home"] if row["winner_code"] == 1 else row["slug_away"],
    axis=1
)

# پیدا کردن نفر اول با بیشترین برد
top_winner_stats = (
    df.groupby("winner_slug")
    .size()
    .reset_index(name="win_count")
    .sort_values("win_count", ascending=False)
    .head(1)
)

print(top_winner_stats)

# I used slug to count more accurately because it's unique

       winner_slug  win_count
1685  popko-dmitry         32


In [34]:
import pandas as pd
from pathlib import Path

# تعیین روت پروژه (یک سطح بالاتر از اسکریپت فعلی)
project_root = Path.cwd().parent

# ساخت مسیر فایل players.parquet در مسیر raw
file_path = project_root / "data" / "raw" / "players.parquet"

# لود داده‌ها
df = pd.read_parquet(file_path)

# تعداد کل ردیف‌ها
total_rows = len(df)

# تعداد و درصد مقادیر NaN در ستون current_rank
nan_count = df["current_rank"].isna().sum()
nan_percent = (nan_count / total_rows) * 100

print(f"Nan Count: {nan_count}")
print(f"Nan Percentage: {nan_percent:.2f}%")

# Nan % is low. It's ready to analyze.


Nan Count: 60
Nan Percentage: 2.27%


In [37]:
import pandas as pd
from pathlib import Path

# 1. تعیین روت پروژه (یک سطح بالاتر از محل اجرای اسکریپت)
project_root = Path.cwd().parent

# 2. ساخت مسیر فایل players.parquet
file_path = project_root / "data" / "raw" / "players.parquet"

# 3. بارگذاری دیتاست
df = pd.read_parquet(file_path)

# 4. نمایش اولین 49 مقدار ستون 'plays'
print(df["plays"].head(10))

# 5. محاسبه درصد NaN در ستون 'plays'
nan_percentage = df["plays"].isna().mean() * 100
print(f"\nPercentage of NaN in 'plays': {nan_percentage:.2f}%")

# Displays the first 49 values of the 'plays' column
# from players.parquet and prints the percentage of missing (NaN) values in that column.


0    right-handed
1    right-handed
2    right-handed
3    right-handed
4    right-handed
5    right-handed
6    right-handed
7    right-handed
8    right-handed
9    right-handed
Name: plays, dtype: object

Percentage of NaN in 'plays': 56.66%


In [38]:
import pandas as pd
from difflib import get_close_matches
from pathlib import Path

# 1. تعیین مسیر روت پروژه (یک سطح بالاتر از پوشه اسکریپت جاری)
project_root = Path.cwd().parent

# 2. مسیر کامل به فایل players.parquet در data/raw
players_file = project_root / "data" / "raw" / "players.parquet"

# 3. بارگذاری داده‌ها
players = pd.read_parquet(players_file)

# 4. لیست یکتا و تمیز از کشورها
countries = sorted(players["country"].dropna().str.strip().unique())

# 5. پیدا کردن نام‌های مشابه
checked = set()
for country in countries:
    if country not in checked:
        matches = get_close_matches(country, countries, n=5, cutoff=0.85)
        if len(matches) > 1:
            print(matches)
        checked.update(matches)

        # Checks for duplicated or misspelled country names by
        # finding similar names with a high match ratio.
        # In this dataset countries are cleaned. (Australia and Austria are not the same)


['Australia', 'Austria']


In [39]:
import pandas as pd
from pathlib import Path

# 1. تعیین مسیر روت پروژه
project_root = Path.cwd().parent

# 2. مسیر کامل به فایل
match_file = project_root / "data" / "raw" / "match_results_player.parquet"

# 3. بارگذاری داده‌ها
df = pd.read_parquet(match_file)

# 4. محاسبه مجموع ردیف‌ها و درصد winner_codeهای خالی
total = len(df)
missing = df["winner_code"].isna().sum()
percent = (missing / total) * 100

# 5. چاپ نتایج
print(f"Total rows: {total}")
print(f"Missing winner_code: {missing}")
print(f"Percent missing: {percent:.2f}%")


Total rows: 57825
Missing winner_code: 7655
Percent missing: 13.24%


In [41]:
import pandas as pd
from pathlib import Path
import re

# Paths
PROJECT_ROOT = Path.cwd().parent
TENIS_RAW = PROJECT_ROOT / "data" / "raw" / "tennis_data"
PROC_DIR = PROJECT_ROOT / "data" / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)

all_tournaments = []

# Loop through dated folders like 2024****
for date_dir in TENIS_RAW.glob("2024*"):
    if date_dir.is_dir() and re.match(r"2024\d{4}", date_dir.name):
        raw_match_path = date_dir / "data" / "raw" / "raw_match_parquet"
        if raw_match_path.exists():
            for file in raw_match_path.glob("tournament_*.parquet"):
                try:
                    df = pd.read_parquet(file)
                    needed_cols = [
                        "match_id", "tournament_id",
                        "tournament_name", "tournament_slug", "start_datetime"
                    ]
                    available_cols = [c for c in needed_cols if c in df.columns]
                    if available_cols:
                        all_tournaments.append(df[available_cols])
                except Exception as e:
                    print(f"Error reading {file}: {e}")

# Combine and save
if all_tournaments:
    tournaments_df = (
        pd.concat(all_tournaments, ignore_index=True)
          .drop_duplicates()
    )
    output_file = PROC_DIR / "tournaments_all_2024.parquet"
    tournaments_df.to_parquet(output_file, index=False)
    print(f"Saved: {output_file}")
    print(tournaments_df.head())
else:
    print("No tournament files found.")


Saved: c:\Users\mit\Desktop\Finaaal\data\processed\tournaments_all_2024.parquet
   match_id  tournament_id      tournament_name     tournament_slug
0  11974053          70826           Qualifiers          qualifiers
1  11974066          70826           Qualifiers          qualifiers
2  11998445         126168  Montpellier, France  montpellier-france
3  11998446         126168  Montpellier, France  montpellier-france
4  11998447         126168  Montpellier, France  montpellier-france


برای سوال نهم اطلاعات مربوط به راند بازی هارو به دو بخش فوریه و مارس تقسیم کردم : 

In [42]:
import pandas as pd
from pathlib import Path

# Paths
PROJECT_ROOT = Path.cwd().parent
TENIS_RAW = PROJECT_ROOT / "data" / "raw" / "tennis_data"
PROC_DIR = PROJECT_ROOT / "data" / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Month ranges
feb_dirs = [d for d in TENIS_RAW.glob("202402*") if d.is_dir()]
mar_dirs = [d for d in TENIS_RAW.glob("202403*") if d.is_dir()]

def process_month(dirs, save_name):
    dfs = []
    for date_dir in dirs:
        parquet_dir = date_dir / "data" / "raw" / "raw_match_parquet"
        if parquet_dir.exists():
            for file in parquet_dir.glob("round_*.parquet"):
                try:
                    df = pd.read_parquet(file)
                    required = ["match_id", "round_id", "slug", "cup_round_type"]
                    if set(required).issubset(df.columns):
                        dfs.append(df[required])
                    else:
                        print(f"Skipped (missing cols): {file}")
                except Exception as e:
                    print(f"Error reading {file}: {e}")
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True).drop_duplicates()
        save_path = PROC_DIR / save_name
        combined_df.to_parquet(save_path, index=False)
        print(f"Saved {len(combined_df)} rows to {save_path}")
    else:
        print(f"No valid round parquet files found for {save_name}")

# Feb & Mar 2024
process_month(feb_dirs, "match_round_info_february.parquet")
process_month(mar_dirs, "match_round_info_march.parquet")


Saved 4274 rows to c:\Users\mit\Desktop\Finaaal\data\processed\match_round_info_february.parquet
Saved 5159 rows to c:\Users\mit\Desktop\Finaaal\data\processed\match_round_info_march.parquet


In [None]:
import pandas as pd
from pathlib import Path

PROC_DIR = Path("../data/processed")

def show_cup_round_counts(filename, label):
    file_path = PROC_DIR / filename
    if file_path.exists():
        df = pd.read_parquet(file_path)

        print(f"\n{label} cup_round_type values:")
        print(df["cup_round_type"].value_counts(dropna=False).sort_index())

        # Calculate NaN percentage
        total_rows = len(df)
        nan_count = df["cup_round_type"].isna().sum()
        nan_percentage = (nan_count / total_rows) * 100
        print(f"NaN percentage: {nan_percentage:.2f}% ({nan_count}/{total_rows})")


show_cup_round_counts("match_round_info_february.parquet", "February")
show_cup_round_counts("match_round_info_march.parquet", "March")



February cup_round_type values:
cup_round_type
1.0      101
2.0      210
4.0      505
8.0     1100
16.0    1861
NaN      497
Name: count, dtype: int64
NaN percentage: 11.63% (497/4274)

March cup_round_type values:
cup_round_type
1.0      159
2.0      340
4.0      624
8.0     1112
16.0    2025
NaN      899
Name: count, dtype: int64
NaN percentage: 17.43% (899/5159)


To add ground_type to my tournament file:

In [None]:
import pandas as pd
from pathlib import Path

project_root = Path.cwd().parent
raw_dir = project_root / "data" / "raw" / "tennis_data"
processed_dir = project_root / "data" / "processed"
tourn_path = processed_dir / "tournaments_all_2024.parquet"

df_tourn = pd.read_parquet(tourn_path)
print("Before merge:", df_tourn.shape)

files = list(raw_dir.glob("2024*/data/raw/raw_match_parquet/tournament_*.parquet"))
print(f"Found {len(files)} tournament_*.parquet files")

dfs = []
for f in files:
    try:
        df_tmp = pd.read_parquet(f, columns=["match_id", "ground_type"])
        dfs.append(df_tmp)
    except Exception as e:
        print(f"Error reading {f}: {e}")

if dfs:
    df_ground = pd.concat(dfs, ignore_index=True).drop_duplicates(subset=["match_id"])

    df_merged = df_tourn.merge(df_ground, on="match_id", how="left")

    df_merged.to_parquet(tourn_path, index=False)
    print("After merge:", df_merged.shape)
    print(df_merged.head())
else:
    print("No tournament_*.parquet files with 'ground_type' found.")


Before merge: (16873, 4)
Found 35671 tournament_*.parquet files
After merge: (16873, 5)
   match_id  tournament_id      tournament_name     tournament_slug  \
0  11974053          70826           Qualifiers          qualifiers   
1  11974066          70826           Qualifiers          qualifiers   
2  11998445         126168  Montpellier, France  montpellier-france   
3  11998446         126168  Montpellier, France  montpellier-france   
4  11998447         126168  Montpellier, France  montpellier-france   

        ground_type  
0              None  
1              None  
2  Hardcourt indoor  
3  Hardcourt indoor  
4  Hardcourt indoor  


...

In [1]:
from pathlib import Path
import pandas as pd

project_root = Path.cwd().parent
raw_dir = project_root / "data" / "raw" / "tennis_data"

# پیدا کردن تمام فایل های time_*.parquet
files = list(raw_dir.glob("2024*/data/raw/raw_match_parquet/time_*.parquet"))
print(f"Found {len(files)} time_*.parquet files")

dfs = []
for f in files:
    try:
        df_tmp = pd.read_parquet(f)
        # فقط ستون های match_id و period_* رو برمی‌داریم
        period_cols = [c for c in df_tmp.columns if c.startswith("period_")]
        keep_cols = ["match_id"] + period_cols
        df_tmp = df_tmp[keep_cols]
        dfs.append(df_tmp)
    except Exception as e:
        print(f"Error reading {f}: {e}")

df_periods = pd.concat(dfs, ignore_index=True)

print("Shape:", df_periods.shape)
print(df_periods.head())

out_dir = project_root / "data" / "processed"
out_dir.mkdir(parents=True, exist_ok=True)
df_periods.to_parquet(out_dir / "all_match_periods.parquet", index=False)
print(f"Saved to {out_dir / 'all_match_periods.parquet'}")


Found 35671 time_*.parquet files
Shape: (35671, 6)
   match_id period_1 period_2 period_3 period_4 period_5
0  11974053     None     None     None     None     None
1  11974066     None     None     None     None     None
2  11998445     3259     2639     4202     None     None
3  11998446     2488     2375     None     None     None
4  11998447     3741     1913     None     None     None
Saved to c:\Users\mit\Desktop\Finaaal\data\processed\all_match_periods.parquet
