# TTTM - Table Tennis Score Prediction - Israel
## In this project we will try to predict the scores of the matches and the forms that each team fills in


### Initial imports

In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


### Open csv 

In [2]:
def read_csv(filename):
    """Load a CSV file that lives relative to the current working directory.

    Args:
        filename: Path of the file, relative to the working directory
            (``"./"`` is prepended to it).

    Returns:
        The ``pandas.DataFrame`` parsed with pandas' default options.
    """
    csv_path = "./" + filename
    return pd.read_csv(csv_path)


### Dataset - links to every player's personal profile page

#### Crawling 'http://www.tttm.co.il' to prepare the first dataset, which will contain each player's personal link

In [5]:
# Module-level state shared by the cells below.
players_hrefs = []  # profile links collected from the ranking pages
top_players_df = None  # players table; filled in by the loading cell


def get_players_df():
    """Crawl tttm.co.il and write the players table to ``players_data_table.csv``.

    The crawl runs in two passes:

    1. Ranking pages ``MS-1`` .. ``MS-10`` are fetched and the first link found
       in each table row is appended to the module-level ``players_hrefs``.
    2. Every link in ``players_hrefs`` is fetched and the player's name, club,
       rank, category, points and id are read from the profile page.

    Side effects:
        Appends to ``players_hrefs``, prints progress, and writes
        ``players_data_table.csv`` (including the DataFrame index) to the
        working directory.

    Returns:
        pandas.DataFrame: the table that was written to disk. The module-level
        ``top_players_df`` is NOT modified by this function.

    Raises:
        requests.RequestException: on network failure or timeout.
        AttributeError, IndexError, ValueError: when a page does not have the
            expected structure.
    """
    # Without a timeout requests may block forever on a stalled connection.
    request_timeout = 30  # seconds

    print("Start - Retrieve data from site")
    for i in range(1, 11):
        url = f"https://www.tttm.co.il/rk/MS-{i}/%D7%98%D7%A0%D7%99%D7%A1-%D7%A9%D7%95%D7%9C%D7%97%D7%9F-%D7%93%D7%99%D7%A8%D7%95%D7%92-%D7%92%D7%91%D7%A8%D7%99%D7%9D"
        res = requests.get(url, timeout=request_timeout)
        soup = BeautifulSoup(res.content, "html.parser")
        for player in soup.find("table").find_all("tr"):
            a = player.find("a")
            # Rows without a link (presumably header rows) carry no profile
            # URL; skip them explicitly instead of swallowing every exception.
            href = a.get("href") if a is not None else None
            if href is None:
                continue
            players_hrefs.append(href)

    records = []
    for h in players_hrefs:
        res = requests.get("http://tttm.co.il" + h, timeout=request_timeout)
        soup = BeautifulSoup(res.content, "html.parser")
        name = soup.find("div", attrs={"class": "playerName"}).text
        player_presentation = soup.find("div", attrs={"class": "playerPresentation"})
        club = player_presentation.find("a").text
        rank = int(player_presentation.find("table").find("b").text)
        cat_points = player_presentation.find_all("span")
        category = cat_points[0].find("b").text
        points = float(cat_points[1].find("b").text)
        player_id = player_presentation.find("b").text.strip("\n").strip()
        print(name)
        records.append(
            {
                "name": name,
                "club": club,
                "rank": rank,
                "category": category,
                "points": points,
                "id": player_id,
                "link": h,
            }
        )

    # Build the frame once from the collected records; ``columns`` pins the
    # column order even if nothing was collected.
    players_df = pd.DataFrame(
        records,
        columns=["name", "club", "rank", "category", "points", "id", "link"],
    )
    players_df["name"] = players_df["name"].str.strip("\n ")
    players_df["club"] = players_df["club"].str.strip("\n ")
    players_df.to_csv("players_data_table.csv", encoding="utf-8-sig")
    print("Done")
    return players_df

## Try to open players_data_table.csv. If that doesn't work, we will use the function above


In [6]:
# Load the cached players table; crawl the site only when the cache cannot be
# used. Only load failures trigger the (slow) crawl, so Ctrl-C and genuine
# programming errors are no longer swallowed by a bare ``except``.
try:
    top_players_df = read_csv("./players_data_table.csv")
    players_hrefs = top_players_df["link"]
except (OSError, KeyError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # OSError covers a missing/unreadable file, KeyError a cache without the
    # "link" column, and the pandas errors an empty or malformed file.
    get_players_df()
    top_players_df = read_csv("./players_data_table.csv")
    players_hrefs = top_players_df["link"]

In [7]:
# The CSV was written together with its index, which comes back as an
# "Unnamed: 0" column. Drop it when present; errors="ignore" makes the cell
# safe to re-run without hiding unrelated failures behind a bare ``except``.
top_players_df = top_players_df.drop(columns="Unnamed: 0", errors="ignore")

In [8]:
# Show the players table (pandas elides the middle rows of a long frame).
top_players_df

Unnamed: 0,name,club,rank,category,points,id,link
0,יונתן שוסטרמן,מ. בני הרצליה,1,S,1807.8,745,/p/745/יונתן-שוסטרמן
1,מיכאל טאובר,עירוני גבעתיים,2,S,1774.0,853,/p/853/מיכאל-טאובר
2,טל ישראלי,הפועל חיפה,3,E20,1756.9,651,/p/651/טל-ישראלי
3,עמית גורן,הפועל עירוני נוף הגליל,4,S,1712.4,676,/p/676/עמית-גורן
4,גל אלגואטי,מ. בני הרצליה,5,S,1686.0,2183,/p/2183/גל-אלגואטי
...,...,...,...,...,...,...,...
995,איציק סנדרוסי,הפועל ירושלים,996,S50,210.4,3618,/p/3618/איציק-סנדרוסי
996,מיכאל אזורב,מועדון ספורט מעלה אדומים,997,S60,210.0,3545,/p/3545/מיכאל-אזורב
997,איתי רייך,מ. בני הרצליה,998,M13,208.9,7644,/p/7644/איתי-רייך
998,דניאל ירמיהו,הפועל ירושלים,999,S70,208.8,3464,/p/3464/דניאל-ירמיהו


### Let's implement a function that returns the row of a single player
#### get_player_row(player_name) -> return player's row

In [9]:
def get_player_row(player_name):
    """Return the rows of ``top_players_df`` whose ``name`` equals ``player_name``.

    Args:
        player_name: Exact player name to look up.

    Returns:
        A DataFrame holding every matching row. It is EMPTY (not ``None``) when
        no player has that name. ``None`` is returned only when the players
        table is unusable: ``top_players_df`` is still ``None`` or it has no
        ``"name"`` column.
    """
    try:
        return top_players_df[top_players_df["name"] == player_name]
    except (TypeError, KeyError):
        # TypeError: top_players_df is None; KeyError: no "name" column.
        return None

In [10]:
# Smoke test: look up one player and show the name stored in the first match.
# NOTE(review): `.values[0]` raises IndexError when no row matches the name.
test_row = get_player_row("רון דימנט")
test_row["name"].values[0]

'רון דימנט'

## After saving the first dataframe as csv we need to acquire more data

### Creating a new dataset of all the games played by the players whose data we crawled earlier

In [35]:
# Column-oriented accumulator for the matches dataset: one independent, empty
# list per output column, in the order the columns appear in the final frame.
games_to_df = {
    column: []
    for column in (
        "match_id",
        "match_type",
        "date",
        "p1_id",
        "p1_name",
        "p1_club",
        "p1_rank",
        "p1_sets",
        "p1_home",
        "p1_points_gained",
        "p2_id",
        "p2_name",
        "p2_club",
        "p2_rank",
        "p2_sets",
        "p2_home",
        "p2_points_gained",
        "winner_id",
    )
}


## Creating the matches dataset

### We will now crawl every player in the top 500 and create a dataset of all the games each player played


In [36]:
# Crawl the match history of the first 500 players in ``players_hrefs`` and
# append one record per parsed match row to ``games_to_df``.

_CRAWL_TIMEOUT_SECONDS = 30  # without a timeout a stalled request blocks forever


def _opponent_points(p1_points_gained, delta):
    """Derive p2's points change from p1's change and the row's ``delta`` factor."""
    if p1_points_gained < 0:
        return abs(p1_points_gained) * delta
    return (-1) * float(p1_points_gained / delta)


def _parse_match_row(tr, player_details):
    """Turn one ``<tr>`` of a player's match table into a ``games_to_df`` record.

    Args:
        tr: BeautifulSoup tag of the table row.
        player_details: dict with the crawled player's ``name``, ``id``, ``club``.

    Returns:
        dict with exactly one value per ``games_to_df`` column.

    Raises:
        AttributeError, IndexError, KeyError, TypeError, ValueError: when the
            row does not have the expected structure (e.g. a header row), or
            when the opponent's name is not found in ``top_players_df``.
    """
    match_type = tr.find("a", attrs={"class": "fsClub"}).text.strip(
        "\r\n                        "
    )
    td_list = tr.find_all("td")
    match_id = td_list[0].text
    date_string = td_list[2].text

    # Points are optional: when either cell is missing or malformed both
    # players get 0.
    try:
        p1_points_gained = float(td_list[10].find("span").text)
        delta = float(td_list[9].find("b").text)
        p2_points_gained = _opponent_points(p1_points_gained, delta)
    except (AttributeError, IndexError, ValueError, ZeroDivisionError):
        p1_points_gained = 0
        p2_points_gained = 0

    p1_id = player_details["id"]
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    p1_rank = float(re.findall(r"\d+\.\d+", td_list[1].find("span").text)[0])

    p2 = td_list[5].find_all("a")
    p2_name = p2[0].text.strip("    \r\n                    ")
    p2_club = p2[1].text
    p2_rank = float(td_list[6].text)
    # Opponents are resolved by name; a name missing from the players table
    # raises IndexError here, so the row is skipped by the caller.
    # NOTE(review): p1_id is the string scraped from the profile page, while
    # p2_id comes from top_players_df (int64 in the games_df.info() output), so
    # winner_id mixes both types — confirm downstream code copes with that.
    p2_id = top_players_df[top_players_df["name"] == p2_name]["id"].values[0]

    sets = td_list[8].text.split("-")
    p2_sets = int(sets[0])
    p1_sets = int(sets[1])

    # Home/away stays NaN when the teams cell is missing or when the
    # opponent's club matches neither side.
    p1_home = np.nan
    p2_home = np.nan
    try:
        teams = td_list[3].find("i").text.split("-")
        if p2_club in teams[0]:
            p2_home, p1_home = 1, 0
        elif p2_club in teams[1]:
            p1_home, p2_home = 1, 0
    except (AttributeError, IndexError):
        pass

    # Parse the date BEFORE anything is stored: a bad date must drop the whole
    # row rather than leave the columns with different lengths.
    date_time_obj = datetime.strptime(date_string, "%d/%m/%Y")

    return {
        "match_id": match_id,
        "match_type": match_type,
        "date": date_time_obj,
        "p1_id": p1_id,
        "p1_name": player_details["name"],
        "p1_club": player_details["club"],
        "p1_rank": p1_rank,
        "p1_sets": p1_sets,
        "p1_home": p1_home,
        "p1_points_gained": p1_points_gained,
        "p2_id": p2_id,
        "p2_name": p2_name,
        "p2_club": p2_club,
        "p2_rank": p2_rank,
        "p2_sets": p2_sets,
        "p2_home": p2_home,
        "p2_points_gained": p2_points_gained,
        "winner_id": p1_id if p1_sets > p2_sets else p2_id,
    }


for h in range(500):
    try:
        res = requests.get(
            "http://tttm.co.il" + players_hrefs[h], timeout=_CRAWL_TIMEOUT_SECONDS
        )
        soup = BeautifulSoup(res.content, "html.parser")
        # Links inside the "rankpages" block — presumably the pagination of the
        # player's match list; TODO confirm against the site.
        pages_arr = soup.find("div", attrs={"class": "rankpages"}).find_all("a")
        player_presentation = soup.find("div", attrs={"class": "playerPresentation"})
        player_details = {
            "name": player_presentation.find("div", attrs={"class": "playerName"})
            .text.strip("\n")
            .strip(),
            "id": player_presentation.find("b").text.strip("\n").strip(),
            "club": player_presentation.find("a").text.strip("\n").strip(),
        }
        print("Player number: ", h, " ", player_details)
        # Only the first 9 links are followed; a player with fewer links ends
        # early through the IndexError handled below.
        for p in range(9):
            res = requests.get(
                "http://tttm.co.il" + pages_arr[p]["href"],
                timeout=_CRAWL_TIMEOUT_SECONDS,
            )
            soup = BeautifulSoup(res.content, "html.parser")
            table = soup.find("table", attrs={"class": "lstMatchs"})
            for tr in table.find_all("tr"):
                try:
                    record = _parse_match_row(tr, player_details)
                except Exception:
                    # Best effort: rows that do not parse are skipped.
                    # ``Exception`` (not a bare except) keeps Ctrl-C working.
                    continue
                # Append the complete record at once so every column grows by
                # exactly one entry per match.
                for column, value in record.items():
                    games_to_df[column].append(value)
    except Exception:
        # Best effort: network errors, missing pages or unexpected markup move
        # on to the next player. KeyboardInterrupt is deliberately not caught.
        continue


Player number:  0   {'name': 'יונתן שוסטרמן', 'id': '745', 'club': 'מ. בני הרצליה'}
Player number:  1   {'name': 'מיכאל טאובר', 'id': '853', 'club': 'עירוני גבעתיים'}
Player number:  2   {'name': 'טל ישראלי', 'id': '651', 'club': 'הפועל חיפה'}
Player number:  3   {'name': 'עמית גורן', 'id': '676', 'club': 'הפועל עירוני נוף הגליל'}
Player number:  5   {'name': 'יניב שרון', 'id': '882', 'club': "בית''ר ראשון לציון"}
Player number:  6   {'name': 'עמרי בן ארי', 'id': '894', 'club': 'עירוני גבעתיים'}
Player number:  7   {'name': 'אביב בן ארי', 'id': '524', 'club': 'מכבי זאב זכרון יעקב'}
Player number:  8   {'name': 'איתי אביבי', 'id': '2284', 'club': 'מ. בני הרצליה'}
Player number:  9   {'name': 'גל פרפל', 'id': '741', 'club': 'הישגי כרמיאל'}
Player number:  10   {'name': 'רון דוידוביץ', 'id': '881', 'club': 'הפועל עירוני נוף הגליל'}
Player number:  11   {'name': 'עידן אוגורצב', 'id': '862', 'club': 'מכבי תל אביב'}
Player number:  12   {'name': 'איתי שושן', 'id': '2625', 'club': 'כרמל מרכזי

In [38]:
# Materialise the column lists gathered by the crawl as a DataFrame (one
# column per key of games_to_df) and display it.
games_df = pd.DataFrame(games_to_df)
games_df

Unnamed: 0,match_id,match_type,date,p1_id,p1_name,p1_club,p1_rank,p1_sets,p1_home,p1_points_gained,p2_id,p2_name,p2_club,p2_rank,p2_sets,p2_home,p2_points_gained,winner_id
0,338816,ליגת על גברים 2022-2023,2023-01-03,745,יונתן שוסטרמן,מ. בני הרצליה,1807.8,2,1.0,-10.5,676,עמית גורן,הפועל עירוני נוף הגליל,1712.4,3,0.0,15.75,676
1,337989,ליגת על גברים 2022-2023,2022-12-27,745,יונתן שוסטרמן,מ. בני הרצליה,1800.3,3,0.0,1.5,926,שימי אסרף,הפועל חיפה,1071.1,0,1.0,-1.00,745
2,337992,ליגת על גברים 2022-2023,2022-12-27,745,יונתן שוסטרמן,מ. בני הרצליה,1800.3,3,0.0,1.5,1844,נדב דהן,הפועל חיפה,1133.6,0,1.0,-1.00,745
3,337511,ליגת על גברים 2022-2023,2022-12-25,745,יונתן שוסטרמן,מ. בני הרצליה,1800.3,3,1.0,1.5,6601,מקסים גולדין,כרמל מרכזי טניס שולחן חיפה,1057.6,0,0.0,-1.00,745
4,337516,ליגת על גברים 2022-2023,2022-12-25,745,יונתן שוסטרמן,מ. בני הרצליה,1800.3,3,1.0,3.0,685,יניב קרמזין,כרמל מרכזי טניס שולחן חיפה,1368.9,0,0.0,-2.00,745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65099,293649,ליגת א גברים 2021-2022,2022-03-02,3015,טארק קעדאן,הפועל מתנ''ס בקה אל גרבייה,469.0,0,0.0,-1.0,1579,אסף גונן,מכבי זאב זכרון יעקב,871.6,3,1.0,1.00,1579
65100,293651,ליגת א גברים 2021-2022,2022-03-02,3015,טארק קעדאן,הפועל מתנ''ס בקה אל גרבייה,469.0,3,0.0,5.0,3947,ליאב כהן,מכבי זאב זכרון יעקב,302.1,1,1.0,-5.00,3015
65101,293653,ליגת א גברים 2021-2022,2022-03-02,3015,טארק קעדאן,הפועל מתנ''ס בקה אל גרבייה,469.0,0,0.0,-2.0,526,גילעד הוזליך,מכבי זאב זכרון יעקב,721.2,3,1.0,2.00,526
65102,290032,ליגת א גברים 2021-2022,2022-02-16,3015,טארק קעדאן,הפועל מתנ''ס בקה אל גרבייה,459.0,3,1.0,6.0,1514,אלכסנדר וולוך,מכבי זאב זכרון יעקב,351.3,0,0.0,-6.00,3015


In [15]:
# games_df = games_df.drop_duplicates('match_id')
# Left disabled on purpose: the duplicated games are removed later, during processing.

In [39]:
# Summarise the matches table: dtype and non-null count of every column.
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65104 entries, 0 to 65103
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   match_id          65104 non-null  object        
 1   match_type        65104 non-null  object        
 2   date              65104 non-null  datetime64[ns]
 3   p1_id             65104 non-null  object        
 4   p1_name           65104 non-null  object        
 5   p1_club           65104 non-null  object        
 6   p1_rank           65104 non-null  float64       
 7   p1_sets           65104 non-null  int64         
 8   p1_home           41681 non-null  float64       
 9   p1_points_gained  65104 non-null  float64       
 10  p2_id             65104 non-null  int64         
 11  p2_name           65104 non-null  object        
 12  p2_club           65104 non-null  object        
 13  p2_rank           65104 non-null  float64       
 14  p2_sets           6510

In [14]:
# Write the matches table to games_data_table.csv in the working directory.
# No index=False is passed, so the DataFrame index is written as well.
games_df.to_csv("games_data_table.csv", sep=",", encoding="utf-8-sig")