In [None]:
# Install the notebook's third-party dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel,
# not into whichever `pip` executable happens to be first on PATH.
%pip install requests
%pip install pandas
%pip install beautifulsoup4

In [10]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup

In [7]:
def _split_assets(words):
    """Split a run of words into the individual assets it lists.

    The words are re-joined with single spaces and split on commas, periods
    and the literal " and ". Empty pieces are dropped and a trailing
    "and cash" is removed from each remaining piece.
    """
    return [
        re.sub(r'\s*and cash$', '', item.strip())
        for item in re.split(r'[,.]+| and ', ' '.join(words))
        if item.strip()
    ]


def _pitcher_records(assets, date, from_team, to_team):
    """Return one record dict for every asset whose text contains "HP"."""
    records = []
    for asset in assets:
        if "HP" not in asset:
            continue
        parts = asset.split()
        records.append({
            "date": date,
            "team_traded_from": from_team,
            "player_name": ' '.join(parts[1:]),  # everything after the position token
            "player_pos": parts[0],              # first token, e.g. "RHP"
            "team_traded_to": to_team,
        })
    return records


def get_transactions(transactions, dataframe_to_append):
    """Extract traded pitchers from transaction cells and append them to a dataframe.

    Parameters
    ----------
    transactions : iterable
        Objects exposing ``get_text(strip=False)`` that returns the transaction
        text (in this notebook, the ``<td>`` tags found by BeautifulSoup).
    dataframe_to_append : pandas.DataFrame
        Existing rows. It is never modified; a new dataframe is returned.

    Returns
    -------
    pandas.DataFrame
        ``dataframe_to_append`` followed by one row per pitcher found, with
        exact duplicate rows removed. New rows carry the columns ``date``,
        ``team_traded_from``, ``player_name``, ``player_pos`` and
        ``team_traded_to``.

    Notes
    -----
    A transaction is considered only if its text contains both "traded" and
    "HP". The text is split on ';' and each piece is parsed as
    ``<date> <team 1> traded <assets> to <team 2> for <assets>``: the first
    word is taken as the date and the words between it and "traded" as team 1.
    Pieces lacking any of the words "traded", "to" or "for" are skipped.
    NOTE(review): this assumes every ';'-separated piece starts with a date
    token -- confirm against real multi-team trade text.
    """
    records = []

    for transaction in transactions:
        text = transaction.get_text(strip=False)

        # Keep the transaction only if it is a trade mentioning at least one pitcher
        if "traded" not in text or "HP" not in text:
            continue

        # Multi-team trades are written as ';'-separated blocks
        for block in text.split(';'):
            # Skip blocks that mention no pitcher (this also covers empty blocks)
            if 'HP' not in block:
                continue

            # Drop the boilerplate phrase before tokenising so that its "to"/"and"
            # words cannot be mistaken for sentence structure
            block = re.sub(
                r'and a player to be named later or cash considerations',
                '',
                block,
                flags=re.IGNORECASE,
            ).strip()
            words = block.split()

            # Locate the keywords that delimit the sentence parts
            try:
                for_index = words.index('for')
                to_index = words.index('to')
                traded_index = words.index('traded')
            except ValueError:
                # Block does not follow the expected format
                continue

            date = words[0]
            team_1 = ' '.join(words[1:traded_index])
            team_2 = ' '.join(words[to_index + 1:for_index])

            sent_by_team_1 = _split_assets(words[traded_index + 1:to_index])
            sent_by_team_2 = _split_assets(words[for_index + 1:])

            records.extend(_pitcher_records(sent_by_team_1, date, team_1, team_2))
            records.extend(_pitcher_records(sent_by_team_2, date, team_2, team_1))

    if not records:
        # Nothing new: still de-duplicate, but on a copy so the caller's
        # dataframe is left untouched
        return dataframe_to_append.drop_duplicates()

    # Single concat instead of one concat per row (which is quadratic)
    combined = pd.concat(
        [dataframe_to_append, pd.DataFrame(records)], ignore_index=True
    )
    return combined.drop_duplicates()


In [79]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup

import os

# Empty dataframe with the expected schema; accumulates one day's transactions
df = pd.DataFrame(columns=["date", "team_traded_from", "player_name", "player_pos", "team_traded_to"])

# Dates to scrape (both ends inclusive) -- currently the single day 2021-07-12
date_range = pd.date_range(start="2021-07-12", end="2021-07-12")

# Loop through all dates in the date range
for date_og in date_range:
    # Format the date the way the URL expects it
    date = date_og.strftime("%Y/%m/%d")
    # Result pages are numbered starting from 1
    page_num = 1

    while True:
        # URL of the transactions page
        url = f"https://www.mlb.com/transactions/{date}/p-{page_num}"

        # Fetch the page content. The timeout makes a stalled connection raise
        # instead of hanging the cell forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise error for failed requests

        # Parse the HTML of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the div with id "transactions" to check if there are transactions
        transactions_div = soup.find("div", {"id": "transactions"})

        if transactions_div and 'isMlb' in transactions_div.get('class', []):
            # Every table cell is handed to the parser, which filters for trades
            transactions = transactions_div.find_all("td")

            # Process the transactions
            df = get_transactions(transactions, df)

            # Move to the next page
            page_num += 1

        else:
            break  # No valid transactions div: there are no further pages

    # Print the dataframe
    print(df)

    # If dataframe is not empty, export to CSV
    if not df.empty:
        # to_csv does not create missing directories, so make sure it exists
        os.makedirs("transactions", exist_ok=True)
        date_str = date_og.strftime("%Y_%m_%d")
        df.to_csv(f"transactions/transactions_{date_str}.csv", index=False)

    # Clear dataframe for the next date
    df = pd.DataFrame(columns=["date", "team_traded_from", "player_name", "player_pos", "team_traded_to"])

Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []


In [81]:
import requests
import pandas as pd
import re
import time
from bs4 import BeautifulSoup

import os

# NOTE: get_transactions is intentionally NOT redefined in this cell. A
# placeholder that simply returned `df` used to live here; it shadowed the real
# parser defined in the earlier cell, so no transaction could ever be collected.
# This cell relies on that earlier definition having been run.

# Create dataframe to store transactions
df = pd.DataFrame(columns=["date", "team_traded_from", "player_name", "player_pos", "team_traded_to"])

# Dates to scrape (both ends inclusive) -- currently 2021-07-01 through 2021-07-09
date_range = pd.date_range(start="2021-07-01", end="2021-07-09")

# Loop through all dates in date range
for date_og in date_range:
    # Monotonic clock: elapsed-time checks are unaffected by system clock changes
    start_time = time.monotonic()
    date = date_og.strftime("%Y/%m/%d")  # Format date for URL
    page_num = 1

    while True:
        # Give up on this date once its 5-second budget is used up. Pages already
        # parsed for the date are kept, so the saved CSV may be partial.
        if time.monotonic() - start_time > 5:
            print(f"Skipping {date_og} due to timeout.")
            break  # Exit to move to the next date

        # URL of the transactions page
        url = f"https://www.mlb.com/transactions/{date}/p-{page_num}"

        try:
            response = requests.get(url, timeout=5)  # Set request timeout
            response.raise_for_status()  # Raise error for failed requests
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            break  # Skip to the next date if request fails

        # Parse the HTML of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the div with id "transactions" to check if there are transactions
        transactions_div = soup.find("div", {"id": "transactions"})

        if transactions_div and 'isMlb' in transactions_div.get('class', []):
            # Extract transaction text if transactions are present
            transactions = transactions_div.find_all("td")

            # Process the transactions
            df = get_transactions(transactions, df)

            # Move to the next page
            page_num += 1
        else:
            break  # Break the loop if no valid transactions div is found

    # Print the dataframe
    print(df)

    # If dataframe is not empty, export to CSV
    if not df.empty:
        # to_csv does not create missing directories, so make sure it exists
        os.makedirs("transactions", exist_ok=True)
        date_str = date_og.strftime("%Y_%m_%d")
        df.to_csv(f"transactions/transactions_{date_str}.csv", index=False)

    # Clear dataframe for the next date
    df = pd.DataFrame(columns=["date", "team_traded_from", "player_name", "player_pos", "team_traded_to"])


Skipping 2021-07-01 00:00:00 due to timeout.
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Skipping 2021-07-04 00:00:00 due to timeout.
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Skipping 2021-07-05 00:00:00 due to timeout.
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Skipping 2021-07-08 00:00:00 due to timeout.
Empty DataFrame
Columns: [date, team_traded_from, player_name, player_pos, team_traded_to]
Index: []
Empty DataFr