In [1]:
#ALL IMPORTS AND FUNCTIONS

import re
import time
from datetime import datetime
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from requests import Session
from requests.exceptions import MissingSchema, InvalidURL, SSLError, RequestException, Timeout

# Request headers passed to every Session.get call in this notebook.
_HEADERS = {'User-Agent': 'Mozilla/5.0'}
# Root of the NCAA site; path fragments are appended to it to build URLs.
base_ncaa_url = 'https://www.ncaa.com'
# Season year kept as a string because it is concatenated into roster URLs.
year = '2024'
prior_year = str(int(year) - 1)  #Prior year

In [60]:
def remove_leading_parts(list_of_urls):
    """Strip everything up to and including the first '.com' from each URL.

    The slash that follows '.com' is kept, so 'https://x.com/a/b' becomes
    '/a/b'. URLs that do not contain '.com/' are returned unchanged.

    Args:
        list_of_urls: iterable of URL strings.

    Returns:
        A new list with one entry per input URL, in the same order.
    """
    def _strip_host(link):
        # partition splits on the FIRST '.com/', matching str.find semantics.
        _, marker, remainder = link.partition('.com/')
        return '/' + remainder if marker else link

    return [_strip_host(link) for link in list_of_urls]

# FROM NCAA WEBSITE
def get_all_school_media_links(num_pages=23):
    """Scrape the NCAA schools index and collect each school's media links.

    Every index page is fetched, each linked school page is opened, and the
    first three links inside the 'school-links' div are recorded as the
    official website, twitter and facebook URLs (in that order).

    Args:
        num_pages: number of pages in the NCAA schools index to walk
            (pages 1..num_pages). Defaults to 23, the value previously
            hard-coded here.

    Returns:
        DataFrame with columns 'school_name', 'official_school_website',
        'twitter' and 'facebook', with duplicate rows removed.
    """
    columns = ['school_name', 'official_school_website', 'twitter', 'facebook']
    records = []

    # A for-loop guarantees the page always advances; the previous while-loop
    # hit `continue` on a 403 without incrementing and retried forever.
    for page in range(1, num_pages + 1):
        ncaa_schools_url = base_ncaa_url + '/schools-index/' + str(page)
        try:
            with Session() as s:
                r = s.get(ncaa_schools_url, headers=_HEADERS, timeout=5)
        except RequestException:
            print('Broken URL:', ncaa_schools_url)
            continue
        if r.status_code == 403:
            print('An error occurred with the GET Request')
            print('403 Error: blocked request')
            continue

        soup = BeautifulSoup(r.text, features='lxml')
        table = soup.find('table', {'class': 'responsive-enabled'})
        if table is None:
            print(f"no schools table found on page {page}")
            continue

        for link in table.find_all('a'):
            url, name = link.get('href'), link.text
            # Rows with an empty name were never stored, so skip them before
            # spending a request; links without an href cannot be followed.
            if not url or name == '':
                continue

            individual_school_url = base_ncaa_url + url
            try:
                with Session() as s:
                    r = s.get(individual_school_url, headers=_HEADERS, timeout=5)
            except RequestException:
                print('Broken URL:', individual_school_url)
                continue
            if r.status_code == 403:
                print('An error occurred with the GET Request')
                print('403 Error: blocked request')
                continue

            school_soup = BeautifulSoup(r.text, features='lxml')
            div = school_soup.find('div', {'class': 'school-links'})
            if div is None:
                continue

            media_urls = [media_link.get('href') for media_link in div.find_all('a')]
            # Pad with None so schools listing fewer than three links still unpack.
            website, twitter, facebook = (media_urls + [None, None, None])[:3]
            records.append({
                'school_name': name,
                'official_school_website': website,
                'twitter': twitter,
                'facebook': facebook,
            })
            print(f"got links for {name}")

        print(f"finished scraping page {page}")

    # Build the frame once instead of concatenating inside the loop.
    media_links_df = pd.DataFrame(records, columns=columns)
    # Drop duplicates after all pages have been processed
    return media_links_df.drop_duplicates()

def get_player_name_from_player_bio(soup):
    """Extract the player's full name from a parsed player bio page.

    Looks for the 'sidearm-roster-player-name' span and reads the dedicated
    first-name / last-name child spans. When one of those is missing, it
    falls back to the first / second child span by position.

    Args:
        soup: parsed bio page exposing BeautifulSoup-style find/find_all.

    Returns:
        'First Last' with surrounding whitespace removed from each part, or
        '' when no name could be found. An empty result is always exactly ''
        so callers can skip the player with `name == ''`.
    """
    try:
        name_span = soup.find('span', {'class': 'sidearm-roster-player-name'})

        first_name_span = name_span.find('span', {'class': 'sidearm-roster-player-first-name'})
        last_name_span = name_span.find('span', {'class': 'sidearm-roster-player-last-name'})

        # Positional fallback is only needed when a dedicated span is absent.
        spans = []
        if first_name_span is None or last_name_span is None:
            spans = name_span.find_all('span')

        if first_name_span is not None:
            first_name = first_name_span.text
        else:
            first_name = spans[0].text if spans else ''

        if last_name_span is not None:
            last_name = last_name_span.text
        else:
            last_name = spans[1].text if len(spans) > 1 else ''
    except AttributeError:
        return ''  # Handle missing name

    # Joining only non-empty parts avoids returning ' ' when both are blank.
    parts = (first_name.strip(), last_name.strip())
    return ' '.join(part for part in parts if part)

def get_jersey_number_from_player_bio(soup):
    """Return the first run of digits found in the jersey-number span.

    Args:
        soup: parsed bio page exposing a BeautifulSoup-style find().

    Returns:
        The digits as a string (e.g. '12'), or '' when the span is missing
        or its text contains no digits.
    """
    try:
        number_span = soup.find('span', {'class': 'sidearm-roster-player-jersey-number'})
        span_text = number_span.text.strip()
    except AttributeError:
        # Either the page or the span is missing.
        return ''

    digits = re.search(r'\d+', span_text)
    return digits.group() if digits else ''

def get_miscellaneous_player_bio_data(soup):
    """Read the label/value pairs from the player bio attribute list.

    Each <dl> inside the 'flex flex-item-1 row flex-wrap' <ul> contributes
    its <dt> text (trailing colons removed) as a header and its <dd> text as
    the matching value.

    Args:
        soup: parsed bio page exposing BeautifulSoup-style find/find_all.

    Returns:
        (headers, data): two parallel lists of strings. Both are empty when
        the list or any expected <dt>/<dd> tag is missing, so the result is
        always safe to pass to zip().
    """
    try:
        ul = soup.find('ul', {'class': 'flex flex-item-1 row flex-wrap'})
        dl_tags = ul.find_all('dl')
        headers = [tag.find('dt').text.rstrip(':') for tag in dl_tags]  # Remove colons
        data = [tag.find('dd').text for tag in dl_tags]
    except AttributeError:
        # Previously `headers` was left unbound on this path, so the return
        # below raised UnboundLocalError instead of reporting "no data".
        return [], []
    return headers, data


def get_social_media_link(soup):
    """Return the href of the first social-media link on a player bio page.

    Args:
        soup: parsed bio page exposing a BeautifulSoup-style find().

    Returns:
        The link's href, or '' when the social container, the link inside
        it, or the link's href attribute is missing.
    """
    try:
        container = soup.find('div', {'class': 'sidearm-roster-player-social flex flex-wrap'})
        if container is None:
            return ''  # Handle missing social media div

        anchor = container.find('a', {'class': 'sidearm-roster-player-social-link'})
        if anchor is None:
            return ''  # Handle missing social media a tag

        return anchor['href']
    except KeyError:
        return ''  # Handle missing href attribute

def get_roster_soup(base_url, year, prior_year, _HEADERS):
    """Fetch a school's baseball roster page by trying known URL layouts.

    The base URL is upgraded from http to https and a single trailing slash
    is removed. Each candidate roster path is then requested in order, and
    the first response that was not redirected elsewhere and returned a 2xx
    status is parsed and returned.

    Args:
        base_url: the school's athletics site root, e.g. 'http://gozips.com'.
        year: season year as a string, e.g. '2024'.
        prior_year: the year before `year`, as a string.
        _HEADERS: dict of HTTP headers sent with each request.

    Returns:
        A BeautifulSoup object for the roster page, or None when no
        candidate URL produced a usable page.
    """
    if base_url.startswith("http:"):
        base_url = base_url.replace("http:", "https:", 1)
    if base_url.endswith("/"):
        base_url = base_url[:-1]

    # One list of paths serves both to build the request URL and to verify
    # that the final URL still ends with the path that was asked for.
    # NOTE(review): the last layout is built as '<prior_year>-<year>'; confirm
    # the target sites do not expect a two-digit second year instead.
    roster_paths = [
        '/sports/baseball/roster/' + year,
        '/baseball/roster/' + year,
        '/sports/baseball/roster/',
        '/baseball/roster/',
        '/sport/m-basebl/roster/',
        '/sports/bsb/' + prior_year + '-' + year + '/roster',
    ]

    for check_url in roster_paths:
        url = base_url + check_url
        try:
            with Session() as s:
                r = s.get(url, headers=_HEADERS, timeout=5)
        except RequestException:
            # MissingSchema, InvalidURL, SSLError and Timeout are all
            # subclasses of RequestException, so one clause covers them.
            print('Broken URL:', url)
            continue

        # If final URL does not end with check_url, a redirect occurred
        if not r.url.endswith((check_url, check_url + '/')):
            print('Redirect occurred for:', url, 'Didnt match:', r.url)
            continue
        if r.status_code == 404:
            print('404 Error')
            continue
        if r.status_code == 403:
            print('An error occurred with the GET Request')
            print('403 Error: blocked request')
            continue
        if r.status_code // 100 == 3:  # If status code is in 300s, it's a redirect
            print('Redirect occurred with the GET Request for URL:', url)
            continue
        if r.status_code // 100 != 2:
            # Other failures (e.g. 5xx) used to be parsed and returned as if
            # the error page were the roster.
            print('Unexpected status', r.status_code, 'for URL:', url)
            continue

        soup = BeautifulSoup(r.text, 'lxml')
        print(url)
        return soup
    return None


def get_player_bio_links(soup):
    """Collect the distinct roster-related link paths found on a page.

    Every <a> whose href contains '/roster/' is kept, passed through
    remove_leading_parts, and de-duplicated.

    Args:
        soup: parsed roster page exposing a BeautifulSoup-style find_all().

    Returns:
        List of unique link strings; order is not guaranteed because the
        de-duplication goes through a set.
    """
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    roster_hrefs = [href for href in hrefs if href and '/roster/' in href]
    return list(set(remove_leading_parts(roster_hrefs)))

def get_player_bio_soup(div, base_url, _HEADERS):
    """Fetch and parse the player bio page linked from a roster card.

    Args:
        div: roster card element; its first <a> is taken as the bio link.
        base_url: the school's site root, used to resolve relative hrefs.
        _HEADERS: dict of HTTP headers sent with the request.

    Returns:
        A BeautifulSoup object for the bio page, or None when the card has
        no usable link, the request fails, or the server answers 403.
    """
    a = div.find('a')
    if a is None:
        return None
    href = a.get('href')
    if not href:
        # An anchor without an href used to raise KeyError here.
        return None

    # urljoin handles root-relative, relative and absolute hrefs, and avoids
    # the doubled slash that plain concatenation gives for a base ending in '/'.
    player_bio_url = urljoin(base_url, href)
    try:
        with Session() as s:
            r = s.get(player_bio_url, headers=_HEADERS, timeout=5)
    except RequestException:
        # Covers MissingSchema, InvalidURL, SSLError and Timeout as well;
        # they all derive from RequestException.
        print('broken URL:', player_bio_url)
        return None

    if r.status_code == 403:
        print('An error occurred with the GET Request')
        print('403 Error: blocked request')
        return None
    return BeautifulSoup(r.text, features='lxml')


#FUTURE WORK CAN REPLACE THIS WITH A FUNCTION
# with Session() as s:
#         r = s.get(player_bio_url, headers=_HEADERS)
#     if r.status_code == 403:
#         print('An error occurred with the GET Request')
#         print('403 Error: blocked request')
#         return None

#     soup = BeautifulSoup(r.text, features='lxml')

In [61]:
##START WITH DATAFRAME OF ALL SCHOOL SPORT WEBSITES AND SCRAPE PLAYER BIO DATA FOR ALL PLAYERS ON THAT ROSTER

# Start the timer
start_time = time.time()

#This comes from the script Get All D1 School Website URL's
# NOTE(review): the export cell further down writes the skipped-roster list
# back to this same path; confirm the file still carries the 'school' and
# 'school_website_url' columns before re-running this cell.
official_websites = pd.read_csv('test_data/all_skipped_rosters_3_13.csv')
official_websites = official_websites.dropna(subset=['school_website_url'])
# test_websites = official_websites[official_websites['school_website_url']=='http://appstatesports.com']
# print(test_websites)

player_records = []  # one dict per player; turned into a DataFrame once at the end
skipped_rosters = []
skipped_players = []

#Loop through all schools on NCAA website and try a url to retrieve their roster
# Pair each URL with its own school name. Looking the name up with a positional
# counter breaks once dropna has removed rows and left gaps in the index labels.
# No None check is needed on base_url: dropna above already removed missing URLs.
for school, base_url in zip(official_websites['school'], official_websites['school_website_url']):

    print(f"base_url: {base_url}")
    roster_soup = get_roster_soup(base_url, year, prior_year, _HEADERS)
    if roster_soup is None:
        print('skipped roster b/c roster page had no soup')
        skipped_rosters.append(base_url)
        continue
    all_player_bio_links = get_player_bio_links(roster_soup)
    print(all_player_bio_links)

    # TODO: rosters without 'sidearm-roster-player-image' divs yield no players here.
    # Need to do some sort of coalesce with
    # s-person-details__personal-single-line s-text-paragraph-bold flex items-center gap-2

    visited_bio_urls = set()

    for div in roster_soup.find_all('div', {'class': 'sidearm-roster-player-image'}):

        # get_player_bio_soup returns only the soup, so the bio URL needed for
        # logging and de-duplication is resolved here from the same anchor.
        anchor = div.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            print('skipped player b/c roster card had no bio link')
            continue
        player_bio_url = urljoin(base_url, href)

        # Checked before fetching so coach/staff pages are never requested.
        if player_bio_url in visited_bio_urls or 'roster/coaches/' in player_bio_url or '/roster/staff/' in player_bio_url:
            print('on to next roster')
            break

        player_bio_soup = get_player_bio_soup(div, base_url, _HEADERS)
        if player_bio_soup is None:
            print('skipped player b/c bio_soup returned none')
            skipped_players.append(player_bio_url)
            continue

        print(f" Retrieving roster information from: {player_bio_url}")
        visited_bio_urls.add(player_bio_url)

        name = get_player_name_from_player_bio(player_bio_soup)
        if name == '':
            print('skipped player')
            skipped_players.append(player_bio_url)
            continue
        jersey_number = get_jersey_number_from_player_bio(player_bio_soup)
        social_media_link = get_social_media_link(player_bio_soup)
        headers, data = get_miscellaneous_player_bio_data(player_bio_soup)

        player_data = {'Team': school, 'Name': name, 'Jersey Number': jersey_number, 'Social Media': social_media_link}
        player_data.update(dict(zip(headers, data)))
        player_records.append(player_data)

# Building the frame once avoids the quadratic cost of concat inside the loop;
# columns are the union of all keys in order of first appearance.
player_bio_df = pd.DataFrame(player_records)

end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")


#MAKE SURE ALL SITES ARE SCRAPED. IF NOT SCRAPED FIND OUT WHY
#NEED TO SEE IF USING LIST METHOD GET_PLAYER_BIO_LINKS IS MORE SUITABLE THAN roster_soup.find_all('div', {'class': 'sidearm-roster-player-image'})
#MIGHT BE A COMBO OF THE TWO
    #Add check to ensure all rosters were successfully scraped
#IF ROSTER IS VALID BUT FOR SOME REASON SCRAPING DOESN'T WORK ADD TO A LIST TO KEEP REPEATING THROUGH UNTIL LIST COMPLETE
#NEED TO GRAB NAME & JERSEY NUMBER FROM NCAA.STATS TEAM PAGE TO MATCH WITH BOX SCORES


#CONSIDER SCRAPING DIRECTLY FROM ROSTER INSTEAD OF GOING TO INDIVIDUAL PLAYER BIOS
#CONSIDER MAKING THIS A GENERATOR FUNCTION EVENTUALLY
#NEED TO ADD TRANSFORMATION TO TIE TOGETHER ALL COLUMNS THAT ARE ACTUALLY THE SAME. (PARSE HT/WT TO SEPARATE COLUMNS, DELETE UNNECESSARY COLUMNS, ADD URLS TO SOCIAL MEDIA SITES)
#NEED TO CREATE PROPER KEY TO JOIN TO OTHER TABLES (REMOVE NICKNAMES LAWRENCE "Q" NOBLE) AND TURN INTO NOBLE, LAWRENCE
#IF JOIN BETWEEN NAMES IS NOT WORKING, CAN TRY TO GRAB JERSEY NUMBER FROM NCAA STATS AND JOIN ON THAT TO ROSTER+TEAM (MAY CREATE NEW TABLE OF ALL PLAYERS, JERSEY NUMBER AND WEBSITE NAME)
#REORGANIZE AND TURN INTO DIFFERENT FILES AND DEFINITIONS
#join box score stats to ncaa roster based on name and jersey number
    #ALTERNATIVELY CAN GRAB TEAM ID FROM NCAA OR TRY TO GET CORRECT JERSEY NUMBER FROM NCAA STATS

base_url: http://gozips.com
https://gozips.com/sports/baseball/roster/2024
['/sports/baseball/roster/dawson-tourney/8900', '/sports/baseball/roster/emmett-gillies/8762', '/sports/baseball/roster/james-strom/8907', '/sports/baseball/roster/ben-brombaugh/8901', '/sports/baseball/roster/spencer-atkins/8749', '/sports/baseball/roster/nicky-sackett/8903', '/sports/baseball/roster/michael-sprockett/8778', '/sports/baseball/roster/josiah-ross/8914', '/sports/baseball/roster/andrew-horvath/8764', '/sports/baseball/roster/michael-orlowski/8917', '/sports/baseball/roster/evan-bottone/8905', '/sports/baseball/roster/charlie-schebler/8896', '/sports/baseball/roster/jared-schaeffer/8918', '/sports/baseball/roster/yassir-kahook/8767', '/sports/baseball/roster/ryan-brown/8909', '/sports/baseball/roster/jack-poist/8773', '/sports/baseball/roster/anthony-fett/8759', '/sports/baseball/roster/reece-sutphin/8904', '/sports/baseball/roster/nick-kemper/8912', '/sports/baseball/roster/brett-dietrich/8757', '

In [62]:

# Turn the two skip lists collected during scraping into DataFrames.
skipped_rosters_df, skipped_players_df = (
    pd.DataFrame(entries) for entries in (skipped_rosters, skipped_players)
)


In [63]:
# Write the scraped bios and both skip lists to CSV, omitting the index column.
# NOTE(review): the skipped-rosters path is the same file the scraping cell
# reads as its input, so this export overwrites that input - confirm intended.
for frame, csv_path in (
    (player_bio_df, "test_data/all_player_bios_3_13.csv"),
    (skipped_rosters_df, 'test_data/all_skipped_rosters_3_13.csv'),
    (skipped_players_df, 'test_data/all_skipped_players_3_13.csv'),
):
    frame.to_csv(csv_path, index=False)

In [67]:
##SCRAPE SINGLE ROSTER FOR ALL PLAYER BIO LINKS AND DOWNLOAD PLAYER BIO DATA INTO DATAFRAME

school_url = 'https://goheels.com/'

# get_roster_soup strips the trailing slash and tries the known roster paths.
# The hand-built URL used here before came out as '...//sports/baseball/roster2024'.
soup = get_roster_soup(school_url, year, prior_year, _HEADERS)
if soup is None:
    print('skipped roster b/c roster page had no soup')
    roster_divs = []
else:
    roster_divs = soup.find_all('div', {'class': 'sidearm-roster-player-image'})

player_urls = []
single_roster_records = []  # one dict per player; converted to a DataFrame once

for div in roster_divs:
    a = div.find('a')
    href = a.get('href') if a is not None else None
    if not href:
        continue
    player_bio_url = urljoin(school_url, href)
    print(f" Retrieving roster information from: {player_bio_url}")
    player_urls.append(player_bio_url)

    # Fetch the player's own bio page; the loop previously re-requested the
    # roster page on every iteration and parsed that instead.
    player_bio_soup = get_player_bio_soup(div, school_url.rstrip('/'), _HEADERS)
    if player_bio_soup is None:
        continue

    #START RETRIEVING DATA FROM PLAYER BIO WEBPAGE
    # The shared helpers replace the copy-pasted parsing code. The jersey
    # number is therefore digits only, as in the multi-school scrape.
    name = get_player_name_from_player_bio(player_bio_soup)
    jersey_number = get_jersey_number_from_player_bio(player_bio_soup)
    social_media_link = get_social_media_link(player_bio_soup)
    headers, data = get_miscellaneous_player_bio_data(player_bio_soup)

    # Create a dictionary with the player's data
    player_data = {'Name': name, 'Jersey Number': jersey_number, 'Social Media': social_media_link}
    player_data.update(dict(zip(headers, data)))
    single_roster_records.append(player_data)

# Columns are the union of every player's keys, in order of first appearance.
df = pd.DataFrame(single_roster_records)
print(df)
    # break #Temporarily only loop once to get all types of fields and different html structures


# # print(player_urls)

In [None]:
#SCRAPE SINGLE PLAYER BIO PAGE

from bs4 import BeautifulSoup
import pandas as pd
from requests import Session

# Your player bio URL
player_bio_url = 'https://clariongoldeneagles.com/sports/baseball/roster/kasey-shughart/9657'

# Column names that should always be present in the result, in this order
all_possible_columns = ['Name', 'Jersey Number', 'Social Media', 'Class', 'Position', 'Bat/Throw', 'B/T','Ht./Wt.', 'Hometown', 'High School','Major', 'Height','Weight','Year']

# Send a GET request to the player bio URL (timeout matches the other requests)
with Session() as s:
    r = s.get(player_bio_url, headers=_HEADERS, timeout=5)

# Check the status code of the response
if r.status_code == 403:
    print('An error occurred with the GET Request')
    print('403 Error: blocked request')

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(r.text, 'lxml')

#START RETRIEVING DATA FROM PLAYER BIO WEBPAGE
# The shared helpers replace the copy-pasted parsing code. The jersey number is
# therefore digits only, as in the multi-school scrape.
name = get_player_name_from_player_bio(soup)
jersey_number = get_jersey_number_from_player_bio(soup)
social_media_link = get_social_media_link(soup)

# Get player attributes. Both lists start empty so a page without the
# attribute list can no longer leave `headers` undefined (or holding a value
# left over from an earlier cell).
headers, data = [], []
try:
    ul = soup.find('ul', {'class': 'flex flex-item-1 row flex-wrap'})
    dl_tags = ul.find_all('dl')
    headers = [tag.find('dt').text.rstrip(':') for tag in dl_tags]  # Remove colons
    data = [tag.find('dd').text for tag in dl_tags]
except AttributeError:
    headers, data = [], []  # Handle missing player bio info

# Create a dictionary with the player's data
player_data = {'Name': name, 'Jersey Number': jersey_number, 'Social Media': social_media_link}
player_data.update(dict(zip(headers, data)))

# Known columns first, then any extra labels this page exposed. Columns the
# page lacks are filled with NaN. This gives the same layout as concatenating
# onto an empty pre-columned frame.
ordered_columns = all_possible_columns + [col for col in player_data if col not in all_possible_columns]
# NOTE(review): this rebinds player_bio_df, replacing the multi-school frame
# built by the earlier scraping cell - confirm that is intended.
player_bio_df = pd.DataFrame([player_data], columns=ordered_columns)

# print(df.columns)
print(player_bio_df)
## NEED TO FIGURE OUT HOW TO COMBINE ROSTER STATS FROM TABLES THAT DON'T HAVE IDENTICAL COLUMNS BUT WILL HAVE SOME OVERLAPPING COLUMNS


In [None]:
#SCRAPE INDIVIDUAL PLAYER BIO LINK/PAGE

In [None]:
# Show the 'Name' column of whichever frame is currently bound to player_bio_df.
print(player_bio_df['Name'])

In [None]:
# Save the school/website table to CSV, leaving out the DataFrame index.
official_websites.to_csv(
    path_or_buf="data/all_schools_and_media_links_w_roster_urls.csv",
    index=False,
)

In [None]:
# Print the school/website table that was loaded from CSV earlier in the notebook.
print(official_websites)
