In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm.notebook import tqdm
import warnings
import os
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import re
import csv
from datetime import datetime
import glob
from xml.etree.ElementTree import fromstring 
import shutil

# Scrape DWCS Event URLs

In [2]:
import os, shutil
if os.path.exists('data-dwcs'):
    shutil.rmtree('data-dwcs')
os.makedirs('data-dwcs')

In [3]:
### Scrape DWCS Event URL's

# Initialize an empty list to store event URLs
event_urls = []

# The URL of the webpage containing DWCS events
url = "https://www.sherdog.com/organizations/Dana-Whites-Contender-Series-12411"

# Set User-Agent in headers
headers = {'User-Agent': 'Mozilla/5.0'}

# Fetch the webpage content
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Find the section containing the events
event_section = soup.find('div', {'id': 'events_list'})

if event_section:
    # Loop through each event and extract the URL
    for event in event_section.find_all('a', {'itemprop': 'url'}):
        event_url = event.get('href')
        full_url = f"https://www.sherdog.com{event_url}"
        event_urls.append(full_url)
    
    # Save the event URLs to a CSV file
    df = pd.DataFrame({'Event_URLs': event_urls})
    df.to_csv('./data-dwcs/DWCS_Event_URLs.csv', index=False)
    print("Scraped event URLs have been saved to 'DWCS_Event_URLs.csv'")
else:
    print("Could not find the event section. The webpage structure might have changed.")


Scraped event URLs have been saved to 'DWCS_Event_URLs.csv'


In [4]:
### Move Upcoming Bouts to a Different CSV file

# Read the CSV file
df = pd.read_csv('./data-dwcs/DWCS_Event_URLs.csv')

# List of URLs to delete
upcoming_event_urls = [
    # "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-2-102756",
    # "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-3-102757",
    # "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-4-102758",
    "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-5-102759",
    "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-6-102760",
    "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-7-102761",
    "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-8-102762",
    "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-9-102763",
    "https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-10-102764"
]

# Create a dataframe for the rows to delete
rows_to_delete = df[df['Event_URLs'].isin(upcoming_event_urls)]

# Create a dataframe for the remaining rows
remaining_rows = df[~df['Event_URLs'].isin(upcoming_event_urls)]

# Save the deleted rows to a new CSV
rows_to_delete.to_csv('./data-dwcs/DWCS_Upcoming_Event_URLs.csv', index=False)

# Save the remaining rows back to the original CSV
remaining_rows.to_csv('./data-dwcs/DWCS_Event_URLs.csv', index=False)

print("Deleted rows saved and modified CSV with remaining rows saved.")

Deleted rows saved and modified CSV with remaining rows saved.


# Scrape Past Event Data

In [5]:
### Main Event Data Loop

csv_path_fighter_ids = './data-dwcs/DWCS_Fighter_ID_Sherdog.csv'

# Set up headers to use in the request to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Read URLs from the CSV file
url_df = pd.read_csv('./data-dwcs/DWCS_Event_URLs.csv')
url_list = url_df['Event_URLs']

# Initialize tqdm with the total number of URLs
pbar = tqdm(total=len(url_list), desc="Scraping Progress")

# Iterate through the URLs and scrape data
for url_to_scrape in url_list:
    # Make the request
    response = requests.get(url_to_scrape, headers=headers)

    if response.status_code != 200:
        print(f"Failed to get the webpage. Status code: {response.status_code}")
        continue

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Your existing scraping code...
    event_name = soup.find('span', itemprop='name').text.strip()
    event_location = soup.find('span', itemprop='location').text.strip()
    event_date = soup.find('meta', itemprop='startDate').next_sibling.strip()

    fighter1_class = soup.find('div', class_='fighter left_side')
    fighter2_class = soup.find('div', class_='fighter right_side')
    fighter1 = fighter1_class.find('span', itemprop='name').text.strip()
    fighter2 = fighter2_class.find('span', itemprop='name').text.strip()

    weight_class = soup.find('span', class_='weight_class').text.strip()
    winning_fighter = fighter1

    winning_method_em = soup.find('em', string='Method').parent
    winning_method = winning_method_em.contents[2].strip()

    winning_round_em = soup.find('em', string='Round').parent
    winning_round = winning_round_em.contents[2].strip()

    winning_time_em = soup.find('em', string='Time').parent
    winning_time = winning_time_em.contents[2].strip()

    referee_em = soup.find('em', string='Referee').parent
    referee = referee_em.find('a').text.strip()
    # End of your existing scraping code
    
    ### Fighter IDs ###
    fighters = soup.find_all('div', class_='fighter')
    fighter1_url = fighters[0].find('a', itemprop='url')['href']
    fighter2_url = fighters[1].find('a', itemprop='url')['href']
    fighter1_id = (str(fighter1_url)).split('-')[-1]
    fighter2_id = (str(fighter2_url)).split('-')[-1]

    # Append Fighter IDs to existing CSV
    new_row_for_df2 = pd.DataFrame([
        {'Fighter': fighter1, 'Fighter_ID': fighter1_id},
        {'Fighter': fighter2, 'Fighter_ID': fighter2_id}
    ])
    
    if not os.path.exists(csv_path_fighter_ids):
        new_row_for_df2.to_csv(csv_path_fighter_ids, mode='w', index=False, header=True)
    else:
        new_row_for_df2.to_csv(csv_path_fighter_ids, mode='a', index=False, header=False)
#    new_row_for_df2.to_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv', mode='a', header=False, index=False)
    ### End Fighter IDs

    # Store the data
    data_to_append = {
        'Event Name': event_name,
        'Event Location': event_location,
        'Event Date': event_date,
        'Fighter 1': fighter1,
        'Fighter 2': fighter2,
        'Weight Class': weight_class,
        'Winning Fighter': winning_fighter,
        'Winning Method': winning_method,
        'Winning Round': winning_round,
        'Winning Time': winning_time,
        'Referee': referee
    }

    # Save Data
    try:
        df = pd.read_csv('./data-dwcs/DWCS_Event_Data_Sherdog.csv')
    except FileNotFoundError:
        df = pd.DataFrame()

    df = pd.concat([df, pd.DataFrame([data_to_append])])
    df.to_csv('./data-dwcs/DWCS_Event_Data_Sherdog.csv', index=False)
    
    # Update the progress bar
    pbar.update(1)

# Close the progress bar
pbar.close()

print("Scraping completed and data saved.")

Scraping Progress:   0%|          | 0/73 [00:00<?, ?it/s]

Scraping completed and data saved.


In [6]:
### Other Bout Data Loop

# Read URLs from the CSV file
url_df = pd.read_csv('./data-dwcs/DWCS_Event_URLs.csv')
url_list = url_df['Event_URLs']

# Initialize tqdm with the total number of URLs
pbar = tqdm(total=len(url_list), desc="Scraping Progress")

# Iterate through the URLs and scrape data
for url_to_scrape in url_list:
    # Make the request
    response = requests.get(url_to_scrape, headers=headers)

    if response.status_code != 200:
        print(f"Failed to get the webpage. Status code: {response.status_code}")
        continue

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Scrape the Other Bout Data
    rows = soup.find_all('tr', itemprop='subEvent')
    for row in rows:
        fighters = row.find_all('div', class_='fighter_list')
        fighter1 = fighters[0].find('img')['title']
        fighter2 = fighters[1].find('img')['title']

        weight_class = soup.find('span', class_='weight_class').text.strip()

        winning_method = row.find('td', class_='winby').find('b').get_text(strip=True)

        winning_round = row.find_all('td')[-2].get_text(strip=True)
        winning_time = row.find_all('td')[-1].get_text(strip=True)

        referee = row.find('td', class_='winby').find('a').get_text(strip=True)

        ### Fighter IDs ###
        fighter1_url = fighters[0].find('a', itemprop='url')['href']
        fighter2_url = fighters[1].find('a', itemprop='url')['href']
        fighter1_id = (str(fighter1_url)).split('-')[-1]
        fighter2_id = (str(fighter2_url)).split('-')[-1]

        # Append Fighter IDs to existing CSV
        new_row_for_df2 = pd.DataFrame([
            {'Fighter': fighter1, 'Fighter_ID': fighter1_id},
            {'Fighter': fighter2, 'Fighter_ID': fighter2_id}
        ])
        new_row_for_df2.to_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv', mode='a', header=False, index=False)
        ### End Fighter IDs

        # Store the Data for Other Bouts
        data_to_append = {
            'Event Name': event_name,  # Replace with actual data
            'Event Location': event_location,  # Replace with actual data
            'Event Date': event_date,  # Replace with actual data
            'Fighter 1': fighter1,
            'Fighter 2': fighter2,
            'Weight Class': weight_class,
            'Winning Fighter': fighter1,  # You might need to adjust this based on the winning fighter
            'Winning Method': winning_method,
            'Winning Round': winning_round,
            'Winning Time': winning_time,
            'Referee': referee
        }
        
        # Append the new row directly to the existing CSV
        new_row = pd.DataFrame([data_to_append])
        new_row.to_csv('./data-dwcs/DWCS_Event_Data_Sherdog.csv', mode='a', header=False, index=False)
        
    print("Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv")
    
    # Update the progress bar
    pbar.update(1)

# Close the progress bar
pbar.close()

print("Scraping completed and data saved.")

Scraping Progress:   0%|          | 0/73 [00:00<?, ?it/s]

Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherdog.csv
Other Bout Data Saved to DWCS_Event_Data_Sherd

# Scrape Upcoming Event Data

In [7]:
### Upcoming Main Event Data

# Set up headers to use in the request to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Read the CSV file with Event URLs, skipping the header row
url_df = pd.read_csv('./data-dwcs/DWCS_Upcoming_Event_URLs.csv')  # Replace with the actual path
event_url = url_df.iloc[0, 0]
print(f"Processing URL: {event_url}")  # Print the URL you're processing

if not url_df.empty:
    # Get the URL from the first row
    event_url = url_df.iloc[0, 0]

    # Make the request
    response = requests.get(event_url, headers=headers)

    # Check the status code of the response
    if response.status_code != 200:
        print(f"Failed to get the webpage. Status code: {response.status_code}")
    else:
        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        event_name = soup.find('span', itemprop='name').text.strip()
        event_location = soup.find('span', itemprop='location').text.strip()
        event_date = soup.find('meta', itemprop='startDate').next_sibling.strip()

        fighter1_class = soup.find('div', class_='fighter left_side')
        fighter2_class = soup.find('div', class_='fighter right_side')
        fighter1 = fighter1_class.find('span', itemprop='name').text.strip()
        fighter2 = fighter2_class.find('span', itemprop='name').text.strip()

        weight_class = soup.find('span', class_='weight_class').text.strip()
        
#         winning_fighter = fighter1
#         winning_method_em = soup.find('em', string='Method').parent
#         winning_method = winning_method_em.contents[2].strip()
#         winning_round_em = soup.find('em', string='Round').parent
#         winning_round = winning_round_em.contents[2].strip()
#         winning_time_em = soup.find('em', string='Time').parent
#         winning_time = winning_time_em.contents[2].strip()

#         referee_em = soup.find('em', string='Referee').parent
#         referee = referee_em.find('a').text.strip()
        
        ### Fighter IDs ###
        fighters = soup.find_all('div', class_='fighter')
        fighter1_url = fighters[0].find('a', itemprop='url')['href']
        fighter2_url = fighters[1].find('a', itemprop='url')['href']
        fighter1_id = (str(fighter1_url)).split('-')[-1]
        fighter2_id = (str(fighter2_url)).split('-')[-1]

        # Append Fighter IDs to existing CSV
        new_row_for_df2 = pd.DataFrame([
            {'Fighter': fighter1, 'Fighter_ID': fighter1_id},
            {'Fighter': fighter2, 'Fighter_ID': fighter2_id}
        ])
        new_row_for_df2.to_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv', mode='a', header=False, index=False)
        ### End Fighter IDs

        # Store the data
        data_to_append = {
            'Event Name': event_name,
            'Event Location': event_location,
            'Event Date': event_date,
            'Fighter 1': fighter1,
            'Fighter 2': fighter2,
            'Weight Class': weight_class,
#             'Winning Fighter': winning_fighter,
#             'Winning Method': winning_method,
#             'Winning Round': winning_round,
#             'Winning Time': winning_time,
#             'Referee': referee
        }

        # Save Data
        try:
            df = pd.read_csv('./data-dwcs/DWCS_Upcoming_Event_Data_Sherdog.csv')
        except FileNotFoundError:
            df = pd.DataFrame()

        df = pd.concat([df, pd.DataFrame([data_to_append])])
        df.to_csv('./data-dwcs/DWCS_Upcoming_Event_Data_Sherdog.csv', index=False)
        print("Main Event Data Saved to DWCS_Upcoming_Event_Data_Sherdog.csv")
else:
    print("CSV file is empty.")


Processing URL: https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-5-102759
Main Event Data Saved to DWCS_Upcoming_Event_Data_Sherdog.csv


In [8]:
### Upcoming Other Bout Data

# Set up headers to use in the request to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Read the CSV file with Event URLs, skipping the header row
url_df = pd.read_csv('./data-dwcs/DWCS_Upcoming_Event_URLs.csv')  # Replace with the actual path
event_url = url_df.iloc[0, 0]
print(f"Processing URL: {event_url}")  # Print the URL you're processing

# Make the request
response = requests.get(event_url, headers=headers)

# Check the status code of the response
if response.status_code != 200:
    print(f"Failed to get the webpage. Status code: {response.status_code}")
else:
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Scrape the Other Bout Data
    rows = soup.find_all('tr', itemprop='subEvent')
    for row in rows:
        fighters = row.find_all('div', class_='fighter_list')
        fighter1 = fighters[0].find('img')['title']
        fighter2 = fighters[1].find('img')['title']

        weight_class = soup.find('span', class_='weight_class').text.strip()

#         winning_method = row.find('td', class_='winby').find('b').get_text(strip=True)
#         winning_round = row.find_all('td')[-2].get_text(strip=True)
#         winning_time = row.find_all('td')[-1].get_text(strip=True)
#         referee = row.find('td', class_='winby').find('a').get_text(strip=True)

        ### Fighter IDs ###
        fighter1_url = fighters[0].find('a', itemprop='url')['href']
        fighter2_url = fighters[1].find('a', itemprop='url')['href']
        fighter1_id = (str(fighter1_url)).split('-')[-1]
        fighter2_id = (str(fighter2_url)).split('-')[-1]

        # Append Fighter IDs to existing CSV
        new_row_for_df2 = pd.DataFrame([
            {'Fighter': fighter1, 'Fighter_ID': fighter1_id},
            {'Fighter': fighter2, 'Fighter_ID': fighter2_id}
        ])
        new_row_for_df2.to_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv', mode='a', header=False, index=False)
        ### End Fighter IDs

        # Store the Data for Other Bouts
        data_to_append = {
            'Event Name': event_name,  # Replace with actual data
            'Event Location': event_location,  # Replace with actual data
            'Event Date': event_date,  # Replace with actual data
            'Fighter 1': fighter1,
            'Fighter 2': fighter2,
            'Weight Class': weight_class,
#             'Winning Fighter': fighter1,  # You might need to adjust this based on the winning fighter
#             'Winning Method': winning_method,
#             'Winning Round': winning_round,
#             'Winning Time': winning_time,
#             'Referee': referee
        }
        
        # Append the new row directly to the existing CSV
        new_row = pd.DataFrame([data_to_append])
        new_row.to_csv('./data-dwcs/DWCS_Upcoming_Event_Data_Sherdog.csv', mode='a', header=False, index=False)
        
    print("Other Bout Data Saved to DWCS_Upcoming_Event_Data_Sherdog.csv")


Processing URL: https://www.sherdog.com/events/Dana-Whites-Contender-Series-Contender-Series-2024-Week-5-102759
Other Bout Data Saved to DWCS_Upcoming_Event_Data_Sherdog.csv


# Scrape General Fighter Data

In [9]:
### Create DWCS_Fighter_Info.csv

# Define the column names
column_names = ['Fighter', 'Nickname', 'Height', 'Weight', 'Reach', 'Stance', 'Win_KO', 'Win_Sub', 'Win_Dec', 'Loss_KO', 'Loss_Sub', 'Loss_Dec', 'Sherdog URL']

# Create an empty DataFrame with the specified column names
df_empty = pd.DataFrame(columns=column_names)

# Define the file path
file_path = './data-dwcs/DWCS_Fighter_Info.csv'

# Write the empty DataFrame to the CSV file
df_empty.to_csv(file_path, index=False)


In [10]:
## Scrape data from fighter page

def scrape_fighter_general_info_sherdog(fighter, soup):
    
    fighter_dict = {}

    ## Fighter Data
    try:
        fighter_data = soup.find('div', class_='fighter-data')
    except AttributeError:
        fighter_data = None
        
    ## Birth Date
    try:
        birthdate = soup.find('span', itemprop='birthDate')
        birthdate = (birthdate.text).strip('""')
    except AttributeError:
        birthdate = '-'
    
    ## Nationality
    try:
        nationality = soup.find('strong', itemprop='nationality')
        nationality = (nationality.text).strip()
    except AttributeError:
        nationality = '-'
        
    ## Hometown
    try:
        hometown = soup.find('span', {'itemprop': 'addressLocality'}).text
        hometown = hometown.strip()
    except AttributeError:
        hometown = '-'
        
    ## Association
    try:
        association = soup.find('span', {'itemprop': 'name'}).text
        association = association.strip()
    except AttributeError:
        association = '-'
        
    ## Weight Class
    try:
        weight_class_div = fighter_data.find('div', {'class': 'association-class'})
        links = weight_class_div.find_all('a')
        weight_class = links[-1].text
        weight_class = weight_class.strip()
    except AttributeError:
        weight_class = ''
    except IndexError:
        weight_class = ''
        
    ## Nickname
    try:
        nickname = soup.find('span', class_='nickname')
        nickname = (nickname.text).strip('"')
    except AttributeError:
        nickname = '-'

    ## Height
    try:
        height = soup.find('b', itemprop='height')
        height = (height.text).strip('"')
    except AttributeError:
        height = '-'

    ## Wins
    try:
        wins = soup.find('div', class_='winloses win').find_all('span')[1]
        wins = (wins.text).strip()
    except AttributeError:
        wins = '-'

    ## Losses
    try:
        losses = soup.find('div', class_='winloses lose').find_all('span')[1]
        losses = (losses.text).strip()
    except AttributeError:
        losses = '-'

    ## Decisions
    dec_data_list = []
    try:
        win_type = fighter_data.find_all('div', class_='meter-title', string='DECISIONS')
        for method in win_type:
            if method.text.startswith('DECISIONS'):
                dec_data = method.find_next('div', class_='pl').text
                dec_data_list.append(dec_data)
        wins_dec = (dec_data_list[0]).strip()
        losses_dec = (dec_data_list[1]).strip()
    except (AttributeError, IndexError):
        wins_dec = '-'
        losses_dec = '-'

    ## Knockouts
    ko_data_list = []
    try:
        win_type = soup.find_all('div', class_='meter-title')
        for method in win_type:
            if method.text.startswith('KO'):
                ko_data = method.find_next('div', class_='pl').text
                ko_data_list.append(ko_data)
        wins_ko = (ko_data_list[0]).strip()
        losses_ko = (ko_data_list[1]).strip()
    except (AttributeError, IndexError):
        wins_ko = '-'
        losses_ko = '-'

    ## Submissions
    sub_data_list = []
    try:
        win_type = fighter_data.find_all('div', class_='meter-title', string='SUBMISSIONS')
        for method in win_type:
            if method.text.startswith('SUBMISSIONS'):
                sub_data = method.find_next('div', class_='pl').text
                sub_data_list.append(sub_data)
        wins_sub = (sub_data_list[0]).strip()
        losses_sub = (sub_data_list[1]).strip()
    except (AttributeError, IndexError):
        wins_sub = '-'
        losses_sub = '-'

    ## Store the Data
    fighter_dict = {
        'Fighter': fighter,
        'Nickname': nickname,
        'Birth Date': birthdate,
        'Nationality': nationality,
        'Hometown': hometown,
        'Association': association,
        'Weight Class': weight_class,
        'Height': height,
        'Wins': wins,
        'Losses': losses,
        'Win_Decision': wins_dec,
        'Win_KO': wins_ko,
        'Win_Sub': wins_sub,
        'Loss_Decision': losses_dec,
        'Loss_KO': losses_ko,
        'Loss_Sub': losses_sub
    }
    
    df = pd.read_csv('./data-dwcs/DWCS_Fighter_Info.csv')
    new_data = pd.DataFrame([fighter_dict]) # Convert fighter_dict to a DataFrame before using pd.concat
    df = pd.concat([df, new_data], ignore_index=True) 
    df.to_csv('./data-dwcs/DWCS_Fighter_Info.csv', index=False)


In [11]:
## Loop through fighter pages

def scrape_fighter_general_info_sherdog_loop():
    # Ignore warnings
    warnings.filterwarnings("ignore", category=FutureWarning)

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    df_fighter_id = pd.read_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv')

    urls = []  # List to store the generated URLs

    for index, row in tqdm(df_fighter_id.iterrows(), total=df_fighter_id.shape[0],
                           bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {postfix}]"):

        fighter = row['Fighter']
        print(fighter)
        # Split fighter name
        fighter_split = row['Fighter'].split()
        # Assign the first and second words to first_name and last_name
        first_name = fighter_split[0]
        last_name = fighter_split[1] if len(fighter_split) > 1 else ''
        # Get fighter ID for sherdog
        fighter_id = row['Fighter_ID']
        # Form URL
        url = f'https://www.sherdog.com/fighter/{first_name}-{last_name}-{fighter_id}'
        urls.append(url)  # Store the generated URL in the list
        # Get response from URL
        response = requests.get(url, headers=headers)
        # Check if the URL is valid
        if response.status_code != 200:
            print("URL is not valid or the request was unsuccessful.")
            print(row)
            continue  # skip to the next iteration

        soup = BeautifulSoup(response.content, 'html.parser')
        scrape_fighter_general_info_sherdog(fighter, soup)

    # Create a new DataFrame to store the URLs
    df_urls = pd.DataFrame({'Sherdog URL': urls})

    # Read the existing DWCS_Fighter_Info.csv file (if it exists)
    try:
        df_existing = pd.read_csv('./data-dwcs/DWCS_Fighter_Info.csv')
        # Merge the existing DataFrame with the new URLs DataFrame based on the index
        df_combined = pd.concat([df_existing, df_urls], axis=1)
    except FileNotFoundError:
        df_combined = df_urls  # Use only the new URLs DataFrame if the file doesn't exist

    # Write the combined DataFrame to the DWCS_Fighter_Info.csv file
    df_combined.to_csv('./data-dwcs/DWCS_Fighter_Info.csv', index=False)
    

    
## Call the Function
## BUGS
## Making new column Sherdog URL even if it exists
scrape_fighter_general_info_sherdog_loop()

  0%|          | 0/730 [00:00<?, ]

Seok Hyun Ko
Igor Cavalcanti
Andrey Pulyaev
Liam Anderson
Andreas Gustafsson
Pat Pytlik
Mansur Abdul-Malik
Wesley Schultz
Ramon Taveras
Cortavious Romious
Rodolfo Bellato
Murtaza Talha Ali
Danny Barlow
Raheam Forest
Shamil Gaziev
Greg Velasco
James Llontop
Malik Lewis
Brendson Ribeiro
Bruno Lopes
Carlos Prates
Mitch Ramirez
Zachary Reese
Eli Aronov
Abdul-Kareem Al-Selwady
George Hardwick
Cesar Almeida
Lucas Fernando
Bo Nickal
Donovan Beard
Brunno Ferreira
Leon Aliu
Farid Basharat
Allan Begosso
Vitor Petrino
Rodolfo Bellato
Yusaku Kinoshita
Jose Henrique Souza
Mick Parkin
Eduardo Neves
Esteban Ribovics
Thomas Paull
Bo Nickal
Zachary Borrego
Chris Duncan
Charlie Campbell
Joe Pyfer
Osman Diaz
Maheshate Hayisaer
Achilles Estremadura
Gadzhi Omargadzhiev
Jansey Silva
Jonny Parsons
Solomon Renfro
Christian Rodriguez
Reyes Cortez Jr.
Mike Malott
Shimon Smotritsky
Ihor Potieria
Lukasz Sudolski
A.J. Dobson
Hashem Arkhagha
Jailton Almeida
Nasrudin Nasrudinov
Josh Quinlan
Logan Urban
Azamat Murzak

# Clean 

In [12]:
# Delete empty columns

df = pd.read_csv('./data-dwcs/DWCS_Fighter_Info.csv')
columns_to_delete = ['Weight', 'Reach', 'Stance', 'Sherdog URL'] # 'Win_Dec', 'Loss_Dec'
df.drop(columns=columns_to_delete, inplace=True)
df.to_csv(file_path, index=False)
print(f"Deleted columns: {', '.join(columns_to_delete)}")

Deleted columns: Weight, Reach, Stance, Sherdog URL


In [13]:
!open ./data-dwcs/DWCS_Fighter_Info.csv

In [14]:
!open ./data-dwcs/DWCS_Event_Data_Sherdog.csv

# Fighters Fights 

In [15]:
!rm -rf ./data/fighters/

directory_path = './data-dwcs/fighters/'
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

In [16]:
# # from concurrent.futures import ThreadPoolExecutor, as_completed

# # def scrape_fighter_fights_sherdog(fighter_name, fighter_id, fighter_url):
# #     headers = {
# #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
# #     }
# #     response = requests.get(fighter_url, headers=headers, timeout=60)
    
# #     if response.status_code == 200:
# #         soup = BeautifulSoup(response.content, 'html.parser')
# #         table = soup.find('table', {'class': 'new_table fighter'})
# #         rows = table.find_all('tr')[1:]
# #         fight_data = []
# #         new_opponents = []

# #         for row in rows:
# #             cols = row.find_all('td')
# #             fight_dict = {
# #                 'Result': cols[0].text.strip(),
# #                 'Opponent': cols[1].find('a').text.strip() if cols[1].find('a') else '-',
# #                 'Event Date': cols[2].find_all('span')[-1].text.strip() if cols[2].find_all('span') else '-',
# #                 'Method/Referee': cols[3].text.strip().split('\n')[0],
# #                 'Rounds': cols[4].text.strip(),
# #                 'Time': cols[5].text.strip()
# #             }
# #             fight_data.append(fight_dict)
# #             opponent_link = cols[1].find('a')['href'] if cols[1].find('a') else None
# #             if opponent_link:
# #                 opponent_id = opponent_link.split('-')[-1]
# #                 new_opponents.append({'Fighter': fight_dict['Opponent'], 'Fighter_ID': opponent_id})
# #         return fighter_id, fighter_name, fight_data, new_opponents
# #     return fighter_id, fighter_name, [], []

# # df_fighter_id = pd.read_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv')
# # all_new_opponents = []

# # def process_fighter(row):
# #     fighter_url = f"https://www.sherdog.com/fighter/{row['Fighter'].replace(' ', '-')}-{row['Fighter_ID']}"
# #     return scrape_fighter_fights_sherdog(row['Fighter'], row['Fighter_ID'], fighter_url)

# # total_fighters = len(df_fighter_id)
# # fighters_processed = 0

# # with ThreadPoolExecutor(max_workers=10) as executor:
# #     futures = [executor.submit(process_fighter, row) for _, row in df_fighter_id.iterrows()]
# #     for future in as_completed(futures):
# #         fighter_id, fighter_name, fight_data, new_opponents = future.result()
# #         fighters_processed += 1
# #         print(f"Processed {fighters_processed}/{total_fighters} fighters.")
# #         if fight_data:
# #             pd.DataFrame(fight_data).to_csv(f"./data-dwcs/fighters/{fighter_name.replace(' ', '_')}_{fighter_id}.csv", index=False)
# #             all_new_opponents.extend(new_opponents)

# # if all_new_opponents:
# #     df_new_opponents = pd.DataFrame(all_new_opponents).drop_duplicates()
# #     df_fighter_id = pd.concat([df_fighter_id, df_new_opponents], ignore_index=True).drop_duplicates()
# #     df_fighter_id.to_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv', index=False)
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import os

# def scrape_fighter_fights_sherdog(fighter_name, fighter_id, fighter_url):
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
#     }
#     response = requests.get(fighter_url, headers=headers, timeout=60)
    
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find('table', {'class': 'new_table fighter'})
#         rows = table.find_all('tr')[1:]
#         fight_data = []
#         new_opponents = []

#         for row in rows:
#             cols = row.find_all('td')
#             fight_dict = {
#                 'Result': cols[0].text.strip(),
#                 'Opponent': cols[1].find('a').text.strip() if cols[1].find('a') else '-',
#                 'Event Date': cols[2].find_all('span')[-1].text.strip() if cols[2].find_all('span') else '-',
#                 'Method/Referee': cols[3].text.strip().split('\n')[0],
#                 'Rounds': cols[4].text.strip(),
#                 'Time': cols[5].text.strip()
#             }
#             fight_data.append(fight_dict)
#             opponent_link = cols[1].find('a')['href'] if cols[1].find('a') else None
#             if opponent_link:
#                 opponent_id = opponent_link.split('-')[-1]
#                 new_opponents.append({'Fighter': fight_dict['Opponent'], 'Fighter_ID': opponent_id})
#         return fighter_id, fighter_name, fight_data, new_opponents
#     return fighter_id, fighter_name, [], []

# df_fighter_id = pd.read_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv')
# all_new_opponents = []

# def process_fighter(row):
#     fighter_url = f"https://www.sherdog.com/fighter/{row['Fighter'].replace(' ', '-')}-{row['Fighter_ID']}"
#     return scrape_fighter_fights_sherdog(row['Fighter'], row['Fighter_ID'], fighter_url)

# total_fighters = len(df_fighter_id)
# fighters_processed = 0

# with ThreadPoolExecutor(max_workers=10) as executor:
#     futures = [executor.submit(process_fighter, row) for _, row in df_fighter_id.iterrows()]
#     for future in as_completed(futures):
#         fighter_id, fighter_name, fight_data, new_opponents = future.result()
#         fighters_processed += 1
#         print(f"Processed {fighters_processed}/{total_fighters} fighters.")
#         if fight_data:
#             # Cleaning the fighter_name inline to make it valid for filenames
#             clean_fighter_name = fighter_name.replace(' ', '_').replace("'", "").replace('"', '').replace("/", "_")
#             # Ensure the directory exists before saving the file
#             os.makedirs("./data-dwcs/fighters/", exist_ok=True)
#             pd.DataFrame(fight_data).to_csv(f"./data-dwcs/fighters/{clean_fighter_name}_{fighter_id}.csv", index=False)
#             all_new_opponents.extend(new_opponents)

# if all_new_opponents:
#     df_new_opponents = pd.DataFrame(all_new_opponents).drop_duplicates()
#     df_fighter_id = pd.concat([df_fighter_id, df_new_opponents], ignore_index=True).drop_duplicates()
#     df_fighter_id.to_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv', index=False)

def scrape_fighter_fights_sherdog(fighter_name, fighter_id, fighter_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(fighter_url, headers=headers, timeout=60)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'class': 'new_table fighter'})
        rows = table.find_all('tr')[1:]
        fight_data = []
        new_opponents = []

        for row in rows:
            cols = row.find_all('td')
            
            # Extracting event name, event link, and event date
            event_element = cols[2].find('a')
            event_name = event_element.text.strip() if event_element else '-'
            event_link = event_element['href'] if event_element else '-'
            event_date_element = cols[2].find('span', {'class': 'sub_line'})
            event_date = event_date_element.text.strip() if event_date_element else '-'
            
            fight_dict = {
                'Result': cols[0].text.strip(),
                'Opponent': cols[1].find('a').text.strip() if cols[1].find('a') else '-',
                'Event Name': event_name,
                'Event Link': f"https://www.sherdog.com{event_link}" if event_link != '-' else '-',
                'Event Date': event_date,
                'Method/Referee': cols[3].text.strip().split('\n')[0],
                'Rounds': cols[4].text.strip(),
                'Time': cols[5].text.strip()
            }
            fight_data.append(fight_dict)
            opponent_link = cols[1].find('a')['href'] if cols[1].find('a') else None
            if opponent_link:
                opponent_id = opponent_link.split('-')[-1]
                new_opponents.append({'Fighter': fight_dict['Opponent'], 'Fighter_ID': opponent_id})
                
        return fighter_id, fighter_name, fight_data, new_opponents
    return fighter_id, fighter_name, [], []

df_fighter_id = pd.read_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv')
all_new_opponents = []

def process_fighter(row):
    fighter_url = f"https://www.sherdog.com/fighter/{row['Fighter'].replace(' ', '-')}-{row['Fighter_ID']}"
    return scrape_fighter_fights_sherdog(row['Fighter'], row['Fighter_ID'], fighter_url)

total_fighters = len(df_fighter_id)
fighters_processed = 0

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_fighter, row) for _, row in df_fighter_id.iterrows()]
    for future in as_completed(futures):
        fighter_id, fighter_name, fight_data, new_opponents = future.result()
        fighters_processed += 1
        print(f"Processed {fighters_processed}/{total_fighters} fighters.")
        if fight_data:
            # Cleaning the fighter_name inline to make it valid for filenames
            clean_fighter_name = fighter_name.replace(' ', '_').replace("'", "").replace('"', '').replace("/", "_")
            # Ensure the directory exists before saving the file
            os.makedirs("./data-dwcs/fighters/", exist_ok=True)
            pd.DataFrame(fight_data).to_csv(f"./data-dwcs/fighters/{clean_fighter_name}_{fighter_id}.csv", index=False)
            all_new_opponents.extend(new_opponents)

if all_new_opponents:
    df_new_opponents = pd.DataFrame(all_new_opponents).drop_duplicates()
    df_fighter_id = pd.concat([df_fighter_id, df_new_opponents], ignore_index=True).drop_duplicates()
    df_fighter_id.to_csv('./data-dwcs/DWCS_Fighter_ID_Sherdog.csv', index=False)


Processed 1/730 fighters.
Processed 2/730 fighters.
Processed 3/730 fighters.
Processed 4/730 fighters.
Processed 5/730 fighters.
Processed 6/730 fighters.
Processed 7/730 fighters.
Processed 8/730 fighters.
Processed 9/730 fighters.
Processed 10/730 fighters.
Processed 11/730 fighters.
Processed 12/730 fighters.
Processed 13/730 fighters.
Processed 14/730 fighters.
Processed 15/730 fighters.
Processed 16/730 fighters.
Processed 17/730 fighters.
Processed 18/730 fighters.
Processed 19/730 fighters.
Processed 20/730 fighters.
Processed 21/730 fighters.
Processed 22/730 fighters.
Processed 23/730 fighters.
Processed 24/730 fighters.
Processed 25/730 fighters.
Processed 26/730 fighters.
Processed 27/730 fighters.
Processed 28/730 fighters.
Processed 29/730 fighters.
Processed 30/730 fighters.
Processed 31/730 fighters.
Processed 32/730 fighters.
Processed 33/730 fighters.
Processed 34/730 fighters.
Processed 35/730 fighters.
Processed 36/730 fighters.
Processed 37/730 fighters.
Processed 

In [17]:
directory_path = './data-dwcs/fighters/'
output_file = './data-dwcs/DWCS-Fighters-All-Fights.csv'
first_file = True

for filename in os.listdir(directory_path):
    if filename.endswith('.csv'):  # Check if the file is a CSV
        file_path = os.path.join(directory_path, filename)
        df = pd.read_csv(file_path)
        df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Use applymap for element-wise operation

        # Append to the output file, and only write the header for the first file
        df.to_csv(output_file, mode='a', header=first_file, index=False)
        first_file = False  # Ensure header is written only once


# Odds Scraping

In [18]:
# https://www.bestfightodds.com/archive
# https://www.bestfightodds.com/events/dana-white-s-contender-series-1-1330
# https://www.bestfightodds.com/events/dana-white-s-contender-series-3-1338

# https://fightodds.io/mma-events/4054/contender-series-2022-week-2/odds

In [19]:
# ### Loop through urls

# import requests

# # Base URL
# base_url = 'https://www.bestfightodds.com/events/dana-white-s-contender-series-'

# # Iterate over each URL
# for i in range(1, 100):
#     url = base_url + str(i)
    
#     # Send HTTP GET request
#     response = requests.get(url)
    
#     # Check if the request was successful
#     if response.status_code == 200:
#         # The request was successful, you can now parse the response.text
#         # with a library like BeautifulSoup if you need to extract information from the page
#         # ...
        
#         # Print or save the extracted information
#         # ...
#     else:
#         print(f'Failed to retrieve page {i}. Status code: {response.status_code}')

# print('Completed.')
