# Creating methods

In [1]:
# Creating a list of dataframes which show the statistics of each round
def create_statsdf(url):
    dataframes = []
    tables = pd.read_html(url)

    if len(tables) > 0:
        first_table = tables[0]
        
    # Creating dataframes for the aforementioned fights
    for x in tables:
        df = pd.DataFrame.from_records(x)
        df.dropna(axis=0, how='any', inplace=True)
        df_reset = df.reset_index(drop=True)
        dataframes.append(df_reset)
    
    # This line corrects the fight label from 'Unnamed: 1' to 'Name' and print it out as well as its type
    for x in dataframes:
        z = list(x.columns.values.tolist())
        z[1] = 'Name'
        x.columns = z
        
    return dataframes

In [2]:
# Creating a data dictionary with the keys being the names of the rounds and the values being the statistics of the dataframe
def create_datadict(z,y):
    data_dict = {a: b for a, b in zip(z, y)}

In [3]:
# Creating xlsx files from data
def create_xlsx(y):
    # Specifying the directory where I want to save the files
    output_directory = r"C:\Users\gbore\Downloads\MMA"

    # Checking if the directory exists, if not, it creates it
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Converting url to string
    urlstring = str(url)
    # Extract the relevant part from the URL to create a subdirectory
    subdirectory = urlstring.replace("https://sports-statistics.com/ufc/ufc-fight-statistics/", "")

    # Create a subdirectory based on the extracted part
    subdirectory_path = os.path.join(output_directory, subdirectory)
    if not os.path.exists(subdirectory_path):
        os.makedirs(subdirectory_path)
    
    # Create a file path based on the URL within the subdirectory
    file_path = os.path.join(subdirectory_path, f"{subdirectory_path}stats.xlsx")
    
    # Creating an ExcelWriter object and writing each URL's data to a unique file path
    with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
        # Iterate over the dictionary and write each DataFrame to a different sheet
        for (sheet_name, df) in y.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

In [4]:
# We first import necessary packages
import re
import csv
# Now, we create our method to get the round, event and fight names of each fight
def create_round_names(url):
    req = Request(url)
    html_page = urlopen(req)

    # This converts the html page to a soup object
    soup = BeautifulSoup(html_page, "lxml")

    # This creates a string of each fighter's statistics in each round
    variablenames = []
    for x in soup.find_all("div", {"class": "totals_by_round"}):
        for y in soup.find_all("h3"):
            for z in soup.find_all("h4"):
                a = str(y) + str(z)
                variablenames.append(a)

    # This makes sure there are no duplicates
    finalnames = [set(variablenames)]

    listnames = ["Fight Totals By Round-All", "Significant Strikes By Round-All"]

    # Finally, we remove extraneous text like <h3> and <h4> tags
    for x in finalnames:
        for y in x:
            z = str(y)
            z = z.replace("<h3>", "")
            z = z.replace("</h3><h4>", "-")
            z = z.replace("</h4>", "")
            if z.startswith("Topics") == False:
                listnames.append(z)

    # Sorting the list name in order
    finallistnames = []

    # Shortening the name of the list and printing it
    for x in listnames:
        z = x.replace(" By Round", "")
        finallistnames.append(z)

    # Sorting the list name in order
    finallist = sorted(finallistnames)

    return finallist
        

# Scraping required links

In [5]:
# Importing all necessary packages
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re

In [6]:
#Choosing the main page with all the links to the events and converting that into an html page
req = Request("https://sports-statistics.com/ufc/ufc-fight-statistics/")
html_page = urlopen(req)

# Converting the html page to a BeautifulSoup object
soup = BeautifulSoup(html_page, "lxml")

# Extracting all the href links
links = []
for link in soup.findAll('a'):
    links.append(link.get('href'))

# Concatenating the main page and the href links to have a list of objects
fulllinks = []
for x in links:
    fulllink = 'https://sports-statistics.com/ufc/ufc-fight-statistics/' + x
    fulllinks.append(fulllink)
    
# Filtering out links unrelated to the UFC like https://sports-statistics.com/nba
finalinitiallinks = []
for x in fulllinks:
    if x.startswith('https://sports-statistics.com/ufc/ufc-fight-statistics/ufc') or x.startswith('https://sports-statistics.com/ufc/ufc-fight-statistics/the'):
        finalinitiallinks.append(x[:-1])

# Printing out the final, relevant filtered UFC links
for x in finalinitiallinks:
    print(x)



https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ankalaev-vs-walker-2
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-296-edwards-vs-covington
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-song-vs-gutierrez
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-dariush-vs-tsarukyan
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-allen-vs-craig
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-295-prochazka-vs-pereira
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-almeida-vs-lewis
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-294-makhachev-vs-volkanovski-2
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-yusuff-vs-barboza
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-dawson-vs-green
https://sports-statistics.com/u

In [7]:
# Importing datetime module
from datetime import datetime
# Counting the number of links
a = str(len(finalinitiallinks))
b = datetime.now()
c = b.date()
print("There have been " + a + " events as of", c, " excluding events which have been cancelled.")

There have been 663 events as of 2024-01-27  excluding events which have been cancelled.


In [10]:
# Scraping out the links to each and every UFC fight from the links to each UFC event and storing it in linksfinal
linksfinal = []
for x in finalinitiallinks:
    try:
        print(x)
        req = Request(x)
        html_page = urlopen(req)

        soup = BeautifulSoup(html_page, "lxml")

        for link in soup.findAll('a'):
            linksfinal.append(link.get('href'))
    except:
        if isinstance(e.reason, socket.timeout):
            print(f"Timeout error accessing {x}")
        else:
            print(f"Error accessing {x}: {e}")

https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ankalaev-vs-walker-2
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-296-edwards-vs-covington
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-song-vs-gutierrez
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-dariush-vs-tsarukyan
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-allen-vs-craig
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-295-prochazka-vs-pereira
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-almeida-vs-lewis
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-294-makhachev-vs-volkanovski-2
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-yusuff-vs-barboza
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-dawson-vs-green
https://sports-statistics.com/u

In [11]:
# Printing out these links and counting the number of entries
for x in linksfinal:
    print(x)
len(linksfinal)

/
/ufc/ufc-fight-statistics/
/nba/
/nfl/
/mlb/
/nhl/
/tennis/
#subnav-box-nav-off_prem
/
/ufc/ufc-fight-statistics/
#masthead
#subnav-box-nav-on_prem
#
/f1/
/darts/
/nba/
/wnba/
/nfl/
/nhl/
/mlb/
/tennis/
/ufc/ufc-fight-statistics/
/sports/
#masthead
#subnav-box-nav-software
#
/nba/odds/
/nfl/odds/
/nhl/odds/
/mlb/odds/
/ufc/odds/
/odds-converter/
/features/vegas-odds-sportsbetting-guide/
#masthead
#subnav-box-nav-offbeat
#
/privacy-policy/
/contact/
#masthead
/
/ufc/ufc-fight-statistics/
/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/sean-strickland-v-dricus-du-plessis/
/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/raquel-pennington-v-mayra-bueno-silva/
/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/neil-magny-v-mike-malott/
/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/chris-curtis-v-marc-andre-barriault/
/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/arnold-allen-v-movsar-evloev/
/ufc/ufc-fight-statistics/ufc-297-strickla

43783

In [12]:
# Choosing only links that start with '/ufc/ufc-fight-statistics/ufc' or '/ufc/ufc-fight-statistics/the' as the other links are not related to
# UFC and appending the correlated links to a list
# Final blank list which will be filled
fulllinksfinal = []
# Loop that will fill the list
for y in linksfinal:
    if y.startswith('/ufc/ufc-fight-statistics/ufc') or y.startswith('/ufc/ufc-fight-statistics/the'):
        fulllink = 'https://sports-statistics.com' + y
        fulllinksfinal.append(fulllink)
        

In [13]:
#Printing out the values of the list with every ufc fight along with their type
for x in fulllinksfinal:
    print(x)
    print(type(x))

https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/sean-strickland-v-dricus-du-plessis/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/raquel-pennington-v-mayra-bueno-silva/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/neil-magny-v-mike-malott/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/chris-curtis-v-marc-andre-barriault/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/arnold-allen-v-movsar-evloev/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/brad-katona-v-garrett-armfield/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/charles-jourdain-v-sean-woodson/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statisti


https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-barboza-vs-chikadze/edson-barboza-v-giga-chikadze/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-barboza-vs-chikadze/bryan-battle-v-gilbert-urbina/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-barboza-vs-chikadze/ricky-turcios-v-brady-hiestand/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-barboza-vs-chikadze/kevin-lee-v-daniel-rodriguez/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-barboza-vs-chikadze/andre-petroski-v-micheal-gillmore/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-barboza-vs-chikadze/makhmud-muradov-v-gerald-meerschaert/
<class 'str'>
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-barboza-vs-chikadze/alessio-di-chirico-v-abdul-razak-alhassan/
<class 'str'>
https://sports-statistic

They are strings as expected

In [14]:
# Importing datetime module
from datetime import datetime
# Counting the number of links
a = str(len(fulllinksfinal))
b = datetime.now()
c = b.date()
print("There have been " + a + " fights as of", c, " excluding fights which have been cancelled.")

There have been 7318 fights as of 2024-01-27  excluding fights which have been cancelled.


# Testing models using fights of various rounds 

Once again, some fights were omitted but this is ok as we are looking for trends. There were actually 7323 fights as of 08/08/2023.

In [15]:
# Importing additional packages
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests

# Scraping recent Jan Blachowicz vs Alex Perira fight statistics and printing these out
url = "https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-291-poirier-vs-gaethje-2/jan-blachowicz-v-alex-pereira/"
tables = pd.read_html(url)

if len(tables) > 0:
    first_table = tables[0]
    
#Creating dataframes for the aforementioned Pereira Blachowicz fight
dataframes = []
for x in tables:
    df = pd.DataFrame.from_records(x)
    df.dropna(axis=0, how='any', inplace=True)
    df_reset = df.reset_index(drop=True)
    dataframes.append(df_reset)

# This for loop is to correct the fight label from 'Unnamed: 1' to 'Name'
for x in dataframes:
    z = list(x.columns.values.tolist())
    z[1] = 'Name'
    x.columns = z
    print(x)
    print(type(x))

# We then see the number of values in each dataframe list
len(dataframes)

   Jan Blachowicz                           Name Alex Pereira
0               0                     Knockdowns            0
1        52 of 81            Significant Strikes     70 of 92
2              81  Significant Strikes Attempted           92
3          64.20%          Significant Strikes %       76.09%
4              82                  Total Strikes          112
5             117        Total Strikes Attempted          138
6          3 of 8                      Takedowns       0 of 0
7               8            Takedowns Attempted            0
8          37.50%                     Takedown %           0%
9               2          Submissions Attempted            1
10              0                         Passes            0
11              0                      Reversals            0
<class 'pandas.core.frame.DataFrame'>
  Jan Blachowicz                   Name Alex Pereira
0              0             Knockdowns            0
1         6 of 6    Significant Strikes       2 of

8

There are 8 dataframes as expected. 3 showing significant strikes in each round and 3 showing fight totals in each round and then total significant strikes and stats.

In [16]:
# Next, we print out the names of each round in each table individually
req = Request("https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-291-poirier-vs-gaethje-2/jan-blachowicz-v-alex-pereira/")
html_page = urlopen(req)

# This converts the html page to a soup object
soup = BeautifulSoup(html_page, "lxml")

# This creates a string of each fighter's statistics in each round
variablenames = []
for x in soup.find_all("div", {"class": "totals_by_round"}):
    for y in soup.find_all("h3"):
        for z in soup.find_all("h4"):
            a = str(y) + str(z)
            variablenames.append(a)

# This makes sure there are no duplicates
finalnames = [set(variablenames)]

listnames = ["Fight Totals By Round-All", "Significant Strikes By Round-All"]

# Finally, we remove extraneous text like <h3> and <h4> tags
for x in finalnames:
    for y in x:
        z = str(y)
        z = z.replace("<h3>", "")
        z = z.replace("</h3><h4>", "-")
        z = z.replace("</h4>", "")
        if z.startswith("Topics") == False:
            listnames.append(z)

# Shortening the name of the list and printing it
for x in listnames:
    z = x.replace(" By Round", "")
    
finallist = sorted(listnames)

# Shortening the name of the list and printing it
for x in finallist:
    z = x.replace(" By Round", "")
    print(z)


Fight Totals-All
Fight Totals-Round 1
Fight Totals-Round 2
Fight Totals-Round 3
Significant Strikes-All
Significant Strikes-Round 1
Significant Strikes-Round 2
Significant Strikes-Round 3


All 3 round and significant strikes were printed as expected.

We are now going to run this code and test it with 2 and 4 round fights.

In [17]:
# 2 round fight test
# Scraping recent Jan Blachowicz vs Alex Perira fight statistics and printing these out
url = "https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-holm-vs-bueno-silva/holly-holm-v-mayra-bueno-silva/"
tables = pd.read_html(url)

if len(tables) > 0:
    first_table = tables[0]
    
#Creating dataframes for the aforementioned Holm Bueno-Silva fight
dataframes = []
for x in tables:
    df = pd.DataFrame.from_records(x)
    df.dropna(axis=0, how='any', inplace=True)
    df_reset = df.reset_index(drop=True)
    dataframes.append(df_reset)

# This for loop is to correct the fight label from 'Unnamed: 1' to 'Name'
for x in dataframes:
    z = list(x.columns.values.tolist())
    z[1] = 'Name'
    x.columns = z
    print(x)
    print(type(x))

# We then see the number of values in each dataframe list
len(dataframes)

   Holly Holm                           Name Mayra Bueno Silva
0           0                     Knockdowns                 0
1    27 of 39            Significant Strikes          20 of 37
2          39  Significant Strikes Attempted                37
3      69.23%          Significant Strikes %            54.05%
4          50                  Total Strikes                31
5          70        Total Strikes Attempted                50
6      0 of 1                      Takedowns            0 of 0
7           1            Takedowns Attempted                 0
8          0%                     Takedown %                0%
9           0          Submissions Attempted                 1
10          0                         Passes                 0
11          0                      Reversals                 0
<class 'pandas.core.frame.DataFrame'>
  Holly Holm                   Name Mayra Bueno Silva
0          0             Knockdowns                 0
1   26 of 38    Significant Strikes

6

Everything printed was as expected as there were 6 dataframes(2 for the first round, 2 for the significant and 2 for total significant strike totals and round totals)

In [18]:
# Importing necessary packages
# Importing necessary libraries for this
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re

# Next, we print out the names of each round in each table individually
req = Request("https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-holm-vs-bueno-silva/holly-holm-v-mayra-bueno-silva/")
html_page = urlopen(req)

# This converts the html page to a soup object
soup = BeautifulSoup(html_page, "lxml")

# This creates a string of each fighter's statistics in each round
variablenames = []
for x in soup.find_all("div", {"class": "totals_by_round"}):
    for y in soup.find_all("h3"):
        for z in soup.find_all("h4"):
            a = str(y) + str(z)
            variablenames.append(a)

# This makes sure there are no duplicates
finalnames = [set(variablenames)]

listnames = ["Fight Totals By Round-All", "Significant Strikes By Round-All"]

# Finally, we remove extraneous text like <h3> and <h4> tags
for x in finalnames:
    for y in x:
        z = str(y)
        z = z.replace("<h3>", "")
        z = z.replace("</h3><h4>", "-")
        z = z.replace("</h4>", "")
        if z.startswith("Topics") == False:
            listnames.append(z)

# Shortening the name of the list and printing it
for x in listnames:
    z = x.replace(" By Round", "")
    
finallist = sorted(listnames)

for x in finallist:
    print(x)

Fight Totals By Round-All
Fight Totals By Round-Round 1
Fight Totals By Round-Round 2
Significant Strikes By Round-All
Significant Strikes By Round-Round 1
Significant Strikes By Round-Round 2


There are 4 dataframes in total. 2 rounds of significant strikes data, 2 rounds of overall stats data. We will now see this with a 4 round fight as one final test. We did not print all the rounds here.

In [19]:
# Next, we print out the names of each round in each table individually
req = Request("https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-291-poirier-vs-gaethje-2/dustin-poirier-v-justin-gaethje/")
html_page = urlopen(req)

# This converts the html page to a soup object
soup = BeautifulSoup(html_page, "lxml")

# This creates a string of each fighter's statistics in each round
variablenames = []
for x in soup.find_all("div", {"class": "totals_by_round"}):
    for y in soup.find_all("h3"):
        for z in soup.find_all("h4"):
            a = str(y) + str(z)
            variablenames.append(a)

# This makes sure there are no duplicates
finalnames = [set(variablenames)]

listnames = ["Fight Totals By Round-All", "Significant Strikes By Round-All"]

# Finally, we remove extraneous text like <h3> and <h4> tags
for x in finalnames:
    for y in x:
        z = str(y)
        z = z.replace("<h3>", "")
        z = z.replace("</h3><h4>", "-")
        z = z.replace("</h4>", "")
        if z.startswith("Topics") == False:
            listnames.append(z)

# Shortening the name of the list and printing it
for x in listnames:
    z = x.replace(" By Round", "")

# Sorting the list name in order
finallist = sorted(listnames)

# Shortening the name of the list and printing it
for x in finallist:
    z = x.replace(" By Round", "")
    print(z)

len(finallist)

Fight Totals-All
Fight Totals-Round 1
Fight Totals-Round 2
Fight Totals-Round 3
Fight Totals-Round 4
Significant Strikes-All
Significant Strikes-Round 1
Significant Strikes-Round 2
Significant Strikes-Round 3
Significant Strikes-Round 4


10

We acurately printed significant strike and round totals for each of the 4 rounds here also.

In [20]:
# 2 round fight test
# Scraping recent Jan Blachowicz vs Alex Perira fight statistics and printing these out
url = "https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-291-poirier-vs-gaethje-2/dustin-poirier-v-justin-gaethje/"
tables = pd.read_html(url)

if len(tables) > 0:
    first_table = tables[0]
    
#Creating dataframes for the aforementioned Holm Bueno-Silva fight
dataframes = []
for x in tables:
    df = pd.DataFrame.from_records(x)
    df.dropna(axis=0, how='any', inplace=True)
    df_reset = df.reset_index(drop=True)
    dataframes.append(df_reset)

# This for loop is to correct the fight label from 'Unnamed: 1' to 'Name'
for x in dataframes:
    z = list(x.columns.values.tolist())
    z[1] = 'Name'
    x.columns = z
    print(x)
    print(type(x))

# We then see the number of values in each dataframe list
len(dataframes)

   Dustin Poirier                           Name Justin Gaethje
0               0                     Knockdowns              0
1      174 of 351            Significant Strikes     115 of 212
2             351  Significant Strikes Attempted            212
3          49.57%          Significant Strikes %         54.25%
4             179                  Total Strikes            116
5             357        Total Strikes Attempted            213
6          0 of 5                      Takedowns         0 of 0
7               5            Takedowns Attempted              0
8              0%                     Takedown %             0%
9               0          Submissions Attempted              0
10              0                         Passes              0
11              0                      Reversals              0
<class 'pandas.core.frame.DataFrame'>
  Dustin Poirier                   Name Justin Gaethje
0              0             Knockdowns              0
1      63 of 132    

10

We printed significant strike and round totals for each of the four rounds and the overall fight accurately.

Thus, the testing was accurate. We just now have to see if they are equivalent to the models in our methods.

In [21]:
z = create_round_names("https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-291-poirier-vs-gaethje-2/dustin-poirier-v-justin-gaethje/")
print(z)

['Fight Totals-All', 'Fight Totals-Round 1', 'Fight Totals-Round 2', 'Fight Totals-Round 3', 'Fight Totals-Round 4', 'Significant Strikes-All', 'Significant Strikes-Round 1', 'Significant Strikes-Round 2', 'Significant Strikes-Round 3', 'Significant Strikes-Round 4']


In [22]:
# Importing os
import os

# List of URLs
urls = ["https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-291-poirier-vs-gaethje-2/jan-blachowicz-v-alex-pereira/",
        "https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-holm-vs-bueno-silva/holly-holm-v-mayra-bueno-silva/",
        "https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-291-poirier-vs-gaethje-2/dustin-poirier-v-justin-gaethje/"]

# Specifying the directory where I want to save the files
output_directory = r"C:\Users\gbore\Downloads\MMA"

# Checking if the directory exists, if not, it creates it
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Iterating over each URL and writing it to a file
for url in urls:
    urlstring = str(url)
    # Extract the relevant part from the URL to create a subdirectory
    subdirectory = urlstring.replace("https://sports-statistics.com/ufc/ufc-fight-statistics/", "")

    # Create a subdirectory based on the extracted part
    subdirectory_path = os.path.join(output_directory, subdirectory)
    if not os.path.exists(subdirectory_path):
        os.makedirs(subdirectory_path)

    y = create_statsdf(url)
    z = create_round_names(url)
    data_dict = {a: b for a, b in zip(z, y)}
    
     # Create a file path based on the URL within the subdirectory
    file_path = os.path.join(subdirectory_path, f"{subdirectory_path}stats.xlsx")
    
    # Creating an ExcelWriter object and writing each URL's data to a unique file path
    with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
        # Iterate over the dictionary and write each DataFrame to a different sheet
        for (sheet_name, df) in data_dict.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(type(subdirectory))
    print(subdirectory)


<class 'str'>
ufc-291-poirier-vs-gaethje-2/jan-blachowicz-v-alex-pereira/
<class 'str'>
ufc-fight-night-holm-vs-bueno-silva/holly-holm-v-mayra-bueno-silva/
<class 'str'>
ufc-291-poirier-vs-gaethje-2/dustin-poirier-v-justin-gaethje/


# Cleaning the data to find only decision fights

We will only evaluate 4000 fights at a time to avoid timeout errors

In [23]:
# Looking at their urls
for x in fulllinksfinal[0:4000]:
    print(x)

https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/sean-strickland-v-dricus-du-plessis/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/raquel-pennington-v-mayra-bueno-silva/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/neil-magny-v-mike-malott/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/chris-curtis-v-marc-andre-barriault/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/arnold-allen-v-movsar-evloev/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/brad-katona-v-garrett-armfield/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/charles-jourdain-v-sean-woodson/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/serhiy-sidey-v-ramon-taveras/
https://sports-statistics.com/uf


https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ngannou-vs-dos-santos/alonzo-menifield-v-paul-craig/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ngannou-vs-dos-santos/ricardo-ramos-v-journey-newson/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ngannou-vs-dos-santos/eryk-anders-v-vinicius-moreira/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ngannou-vs-dos-santos/jared-gordon-v-dan-moret/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ngannou-vs-dos-santos/dalcha-lungiambula-v-dequan-townsend/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ngannou-vs-dos-santos/emily-whitmire-v-amanda-ribas/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ngannou-vs-dos-santos/maurice-greene-v-junior-albini/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-moicano-vs-the-korean-zombie/renato-moicano-v-chan-sung

In [24]:
# Adding their urls to a list
# Importing necessary packages
from urllib.request import urlopen
from urllib.error import URLError
from urllib.request import Request

# Importing Counter from collections which we need to find unique elements
from collections import Counter

# Creating a list for decisionfights
decisionfights = list()

# Writing a loop which prints out the unique ways of winning for the first 4000 fights    
for x in fulllinksfinal[0:4000]:
    # Converting these links into html pages
    req = Request(x)
    
    try:
        html_page = urlopen(req)  # Set a longer timeout (e.g., 10 seconds)
        
        # This converts the html page to a soup object
        soup = BeautifulSoup(html_page, "lxml")

        # We then create the list of fights which ended by decision
        for z in soup.find_all("li"):
            finaltext = z.get_text()
            if finaltext.startswith("Method: Decision") == True:
                print(x)
                decisionfights.append(x)
    
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            print(f"Timeout error accessing {x}")
        else:
            print(f"Error accessing {x}: {e}")

https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/sean-strickland-v-dricus-du-plessis/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/raquel-pennington-v-mayra-bueno-silva/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/chris-curtis-v-marc-andre-barriault/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/arnold-allen-v-movsar-evloev/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/brad-katona-v-garrett-armfield/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/charles-jourdain-v-sean-woodson/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-297-strickland-vs-du-plessis/serhiy-sidey-v-ramon-taveras/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-ankalaev-vs-walker-2/ricky-simon-v-mario-bautista/
https://sports-statistic

In [25]:
# Importing necessary packages
from urllib.request import urlopen
from urllib.error import URLError
from urllib.request import Request

# Importing Counter from collections which we need to find unique elements
from collections import Counter

# Creating a list for decisionfights
decisionfights2 = list()

# Writing a loop which prints out the unique ways of winning for every fight after the 4000th   
for x in fulllinksfinal[4001:]:
    # Converting these links into html pages
    req = Request(x)
    
    try:
        html_page = urlopen(req) 
        
        # This converts the html page to a soup object
        soup = BeautifulSoup(html_page, "lxml")

        # We then create the list of fights which ended by decision
        for z in soup.find_all("li"):
            finaltext = z.get_text()
            if finaltext.startswith("Method: Decision") == True:
                print(x)
                decisionfights.append(x)
    
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            print(f"Timeout error accessing {x}")
        else:
            print(f"Error accessing {x}: {e}")

https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-on-fox-dillashaw-vs-barao-ii/eddie-wineland-v-bryan-caraway/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-on-fox-dillashaw-vs-barao-ii/ramsey-nijem-v-andrew-holbrook/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-on-fox-dillashaw-vs-barao-ii/jessamyn-duke-v-elizabeth-phillips/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/michael-bisping-v-thales-leites/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/ross-pearson-v-evan-dunham/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/joanne-wood-v-cortney-casey/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/leon-edwards-v-pawel-pawlak/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/paddy-holohan-v-vaughan-lee/
https://sports-statistics.com/ufc/ufc-fight

In [26]:
# Importing datetime module
from datetime import datetime
# Counting the number of links
a = str(len(decisionfights))
b = datetime.now()
c = b.date()
print("There have been " + a + " fights ending in a decision as of", c, " excluding events which have been cancelled.")

There have been 1370 fights ending in a decision as of 2024-01-27  excluding events which have been cancelled.


In [27]:
for fight in decisionfights:
    print(fight)

https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-on-fox-dillashaw-vs-barao-ii/eddie-wineland-v-bryan-caraway/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-on-fox-dillashaw-vs-barao-ii/ramsey-nijem-v-andrew-holbrook/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-on-fox-dillashaw-vs-barao-ii/jessamyn-duke-v-elizabeth-phillips/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/michael-bisping-v-thales-leites/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/ross-pearson-v-evan-dunham/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/joanne-wood-v-cortney-casey/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/leon-edwards-v-pawel-pawlak/
https://sports-statistics.com/ufc/ufc-fight-statistics/ufc-fight-night-bisping-vs-leites/paddy-holohan-v-vaughan-lee/
https://sports-statistics.com/ufc/ufc-fight