Imports

In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

Scrape all text from the rankings page and store as list of lines.

In [16]:
#Rankings URL
URL = 'https://www.ufc.com/rankings'

response = requests.get(URL)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Extract all text from the webpage
text = soup.get_text(separator='\n', strip=True)

# Split the text into lines
lines = text.splitlines()

print(lines)

['UFC Rankings, Division Rankings, P4P rankings, UFC Champions | UFC.com', 'Skip to main content', 'UFC', 'Rankings', 'Main navigation', 'Events', 'Upcoming', 'Past', 'Tickets', 'VIP Experiences', 'Group Sales', 'UFC Fight Pass Invitational', 'Road to UFC', "Dana White's Contender Series", 'Rankings', 'Athletes', 'All Athletes', 'Hall of Fame', 'Record Book', 'News', 'UFC', 'DWCS', "How to Watch Dana White's Contender Series", 'Connect', 'Newsletter', 'UFC Fight Club', 'UFC Apex', 'Find a Gym', 'EA Sports UFC 5', 'Betting Odds', 'Watch', 'How to Watch', 'Find a Bar', 'UFC Fight Pass', 'UFC Video Archive', 'SHOP', 'UFC STORE', 'UFC COLLECTIBLES', 'VENUM FIGHT WEEK', 'UFC STRIKE', 'MORE', 'Athlete Rankings', "What's Trending Now", 'Sponsored By', "Men's Pound-for-Pound", 'Top Rank', "Men's Pound-for-Pound", 'Top Rank', 'Islam Makhachev', '1', 'Islam Makhachev', '2', 'Alex Pereira', '3', 'Jon Jones', '4', 'Ilia Topuria', '5', 'Belal Muhammad', '6', 'Dricus Du Plessis', '7', 'Merab Dvalish

Convert lines to pandas dataframe with a single column called 'Fighter'. Then trim unnessecary text from the top and bottom of the data frame.

In [17]:
#Convert to pandas dataframe
df = pd.DataFrame({'Fighter':lines})

#Trim unnecessary rows
rmtop = df[df['Fighter'] == 'Top Rank'].index
df = df.loc[rmtop[0] + 1:]

rmbottom = df[df['Fighter'] == 'How are rankings determined?'].index
df = df.loc[:rmbottom[0]-1]

df.head(10)

Unnamed: 0,Fighter
46,Men's Pound-for-Pound
47,Top Rank
48,Islam Makhachev
49,1
50,Islam Makhachev
51,2
52,Alex Pereira
53,3
54,Jon Jones
55,4


Initialize the notes column and define values to move to the new column. Notes will be copied to the 'Notes' column and dropped from the 'Fighter' column.

In [18]:
#Create notes column and populate it with the values in the list below
comments = ['NR', 'Champion', 'interim', 'Rank increased by', 'Rank decreased by']

df['Notes'] = None

df.reset_index(drop = True, inplace = True)

for comment in comments:
    indices = df[df['Fighter'] == comment].index
    for index in indices:
        if index > 0:
            df.at[index - 1, 'Notes'] = comment
            df.at[index, 'Fighter'] = pd.NA

#drop empty rows
df = df.dropna(subset = ['Fighter'])
df.reset_index(drop = True, inplace = True)

df.head(10)

Unnamed: 0,Fighter,Notes
0,Men's Pound-for-Pound,
1,Top Rank,
2,Islam Makhachev,
3,1,
4,Islam Makhachev,
5,2,
6,Alex Pereira,
7,3,
8,Jon Jones,
9,4,


Add the rank change number to the comment and drop from the 'Fighter' column.

In [19]:
#Convert notes and fighter columns to strings
df['Notes'] = df['Notes'].astype(str)
df['Fighter'] = df['Fighter'].astype(str)

#Iterate through rows to add number ranks changed to notes
for index, row in df.iterrows():
    if 'Rank' in row['Notes']:
        if index + 1 < len(df):
            nextrow = df.iloc[index + 1]['Fighter']
            df.at[index, 'Notes'] += ' ' + nextrow
            df.at[index + 1, 'Fighter'] = pd.NA

#Drop empty rows
df = df.dropna(subset = ['Fighter'])
df.reset_index(drop = True, inplace = True)

df.head(10)

Unnamed: 0,Fighter,Notes
0,Men's Pound-for-Pound,
1,Top Rank,
2,Islam Makhachev,
3,1,
4,Islam Makhachev,
5,2,
6,Alex Pereira,
7,3,
8,Jon Jones,
9,4,


Initialize the 'Division' column and populate it from the values in the 'Fighter' column. Division values will then be dropped from 'Fighter'.

In [20]:
#Ordered list of divisions
divisions = [
"Men's Pound-for-Pound", 
"Flyweight", 
"Bantamweight",
"Featherweight",
"Lightweight",
"Welterweight",
"Middleweight",
"Light Heavyweight",
"Heavyweight",
"Women's Pound-for-Pound",
"Women's Strawweight",
"Women's Flyweight",
"Women's Bantamweight",
]

#Initailize division column
df['Division'] = None

#Iterate through rows adding the division to each row
for row in range(len(df)):
    if df.at[row, 'Fighter'] in divisions:
        currentdivision = df.at[row, 'Fighter']
    df.at[row, 'Division'] = currentdivision

#Remove rows with the top rank text
df = df[df['Fighter'] != 'Top Rank']
df = df[~df['Fighter'].isin(divisions)]

df.reset_index(drop = True, inplace = True)

df.head(10)

Unnamed: 0,Fighter,Notes,Division
0,Islam Makhachev,,Men's Pound-for-Pound
1,1,,Men's Pound-for-Pound
2,Islam Makhachev,,Men's Pound-for-Pound
3,2,,Men's Pound-for-Pound
4,Alex Pereira,,Men's Pound-for-Pound
5,3,,Men's Pound-for-Pound
6,Jon Jones,,Men's Pound-for-Pound
7,4,,Men's Pound-for-Pound
8,Ilia Topuria,,Men's Pound-for-Pound
9,5,,Men's Pound-for-Pound


Initalize 'Ranking' column and populate with values from the 'Fighter' column. Then, drop values from 'Fighter'.

In [21]:
#Initalize ranking column
df['Ranking'] = None

#Iterate through rows adding ranking to ranking column
for index, row in df.iterrows():
    if len(row['Fighter']) < 3:
        if index + 1 < len(df):
            currentranking = df.iloc[index]['Fighter']
            df.at[index + 1, 'Ranking'] = currentranking


df.loc[df['Notes'] == 'Champion', 'Ranking'] = 0

#Drop empty rows
df = df.dropna(subset = ['Ranking'])

df.head(10)

Unnamed: 0,Fighter,Notes,Division,Ranking
2,Islam Makhachev,,Men's Pound-for-Pound,1
4,Alex Pereira,,Men's Pound-for-Pound,2
6,Jon Jones,,Men's Pound-for-Pound,3
8,Ilia Topuria,,Men's Pound-for-Pound,4
10,Belal Muhammad,,Men's Pound-for-Pound,5
12,Dricus Du Plessis,,Men's Pound-for-Pound,6
14,Merab Dvalishvili,,Men's Pound-for-Pound,7
16,Tom Aspinall,,Men's Pound-for-Pound,8
18,Leon Edwards,,Men's Pound-for-Pound,9
20,Alexander Volkanovski,,Men's Pound-for-Pound,10


Initialize the 'Date' column and populate each row with the current date in YYYY-MM-DD format. Then, sort columns by date, division, then ranking in ascending order. Division will be sorted in the order of the list 'divisions'.

In [22]:
#Create date column
df['Date'] = None

#Add today's date to all rows
df['Date'] = datetime.now().strftime('%Y-%m-%d')

#Sort Columns
df = df[['Date', 'Division', 'Fighter', 'Ranking', 'Notes']]

df.head(10)

Unnamed: 0,Date,Division,Fighter,Ranking,Notes
2,2024-10-20,Men's Pound-for-Pound,Islam Makhachev,1,
4,2024-10-20,Men's Pound-for-Pound,Alex Pereira,2,
6,2024-10-20,Men's Pound-for-Pound,Jon Jones,3,
8,2024-10-20,Men's Pound-for-Pound,Ilia Topuria,4,
10,2024-10-20,Men's Pound-for-Pound,Belal Muhammad,5,
12,2024-10-20,Men's Pound-for-Pound,Dricus Du Plessis,6,
14,2024-10-20,Men's Pound-for-Pound,Merab Dvalishvili,7,
16,2024-10-20,Men's Pound-for-Pound,Tom Aspinall,8,
18,2024-10-20,Men's Pound-for-Pound,Leon Edwards,9,
20,2024-10-20,Men's Pound-for-Pound,Alexander Volkanovski,10,


Combine the new data to the existing csv in the repository. 

In [23]:
#Read in existing csv from repository
existing_csv = pd.read_csv('UFC_Rankings.csv')

#Combine the existing csv with the new data
combined = pd.concat([existing_csv, df], ignore_index=False)

combined.loc[combined['Ranking'] == 'Champion', 'Ranking'] = 0
combined.loc[combined['Ranking'] == 0, 'Notes'] = 'Champion'

combined['Division'] = pd.Categorical(combined['Division'], categories=divisions, ordered=True)
combined['Ranking'] = combined['Ranking'].astype(int)

#Sort values and convert back to a csv
combined = combined.sort_values(by=['Date', 'Division', 'Ranking'], ascending=[False, True, True])
combined.to_csv('UFC_Rankings.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'UFC_Rankings.csv'