In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import csv
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

In [None]:
# Download github data
#
# Rebuilds ./data from scratch and downloads the source CSVs into ./data/github/.
# The cell fails fast with a clear error when any required file cannot be fetched.

# Only remove the tree when it exists, so a fresh checkout (no ./data yet)
# does not fail with FileNotFoundError.
if os.path.isdir('./data'):
    shutil.rmtree('./data')  # Remove the directory and all its contents
os.makedirs('./data/github/', exist_ok=True)
urls = [
    'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/main/ufc_event_details.csv',
    'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/main/ufc_fight_results.csv',
    #'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/main/ufc_fight_details.csv',
    #'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/main/ufc_fight_stats.csv',
    #'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/main/ufc_fighter_details.csv',
    'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/main/ufc_fighter_tott.csv'
]
downloaded_files = []
failed_urls = []
for url in urls:
    filename = os.path.join('./data/github/', url.split('/')[-1])
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to download: {url} ({exc})")
        failed_urls.append(url)
        continue
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.content)
        downloaded_files.append(filename)
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download: {url}")
        failed_urls.append(url)
# Every listed file is read below and by later cells, so stop here with an
# explicit message rather than an opaque FileNotFoundError from pd.read_csv.
if failed_urls:
    raise RuntimeError(f"Could not download {len(failed_urls)} required file(s): {failed_urls}")
for filename in downloaded_files:
    df = pd.read_csv(filename)
    print(f"Total rows in {filename}: {len(df)}")

In [None]:
# Scrape fighter pages from ufcstats.com

# One HTTP session for all requests, sending a desktop-browser User-Agent header.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Input table of fighters; the scraping code below reads its FIGHTER and URL columns.
df = pd.read_csv('./data/github/ufc_fighter_tott.csv')
print(f"Loaded {len(df)} fighters from CSV")

# Shared result containers, each paired with the lock that guards it.
fighter_details, failed_fighters = [], []
details_lock, failed_lock, counter_lock = Lock(), Lock(), Lock()
completed_count = 0
def scrape_fighter(fighter_info):
    """Fetch one fighter page and record the stats parsed from it.

    ``fighter_info`` is an ``(index, name, url)`` tuple. On success the parsed
    record is appended to ``fighter_details`` and True is returned. On any
    exception the name, URL and error text are appended to ``failed_fighters``
    and False is returned. The shared lists and ``completed_count`` are only
    modified while holding their respective locks.
    """
    global completed_count
    _, fighter_name, fighter_url = fighter_info
    # Ordered (substring of the lower-cased label, output field) pairs.
    # The first pair whose substring occurs in the label wins.
    label_to_field = (
        ('height', 'height'),
        ('weight', 'weight'),
        ('reach', 'reach'),
        ('stance', 'stance'),
        ('dob', 'dob'),
        ('slpm', 'slpm'),
        ('str. acc', 'str_acc'),
        ('sapm', 'sapm'),
        ('str. def', 'str_def'),
        ('td avg', 'td_avg'),
        ('td acc', 'td_acc'),
        ('td def', 'td_def'),
        ('sub. avg', 'sub_avg'),
    )
    try:
        response = session.get(fighter_url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Every stat starts out blank and is filled in when its label is found.
        fighter_data = {'name': fighter_name, 'url': fighter_url}
        for _, field in label_to_field:
            fighter_data[field] = ''

        for info_box in soup.find_all('div', class_='b-list__info-box'):
            for item in info_box.find_all('li', class_='b-list__box-list-item'):
                title_elem = item.find('i', class_='b-list__box-item-title')
                if not title_elem:
                    continue
                title = title_elem.text.strip().lower().replace(':', '')
                # The value is the list item's text with the label text removed.
                value = item.get_text().replace(title_elem.get_text(), '').strip()
                for needle, field in label_to_field:
                    if needle in title:
                        fighter_data[field] = value
                        break

        # '--' entries are stored as empty strings.
        for key in fighter_data:
            if fighter_data[key] == '--':
                fighter_data[key] = ''

        with details_lock:
            fighter_details.append(fighter_data)
        with counter_lock:
            completed_count += 1
            # Report the first 20 completions, then every 100th.
            if completed_count <= 20 or completed_count % 100 == 0:
                print(f"✓ Progress: {completed_count}/{len(df)} - Latest: {fighter_name}")
        return True
    except Exception as e:
        with failed_lock:
            failed_fighters.append({'name': fighter_name, 'url': fighter_url, 'error': str(e)})
        with counter_lock:
            completed_count += 1
            # Failures are only echoed while the overall count is still within the first 20.
            if completed_count <= 20:
                print(f"✗ Failed: {fighter_name} - {str(e)}")
        return False
# Run the scrape across a thread pool, then save and summarise the results.
fighter_list = [(index, row['FIGHTER'], row['URL']) for index, row in df.iterrows()]
total_fighters = len(df)
max_workers = 10  # single source of truth for the pool size and the banner below
print(f"Starting FAST parallel scraping with {max_workers} threads...")
start_time = time.time()
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(scrape_fighter, fighter_info) for fighter_info in fighter_list]
    for future in as_completed(futures):
        # result() re-raises anything that escaped the worker instead of dropping it silently.
        future.result()
end_time = time.time()
elapsed_time = end_time - start_time

# Guard the ratios so an empty fighter CSV reports zeros instead of raising ZeroDivisionError.
success_pct = len(fighter_details) / total_fighters * 100 if total_fighters else 0.0
avg_seconds = elapsed_time / total_fighters if total_fighters else 0.0

print("\n=== FAST SCRAPING COMPLETED ===")
print(f"Total time: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
print(f"Successfully scraped: {len(fighter_details)} fighters")
print(f"Failed: {len(failed_fighters)} fighters")
print(f"Success rate: {len(fighter_details)}/{total_fighters} ({success_pct:.1f}%)")
print(f"Average time per fighter: {avg_seconds:.3f} seconds")

if fighter_details:
    fieldnames = [
        'name', 'url', 'height', 'weight', 'reach', 'stance', 'dob',
        'slpm', 'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg'
    ]
    with open('./data/fighters.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(fighter_details)
    print(f"✓ Saved {len(fighter_details)} fighter details to ./data/fighters.csv")
else:
    print("No fighter details to save")

# Display summary and sample data
if fighter_details:
    df_results = pd.DataFrame(fighter_details)
    print("=== SCRAPING SUMMARY ===")
    print(f"Total fighters scraped: {len(df_results)}")
    print(f"Total columns: {len(df_results.columns)}")
    print(f"Success rate: {len(fighter_details)}/{total_fighters} ({success_pct:.1f}%)")
    print("\n=== COLUMN ANALYSIS ===")
    for col in df_results.columns:
        # Missing values are stored as empty strings by the scraper.
        missing_count = (df_results[col] == '').sum()
        missing_pct = (missing_count / len(df_results)) * 100
        print(f"{col}: {missing_count} missing ({missing_pct:.1f}%)")
    print("\n=== SAMPLE DATA (First 3 fighters) ===")
    print(df_results.head(3).to_string())
else:
    print("No data scraped")

# Report failures unconditionally: when every request failed, this list is the
# only diagnostic available, so it must not be hidden behind `if fighter_details`.
if failed_fighters:
    print("\n=== FAILED FIGHTERS ===")
    for fighter in failed_fighters:
        print(f"- {fighter['name']}: {fighter['error']}")



In [None]:
# Delete all events before 2010 from ufc_fight_results.csv
#
# Events are dated via ufc_event_details.csv and matched to fights by event name.
# A backup of the unfiltered fight results is written before the file is overwritten.

import pandas as pd
print('=== CLEANING FIGHT RESULTS - REMOVING PRE-2010 EVENTS ===')
fight_results_df = pd.read_csv('./data/github/ufc_fight_results.csv')
events_df = pd.read_csv('./data/github/ufc_event_details.csv')
print(f'Original fight results count: {len(fight_results_df)}')
# EVENT is the join key between the two files: normalise whitespace on BOTH
# sides so stray padding in either file cannot silently prevent a match.
fight_results_df['EVENT'] = fight_results_df['EVENT'].str.strip()
events_df['EVENT'] = events_df['EVENT'].str.strip()
events_df['DATE_PARSED'] = pd.to_datetime(events_df['DATE'], format='%B %d, %Y')
cutoff_date = pd.Timestamp('2010-01-01')
events_before_2010 = events_df[events_df['DATE_PARSED'] < cutoff_date]
event_names_to_remove = events_before_2010['EVENT'].tolist()
print(f'Events before 2010 to remove: {len(event_names_to_remove)}')
matching_events = set(event_names_to_remove) & set(fight_results_df['EVENT'].unique())
print(f'Matching events found: {len(matching_events)}')
if len(matching_events) > 0:
    print(f'Removing fights from {len(matching_events)} events...')
    fights_before_removal = len(fight_results_df)
    fight_results_cleaned = fight_results_df[~fight_results_df['EVENT'].isin(matching_events)]
    fights_after_removal = len(fight_results_cleaned)
    print(f'Fights removed: {fights_before_removal - fights_after_removal}')
    print(f'Remaining fights: {fights_after_removal}')
    # Write the backup BEFORE overwriting the source file, so a failure while
    # saving can never leave us without the unfiltered data.
    fight_results_df.to_csv('./data/github/ufc_fight_results_backup.csv', index=False)
    print('✓ Created backup at ./data/github/ufc_fight_results_backup.csv')
    fight_results_cleaned.to_csv('./data/github/ufc_fight_results.csv', index=False)
    print('✓ Saved cleaned fight results to ./data/github/ufc_fight_results.csv')
    # events_df already carries DATE_PARSED, so the filtered view can be used
    # directly; re-assigning the column on a slice would raise SettingWithCopyWarning.
    fight_events_in_details = events_df[events_df['EVENT'].isin(fight_results_cleaned['EVENT'].unique())]
    if len(fight_events_in_details) > 0:
        earliest = fight_events_in_details['DATE_PARSED'].min()
        latest = fight_events_in_details['DATE_PARSED'].max()
        print(f'New date range: {earliest.strftime("%Y-%m-%d")} to {latest.strftime("%Y-%m-%d")}')
        before_2010_remaining = fight_events_in_details[fight_events_in_details['DATE_PARSED'] < cutoff_date]
        if len(before_2010_remaining) == 0:
            print('✅ SUCCESS: All events before 2010 have been removed!')
        else:
            print(f'⚠️ WARNING: {len(before_2010_remaining)} events before 2010 still remain')
else:
    print('❌ No matching events found - the fight results may already be filtered to post-2010 events')
    
    

# Delete all fighters before 2010 in fighters.csv
#
# Keeps only fighters whose name (case-insensitive) appears in the BOUT column
# of the current ufc_fight_results.csv, then overwrites ./data/fighters.csv.
print('=== FILTERING FIGHTER INFO TO MATCH POST-2010 FIGHTS ===')
fight_results_df = pd.read_csv('./data/github/ufc_fight_results.csv')
fighter_info_df = pd.read_csv('./data/fighters.csv')
print(f'Fight results count: {len(fight_results_df)}')
print(f'Fighter info count: {len(fighter_info_df)}')
all_fighters_in_bouts = []
# dropna(): a missing BOUT is read as NaN (a float) and would otherwise raise
# TypeError on the string operations below.
for bout in fight_results_df['BOUT'].dropna():
    fighters = bout.split(' vs. ')
    # Only bouts of the exact form "A vs. B" contribute names.
    if len(fighters) == 2:
        all_fighters_in_bouts.extend([fighters[0].strip(), fighters[1].strip()])
unique_fighters_post_2010 = set(all_fighters_in_bouts)
print(f'Unique fighters in post-2010 fights: {len(unique_fighters_post_2010)}')
unique_fighters_lower = {name.lower() for name in unique_fighters_post_2010}
# Compare through a temporary Series rather than adding and then dropping a
# helper column on the loaded frame.
is_post_2010_fighter = fighter_info_df['name'].str.lower().isin(unique_fighters_lower)
fighter_info_filtered = fighter_info_df[is_post_2010_fighter]
print(f'Fighters before filtering: {len(fighter_info_df)}')
print(f'Fighters after filtering: {len(fighter_info_filtered)}')
print(f'Fighters removed: {len(fighter_info_df) - len(fighter_info_filtered)}')
fighter_info_filtered.to_csv('./data/fighters.csv', index=False)
print('✓ Saved filtered fighter info to ./data/fighters.csv')
# Guard against an empty fighters.csv so the report cannot raise ZeroDivisionError.
retention_pct = len(fighter_info_filtered) / len(fighter_info_df) * 100 if len(fighter_info_df) else 0.0
print(f'Retention rate: {retention_pct:.1f}%')



In [None]:
# Remove all women fighters from fighters.csv
#
# A fighter is removed when their name (case-insensitive) appears in any bout
# whose WEIGHTCLASS contains "Women's". The unfiltered table is backed up first.

print('\n=== REMOVING WOMEN FIGHTERS FROM FIGHTER INFO ===')
fight_results_df = pd.read_csv('./data/github/ufc_fight_results.csv')
fighter_info_df = pd.read_csv('./data/fighters.csv')
print(f'Original fighter info count: {len(fighter_info_df)}')
# Compute the women's-division mask once and reuse it for both reports below.
is_womens_fight = fight_results_df['WEIGHTCLASS'].str.contains('Women\'s', na=False)
womens_fights = fight_results_df[is_womens_fight]
womens_weight_classes = womens_fights['WEIGHTCLASS'].unique()
print(f'Women\'s weight classes found: {len(womens_weight_classes)}')
for wc in womens_weight_classes:
    print(f'  - {wc}')
women_fighters_in_bouts = []
print(f'Total women\'s fights found: {len(womens_fights)}')
# dropna(): a missing BOUT is read as NaN (a float) and would otherwise raise
# TypeError on the string operations below.
for bout in womens_fights['BOUT'].dropna():
    fighters = bout.split(' vs. ')
    # Only bouts of the exact form "A vs. B" contribute names.
    if len(fighters) == 2:
        women_fighters_in_bouts.extend([fighters[0].strip(), fighters[1].strip()])
unique_women_fighters = set(women_fighters_in_bouts)
print(f'Unique women fighters found: {len(unique_women_fighters)}')
women_fighters_lower = {name.lower() for name in unique_women_fighters}
is_woman_fighter = fighter_info_df['name'].str.lower().isin(women_fighters_lower)
fighter_info_men_only = fighter_info_df[~is_woman_fighter]
removed_count = len(fighter_info_df) - len(fighter_info_men_only)
print(f'Fighters before removing women: {len(fighter_info_df)}')
print(f'Fighters after removing women: {len(fighter_info_men_only)}')
print(f'Women fighters removed: {removed_count}')
# Back up BEFORE overwriting fighters.csv. On a re-run where nothing is removed,
# fighters.csv is already filtered, so an existing backup is left alone instead
# of being replaced with filtered data.
backup_path = './data/fighter_info_with_women_backup.csv'
if removed_count > 0 or not os.path.exists(backup_path):
    fighter_info_df.to_csv(backup_path, index=False)
    print('✓ Created backup with women fighters at ./data/fighter_info_with_women_backup.csv')
else:
    print('Backup left untouched: no fighters were removed on this run')
fighter_info_men_only.to_csv('./data/fighters.csv', index=False)
print('✓ Saved men-only fighter info to ./data/fighters.csv')
# Guard against an empty fighters.csv so the report cannot raise ZeroDivisionError.
retention_pct = len(fighter_info_men_only) / len(fighter_info_df) * 100 if len(fighter_info_df) else 0.0
print(f'Retention rate: {retention_pct:.1f}%')
print(f'\nSample of removed women fighters:')
# sorted() makes the sample deterministic; plain set iteration order varies between runs.
removed_women = sorted(unique_women_fighters)[:10]
for fighter in removed_women:
    print(f'  - {fighter}')
if len(unique_women_fighters) > 10:
    print(f'  ... and {len(unique_women_fighters) - 10} more')
print('\n✅ SUCCESS: All women fighters have been removed from fighters.csv!')


In [None]:
# Check for missing values in fighters.csv and remove every row with any missing value
#
# Rows missing `reach` are reported and dropped first, then any row that still
# has a missing value in any column is dropped. The result overwrites fighters.csv.

print('\n=== CHECKING FOR MISSING VALUES IN FIGHTER INFO ===')
fighter_info_df = pd.read_csv('./data/fighters.csv')
print(f'Total fighters before cleaning: {len(fighter_info_df)}')
missing_values = fighter_info_df.isna().sum()
missing_df = pd.DataFrame({
    'Missing Values': missing_values
})
print('\nMissing values by column:')
print(missing_df)
empty_strings = (fighter_info_df == '').sum()
if empty_strings.sum() > 0:
    print('\nEmpty strings by column:')
    print(empty_strings[empty_strings > 0])
else:
    # This branch means no cell equals '', so the message must say that.
    print('\nNo empty strings found in any column.')

# Step 1: report and drop rows with no reach value.
print('\nRows with missing reach values:')
missing_reach_df = fighter_info_df[fighter_info_df['reach'].isna()]
print(missing_reach_df)
print(f'Total rows with missing reach: {len(missing_reach_df)}')
fighter_info_df = fighter_info_df.dropna(subset=['reach'])
print(f'\nTotal fighters after removing rows with missing reach: {len(fighter_info_df)}')
print(f'Removed {len(missing_reach_df)} fighters with missing reach data')
print('\nMissing values in each column after cleaning:')
print(fighter_info_df.isna().sum())

# Step 2: report and drop rows with a missing value in any remaining column.
print('\nRows with missing values:')
missing_rows = fighter_info_df[fighter_info_df.isna().any(axis=1)]
print(missing_rows.to_string())
print(f'Total rows with missing values: {len(missing_rows)}')
fighter_info_df = fighter_info_df.dropna()

# Verify before saving and before announcing success.
remaining_missing = int(fighter_info_df.isna().any(axis=1).sum())
assert remaining_missing == 0, f'{remaining_missing} rows still contain missing values'
# Written once, after both steps, instead of saving an intermediate file that
# is immediately overwritten.
fighter_info_df.to_csv('./data/fighters.csv', index=False)
print(f'\nRemoved all rows with any missing values. Final count: {len(fighter_info_df)}')
print('✓ Saved final cleaned fighter info to ./data/fighters.csv')
print(f'Total rows with missing values: {remaining_missing}')
print('\n✅ Missing values analysis complete and all rows with missing values removed!')

In [None]:
# Create clean fights.csv master file from fight results data
#
# Drops women's-division fights, splits each "A vs. B" bout into FIGHTER_1 and
# FIGHTER_2 columns, and writes the selected columns to ./data/fights.csv.

print('=== CREATING CLEAN FIGHTS.CSV MASTER FILE ===')
fight_results_df = pd.read_csv('./data/github/ufc_fight_results.csv')
print(f'Original fight results count: {len(fight_results_df)}')
print('\n--- Removing Women\'s Fights ---')
# Compute the women's-division mask once and use it for both subsets.
is_womens_fight = fight_results_df['WEIGHTCLASS'].str.contains('Women\'s', na=False)
womens_fights = fight_results_df[is_womens_fight]
print(f'Women\'s fights to remove: {len(womens_fights)}')
fights_men_only = fight_results_df[~is_womens_fight]
print(f'Fights after removing women: {len(fights_men_only)}')
print(f'Women\'s fights removed: {len(fight_results_df) - len(fights_men_only)}')
print('\n--- Extracting Fighter Names ---')
fights_clean = fights_men_only.copy()
print('Extracting fighter names from BOUT column...')


def _split_bout(bout):
    """Return (fighter_1, fighter_2) for a bout of the form 'A vs. B', else ('', '')."""
    # A missing BOUT is read as NaN (a float); treat it like any unparseable bout.
    if isinstance(bout, str):
        fighters = bout.split(' vs. ')
        if len(fighters) == 2:
            return fighters[0].strip(), fighters[1].strip()
    return '', ''


# Build both columns in one pass instead of one .loc assignment per row.
fighter_pairs = [_split_bout(bout) for bout in fights_clean['BOUT']]
fights_clean['FIGHTER_1'] = [pair[0] for pair in fighter_pairs]
fights_clean['FIGHTER_2'] = [pair[1] for pair in fighter_pairs]
final_columns = [
    'EVENT', 'BOUT', 'FIGHTER_1', 'FIGHTER_2', 'OUTCOME', 
    'WEIGHTCLASS', 'METHOD', 'ROUND', 'TIME', 'TIME FORMAT', 
    'REFEREE', 'DETAILS', 'URL'
]
# The original referenced `fights_final`, which is never defined in this
# notebook and raised NameError; `fights_clean` is the frame built above.
fights_master = fights_clean[final_columns].copy()
fights_master = fights_master.sort_values(['EVENT', 'BOUT']).reset_index(drop=True)
fights_master.to_csv('./data/fights.csv', index=False)
print(f'✓ Saved {len(fights_master)} fights to ./data/fights.csv')


In [None]:
## Summary statistics and verification
#print('\n=== FIGHTS.CSV MASTER FILE SUMMARY ===')
#print(f'Total fights in master file: {len(fights_master)}')
#print(f'Original fight results: {len(fight_results_df)}')
#print(f'Fights removed: {len(fight_results_df) - len(fights_master)}')
#print(f'Retention rate: {len(fights_master)/len(fight_results_df)*100:.1f}%')
#print('\nWeight class distribution:')
#print(fights_master['WEIGHTCLASS'].value_counts().head(10))
#print('\nMethod distribution:')  
#print(fights_master['METHOD'].value_counts().head(10))
#print('\nOutcome distribution:')
#print(fights_master['OUTCOME'].value_counts())
#print('\nDate range verification:')
#unique_events = fights_master['EVENT'].nunique()
#print(f'Total unique events: {unique_events}')
#events_df = pd.read_csv('./data/github/ufc_event_details.csv')
#events_df['DATE_PARSED'] = pd.to_datetime(events_df['DATE'], format='%B %d, %Y')
#fight_events = events_df[events_df['EVENT'].isin(fights_master['EVENT'].unique())]
#if len(fight_events) > 0:
#    earliest = fight_events['DATE_PARSED'].min()
#    latest = fight_events['DATE_PARSED'].max()
#    print(f'Event date range: {earliest.strftime("%Y-%m-%d")} to {latest.strftime("%Y-%m-%d")}')
#    pre_2010 = fight_events[fight_events['DATE_PARSED'] < '2010-01-01']
#    if len(pre_2010) == 0:
#        print('✅ Confirmed: No pre-2010 events in dataset')
#    else:
#        print(f'⚠️ Warning: {len(pre_2010)} pre-2010 events found')
#print('\n✅ SUCCESS: Clean fights.csv master file created!')
#print(f'File location: ./data/fights.csv')
#print(f'Total fights: {len(fights_master)}')
#print(f'Total columns: {len(fights_master.columns)}')
#print('\n=== SAMPLE DATA (First 5 fights) ===')
#print(fights_master.head().to_string())