In [None]:
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

## Initialize Lists and Parameters

We set up:
- `pages`: the number of completed-event listing pages to scrape from [ufcstats.com](http://ufcstats.com).
- `event_links`: an empty list that will hold one entry per event URL found.
- `fights`: an empty list that will hold one dictionary of parsed details per fight.

In [None]:
# Crawl configuration and result containers. These are module-level names:
# UfcSpider reads `pages` and appends to `event_links` and `fights`.
pages = 1  # Number of listing pages to request (pages 1..pages inclusive)
event_links = []  # One {"event_link": url} dict per event link found
fights = []  # One dict of parsed fields per fight page

## Define the Scrapy Spider

This class, `UfcSpider`, handles the logic for:
1. Navigating through the UFCStats pages.
2. Extracting event links.
3. Visiting each event to get fight URLs, dates, and locations.
4. Finally, parsing detailed fight stats (fighter names, round, result, etc.).

In [None]:
class UfcSpider(scrapy.Spider):
    """Crawl ufcstats.com completed events and collect per-fight details.

    The crawl runs in three hops: event listing pages -> event pages ->
    fight pages. Results are not yielded as Scrapy items; they are appended
    to the module-level ``event_links`` and ``fights`` lists, and the number
    of listing pages requested comes from the module-level ``pages`` value.

    A field whose selector matches nothing (or too few nodes) is stored as
    ``None`` instead of raising, so one incomplete page does not abort the
    callback and drop the whole event or fight.
    """

    name = "ufc_spider"  # Name of the spider

    @staticmethod
    def _nth(values, index, default=None):
        """Return ``values[index]``, or ``default`` if the list is too short."""
        return values[index] if len(values) > index else default

    @staticmethod
    def _pair(values, default=None):
        """Return the first two items of ``values``, padded to exactly two."""
        head = list(values[:2])
        return head + [default] * (2 - len(head))

    def start_requests(self):
        """
        Start by sending requests to the UFC statistics events page for each page.
        """
        for p in range(1, pages + 1):
            url = f"http://ufcstats.com/statistics/events/completed?page={p}"
            yield scrapy.Request(url=url, callback=self.parse_main)

    def parse_main(self, response):
        """
        Extract event links from the page and follow them.

        Every link is also recorded in the module-level ``event_links`` list.
        """
        event_links_on_page = response.css(
            "a.b-link.b-link_style_black::attr(href)"
        ).getall()

        for e in event_links_on_page:
            event_links.append({"event_link": e})  # Store event link
            yield response.follow(url=e, callback=self.parse_events)

    def parse_events(self, response):
        """
        Extract fight links, date, and location from the event page.
        Follow each fight link to extract fight details.

        The date and location are passed to ``parse_fights`` through the
        request ``meta``; either is ``None`` when it cannot be found.
        """
        fight_links = response.css("a.b-flag.b-flag_style_green::attr(href)").getall()

        # The value is taken from the second text node of each list item.
        # NOTE(review): presumably the first text node is whitespace before
        # the label element -- confirm against the page markup.
        date = self._nth(
            response.css("li.b-list__box-list-item:nth-child(1)::text").getall(), 1
        )
        location = self._nth(
            response.css("li.b-list__box-list-item:nth-child(2)::text").getall(), 1
        )
        if date is None or location is None:
            self.logger.warning("Missing date/location on %s", response.url)

        for f in fight_links:
            yield response.follow(
                url=f,
                callback=self.parse_fights,
                meta={"date": date, "location": location},
            )

    def parse_fights(self, response):
        """
        Extract fight details such as fighters, results, and statistics.

        Appends one dict per fight page to the module-level ``fights`` list.
        Missing fields are stored as ``None``.
        """
        date = response.meta["date"]  # Extract date from metadata
        location = response.meta["location"]  # Extract location from metadata

        # Extract fighter details (first two matches = the two fighters)
        fighter_details = response.css("div.b-fight-details__person")
        win_loss_1, win_loss_2 = self._pair(
            fighter_details.css("i.b-fight-details__person-status::text").getall()
        )
        name_1, name_2 = self._pair(fighter_details.css("h3 > a::text").getall())
        stage_name_1, stage_name_2 = self._pair(
            fighter_details.css("p.b-fight-details__person-title::text").getall()
        )
        if name_1 is None or name_2 is None:
            self.logger.warning("Incomplete fighter data on %s", response.url)

        # Extract fight details; the labelled fields keep their value in the
        # second text node of the matched element.
        fight_details = response.css("div.b-fight-details__content")
        method = fight_details.css(
            "p:nth-child(1) > i.b-fight-details__text-item_first > i:nth-child(2)::text"
        ).get()
        round_num = self._nth(
            fight_details.css("p:nth-child(1) > i:nth-child(2)::text").getall(), 1
        )
        time = self._nth(
            fight_details.css("p:nth-child(1) > i:nth-child(3)::text").getall(), 1
        )
        time_format = self._nth(
            fight_details.css("p:nth-child(1) > i:nth-child(4)::text").getall(), 1
        )
        referee = fight_details.css(
            "p:nth-child(1) > i:nth-child(5) > span::text"
        ).get()
        details = self._nth(fight_details.css("p:nth-child(2)::text").getall(), 1)
        if details is not None:
            details = details.strip()

        # Extract fight statistics; the table may be absent, in which case
        # every stat below is None.
        stats_table = response.css(
            "body > section > div > div > section:nth-child(4) > table > tbody"
        )
        kd_1, kd_2 = self._pair(stats_table.css("td:nth-child(2) p::text").getall())
        sig_str_1, sig_str_2 = self._pair(
            stats_table.css("td:nth-child(3) p::text").getall()
        )
        total_str_1, total_str_2 = self._pair(
            stats_table.css("td:nth-child(5) p::text").getall()
        )

        # Append all extracted fight details to the fights list
        fights.append(
            {
                "fight_link": response.url,
                "date": date,
                "location": location,
                "method": method,
                "round": round_num,
                "time": time,
                "time_format": time_format,
                "referee": referee,
                "details": details,
                "name_1": name_1,
                "name_2": name_2,
                "stage_name_1": stage_name_1,
                "stage_name_2": stage_name_2,
                "win_loss_1": win_loss_1,
                "win_loss_2": win_loss_2,
                "kd_1": kd_1,
                "kd_2": kd_2,
                "sig_str_1": sig_str_1,
                "sig_str_2": sig_str_2,
                "total_str_1": total_str_1,
                "total_str_2": total_str_2,
            }
        )

## Run the Scrapy Spider

Create a `CrawlerProcess` instance, add our spider, and start the crawling process.
This begins the full web-scraping routine.

In [None]:
# Run UfcSpider; its callbacks fill the module-level `event_links` and
# `fights` lists as pages are parsed.
process = CrawlerProcess()
process.crawl(UfcSpider)
# NOTE(review): per the Scrapy docs, start() runs the Twisted reactor and
# returns once crawling has finished; the reactor cannot be restarted, so
# re-running this cell needs a fresh kernel -- confirm for your Scrapy version.
process.start()

## Convert the Results to DataFrames

After scraping, we have two lists: `event_links` and `fights`.
Here we create pandas DataFrames for further processing and analysis.

In [None]:
# Build one row per collected dict. Both frames are empty if the crawl
# collected nothing.
event_links_df = pd.DataFrame(event_links)
fights_df = pd.DataFrame(fights)

## Clean the DataFrame Strings

We apply a `lambda` function element-wise to **strip leading and trailing whitespace** from every string value in `fights_df`; non-string values are left unchanged.

In [None]:
# Strip leading/trailing whitespace from every string cell; other values pass
# through unchanged. DataFrame.applymap is deprecated since pandas 2.1 in
# favour of DataFrame.map, so use whichever this pandas version provides.
_elementwise = fights_df.map if hasattr(pd.DataFrame, "map") else fights_df.applymap
fights_df = _elementwise(
    lambda value: value.strip() if isinstance(value, str) else value
)

## Specify the Output Directory

Here we define where we want to save our CSV files.
Make sure the path is correct for your system or adjust it as necessary.

In [None]:
# Machine-specific absolute Windows directory for the CSV exports; change it
# before running on another machine. It ends with a path separator, so a file
# name can be appended to it directly.
save_path = "C:\\Users\\ahlaw\\OneDrive - UBC\\Documents\\vscode\\Projects\\UFC_data_webscraping\\Data\\Raw\\"

## Save DataFrames as CSV Files

Finally, we export both DataFrames to CSV.
These files will appear in the specified `save_path` directory.

In [None]:
import os

# Create the target directory first so to_csv does not fail on a missing
# folder, and build the paths with os.path.join so save_path works with or
# without a trailing separator. An empty save_path means "current directory".
if save_path:
    os.makedirs(save_path, exist_ok=True)
event_links_df.to_csv(os.path.join(save_path, "event_links.csv"), index=False)
fights_df.to_csv(os.path.join(save_path, "fight_details.csv"), index=False)

## Display Success Message

A quick confirmation that everything ran smoothly.

In [None]:
# Printed unconditionally whenever this cell runs; it does not itself check
# that the CSV files exist.
print("DataFrames saved successfully in:", save_path)