# Set Up

- Installing required libraries/ packages

In [3]:
import os
import re
import requests
import json
import csv
import shutil
from scrapy import Selector
from urllib.parse import urlparse
from datetime import datetime

import numpy as np
import pandas as pd
from plotnine import *
from tqdm.notebook import tqdm

%matplotlib inline
%config InlineBackend.figure_formats = ['svg']



#  Generative AI Statement 

In compliance with the DS105W 🤖 Generative AI Policy, I declare that I have used:
- Chat GPT 3.5 and logs have been uploaded as for GenAIl project
- I have github co-pilot installed which was mainly used to brainstorm functions and chunks or code to get started
- Chat GPT was mainly used to trouble shoot errors, for instance:
    - When downloading packages to trouble queries
    - To use the type function and help identify the type for the object/ str etc
    - To help edit functions, e.g. using it to split episode and season numbers, or to scrape different headings, such that some series have synopsises, and if not others have plots to scrape from.

# Task 1

- This task will scrape the wiki fandom page


In [5]:
base_url = 'https://peaky-blinders.fandom.com'
response = requests.get(base_url, verify=True)

In [6]:
# Check the status code
if response.status_code == 200:
    'Everything is OK, status code:', response.status_code
    response.text[:100]
else:
    print('Something went wrong, status code:', response.status_code)


In [7]:
# Function to obtain all season numbers & URLs
def get_season_links(base_url):
    """
    This function returns a list of dictionaries with each season's number and URL.
    
    Args:
    - base_url (str): The base URL of the Peaky Blinders wiki fandom.
    
    Returns:
    - list: A list of dictionaries with each season's number and URL.
    """
    response = requests.get(base_url)
    sel = Selector(text=response.text)

    # Find the element containing the list of seasons
    ul_element = sel.xpath('//li[2]/div[2]/ul')
    # Extract individual list items (seasons)
    li_elements = ul_element.xpath('li[not(contains(.//span/text(), "Cast"))]')
    season_list = []

    # Iterate over each list item to extract season number and URL
    for li in li_elements:
        season_url = li.xpath('.//a/@href').get()
        season_str = li.xpath('.//span/text()').get()

        season_num = re.search(r'\d+', season_str).group()

        # Append season number and URL to the list
        season_list.append({'season_num': season_num, 'season_url': season_url})

    seasons_list = season_list[:6]
    return seasons_list

base_url = 'https://peaky-blinders.fandom.com/wiki/Peaky_Blinders_Wiki'
# Call the function to get season numbers and URLs
result = get_season_links(base_url)


In [8]:
def get_episode_data(season_url):
    """
    This function returns a single dictionary that represents all the episodes of a season.

    Args:
    - season_url (str): The URL of the season's wiki page.

    Returns:
    - dict: A single dictionary containing the episode number, title, air date, and URL of all episodes.
    """
    response = requests.get(season_url)
    sel = Selector(text=response.text)
    tr_elements = sel.xpath('//tr')

    episode_data = {
        'episode_num': [],
        'episode_title': [],
        'episode_url': [],
        'air_date': []
    }
    date_formats = ['%B %d, %Y', '%d %B %Y', '%d %B, %Y']
    base_url = 'https://peaky-blinders.fandom.com'

    for tr in tr_elements:
        episode_num = tr.xpath('./td[3]/text()').get()
        
        # Check if episode title exists in both formats
        episode_title_a = tr.xpath('./td[2]/a/b/text()').get()
        episode_title_b = tr.xpath('./td[2]/b/a/text()').get()
        episode_title = episode_title_a if episode_title_a else episode_title_b
        
        # Check if episode URL exists in both formats
        episode_url_a = tr.xpath('./td[2]/a/@href').get()
        episode_url_b = tr.xpath('./td[2]/b/a/@href').get()
        episode_url = episode_url_a if episode_url_a else episode_url_b
        
        air_date = tr.xpath('./td[4]/text()').get()
        
        if episode_num and episode_title and episode_url:
            formatted_date = None
            for date_format in date_formats:
                try:
                    formatted_date = datetime.strptime(air_date.strip(), date_format).strftime('%Y-%m-%d')
                    break
                except ValueError:
                    continue
            episode_num = episode_num.strip().split('.')[1] #stripping the season no to only extract episode no.
            episode_num = int(episode_num)  # Converts to integer
            full_episode_url = base_url + episode_url if episode_url else None #adding base url and prefix

            episode_data['episode_num'].append(episode_num)
            episode_data['episode_title'].append(episode_title)
            episode_data['episode_url'].append(full_episode_url)
            episode_data['air_date'].append(formatted_date)

    return episode_data


In [9]:
# Create an initial data frame with the seasons' links
df = pd.DataFrame.from_dict(get_season_links(base_url))

# Add the TV show name to the data frame
df['tv_show'] = 'Peaky Blinders'

# Create a new column with all episode information
df['episode_data'] = df['season_url'].apply(get_episode_data)

# Convert the episode_data column into a data frame 
# and join it with the original data frame
df = (
    pd.json_normalize(df['episode_data'])
    .join(df.drop(columns='episode_data'))
    .explode(['episode_num', 'episode_url', 'episode_title', 'air_date'])
)

# Re-order the columns
ordered_columns = ['tv_show', 'season_num', 'season_url',
                   'episode_num', 'episode_url', 
                   'episode_title', 'air_date']
df = df[ordered_columns].copy()

- Note: Seasons 1-4 Episodes are named 'Episode 1, Episode 2' etc whereas Seasons 5 and 6 episodes are for instance "Black Tuesday, or Noose".

In [10]:
#Saving DataFrame to CSV file
df.to_csv('../data/peaky_blinders_episodes.csv', index=False)
df.head()

Unnamed: 0,tv_show,season_num,season_url,episode_num,episode_url,episode_title,air_date
0,Peaky Blinders,1,https://peaky-blinders.fandom.com/wiki/Series_1,1,https://peaky-blinders.fandom.com/wiki/Episode...,Episode 1,2013-09-12
0,Peaky Blinders,1,https://peaky-blinders.fandom.com/wiki/Series_1,2,https://peaky-blinders.fandom.com/wiki/Episode...,Episode 2,2013-09-19
0,Peaky Blinders,1,https://peaky-blinders.fandom.com/wiki/Series_1,3,https://peaky-blinders.fandom.com/wiki/Episode...,Episode 3,2013-09-26
0,Peaky Blinders,1,https://peaky-blinders.fandom.com/wiki/Series_1,4,https://peaky-blinders.fandom.com/wiki/Episode...,Episode 4,2013-10-03
0,Peaky Blinders,1,https://peaky-blinders.fandom.com/wiki/Series_1,5,https://peaky-blinders.fandom.com/wiki/Episode...,Episode 5,2013-10-10


# TASK 2    

In [11]:
def scrape_peaky_blinders_synopses(episode_url):
    """
    Retrieves synopsis data for a given episode.
    Args:
    - episode_url (str): The URL of the episode's wiki page.

    Returns:
    - dict: A dictionary containing the synopsis data for the episode, 
            with keys 'paragraph_id', 'link_title', and 'link_url' (or None if not found).
    """
    response = requests.get(episode_url)
    sel = Selector(text=response.text)

    # Find the section containing the synopsis
    summary_heading = sel.xpath('//span[@id="Summary"]') or sel.xpath('//span[@id="Plot"]')
    cast_heading = sel.xpath('//span[@id="Cast"]')

    # If neither 'Summary' nor 'Plot' heading found, return None
    if not summary_heading or not cast_heading:
        print("Error: 'Summary' or 'Plot' heading not found.")
        return None

    # Extracting summary paragraphs
    summary_paragraphs_filtered = [
        paragraph 
        for paragraph in summary_heading.xpath('./ancestor::h2/following-sibling::p')
        if paragraph.xpath('.//span[@id="Cast"]/ancestor::h2') != cast_heading.xpath('./ancestor::h2')
    ]

    # Extracting link titles and URLs
    synopsis_data = {
        'paragraph_id': [],
        'link_title': [],
        'link_url': [],
    }
    base_url = 'https://peaky-blinders.fandom.com'
    for paragraph_id, paragraph in enumerate(summary_paragraphs_filtered, start=1):
        link_titles = paragraph.xpath('.//a[contains(@href, "/wiki/")]/text()').extract()
        link_urls = paragraph.xpath('.//a[contains(@href, "/wiki/")]/@href').extract()
        synopsis_data['paragraph_id'].extend([paragraph_id] * len(link_titles))
        full_episode_urls = [base_url + link_url if link_url else None for link_url in link_urls]
        synopsis_data['link_title'].extend(link_titles)
        synopsis_data['link_url'].extend(full_episode_urls)

    return synopsis_data


In [12]:
def get_season_links(base_url):
    """
    This function returns a dictionary with each season's URL and number as key-value pairs.
    Args:
    - base_url (str): The base URL of the Peaky Blinders wiki fandom.
    Returns:
    - dict: A dictionary with each season's URL and number.
    """
    response = requests.get(base_url)
    sel = Selector(text=response.text)

    # Find the element containing the list of seasons
    ul_element = sel.xpath('//li[2]/div[2]/ul')
    # Extract individual list items (seasons)
    li_elements = ul_element.xpath('li[not(contains(.//span/text(), "Cast"))]')
    
    # Use list comprehension to create the dictionary
    season_dict = {
        li.xpath('.//a/@href').get(): {'season_num': re.search(r'\d+', li.xpath('.//span/text()').get()).group()}
        for li in li_elements
    }
    return season_dict

In [13]:
def get_episode_data(season_url):
    response = requests.get(season_url)
    sel = Selector(text=response.text)
    tr_elements = sel.xpath('//tr')

    episode_data = []
    base_url = 'https://peaky-blinders.fandom.com'

    for tr in tr_elements:
        episode_num = tr.xpath('./td[3]/text()').get()
        
        # Check if episode URL exists in both formats
        episode_url_a = tr.xpath('./td[2]/a/@href').get()
        episode_url_b = tr.xpath('./td[2]/b/a/@href').get()
        episode_url = episode_url_a if episode_url_a else episode_url_b
        
        if episode_num and episode_url:
            episode_num = episode_num.strip().split('.')[1]
            episode_num = int(episode_num)  # Convert to integer
            full_episode_url = base_url + episode_url if episode_url else None
            synopsis_data = scrape_peaky_blinders_synopses(full_episode_url)
            if synopsis_data:
                # Pad the lists to ensure they are of equal length
                max_length = max(len(synopsis_data['paragraph_id']), len(synopsis_data['link_title']), len(synopsis_data['link_url']))
                synopsis_data['paragraph_id'] += [None] * (max_length - len(synopsis_data['paragraph_id']))
                synopsis_data['link_title'] += [None] * (max_length - len(synopsis_data['link_title']))
                synopsis_data['link_url'] += [None] * (max_length - len(synopsis_data['link_url']))
                # Create a DataFrame from the dictionary
                synopsis_data_df = pd.DataFrame(synopsis_data)
                synopsis_data_df['tv_show'] = 'Peaky Blinders'
                synopsis_data_df['season_num'] = season_url.split('_')[-1]
                synopsis_data_df['episode_num'] = episode_num
                episode_data.append(synopsis_data_df)

    if episode_data:
        return pd.concat(episode_data)
    else:
        return None

# Get the base URL
base_url = 'https://peaky-blinders.fandom.com/wiki/Peaky_Blinders_Wiki'

season_dict = get_season_links(base_url)

all_episode_data = [] # List to store all episode data
for season_url, season_info in season_dict.items():
    episode_data = get_episode_data(season_url)
    if episode_data is not None:
        all_episode_data.append(episode_data)


# Concatenate all episode data into a single DataFrame
final_episode_data = pd.concat(all_episode_data)
final_episode_data = final_episode_data[['tv_show', 'season_num', 'episode_num', 'paragraph_id', 'link_title', 'link_url']] #Reordering columns

final_episode_data['paragraph_id'] = final_episode_data['paragraph_id'].fillna(1)

final_episode_data['paragraph_id'] = final_episode_data['paragraph_id'].astype(int)
final_episode_data.to_csv('../data/peaky_blinders_synopsis.csv', index=False)
display(final_episode_data.head()) #head of the dataframe





Unnamed: 0,tv_show,season_num,episode_num,paragraph_id,link_title,link_url
0,Peaky Blinders,1,1,1,Shelbys,https://peaky-blinders.fandom.com/wiki/Shelby_...
1,Peaky Blinders,1,1,1,Arthur Shelby,https://peaky-blinders.fandom.com/wiki/Arthur_...
2,Peaky Blinders,1,1,2,church,https://peaky-blinders.fandom.com/wiki/Catholi...
3,Peaky Blinders,1,1,2,Polly Gray,https://peaky-blinders.fandom.com/wiki/Polly_Gray
4,Peaky Blinders,1,1,3,Grace Burgess,https://peaky-blinders.fandom.com/wiki/Grace_S...
