# Task 1 - Collecting a list of episodes

In [84]:
import requests
from scrapy import Selector
import json
from parsel import Selector
import pandas as pd
import datetime
import re

base_url = str('https://gossipgirl.fandom.com/wiki/Gossip_Girl_Wiki')
response = requests.get(base_url)
sel = Selector(text=response.text)
tv_show = sel.xpath("//title/text()").get()



def get_season_links(base_url):
    seasons_list = []

    # URL of the page to scrape


    # Send a GET request to the URL
    response = requests.get(base_url)

    # Initialize a Selector instance with the response body
    sel = Selector(text=response.text)

    # Use XPath to select the ul element with the specified class
    ul_element = sel.xpath('body/div[5]/div[4]/div[2]/header/nav/ul/li[3]/div[2]/ul/li[6]/div/ul')

    # Use XPath to select all li elements within the ul element
    li_elements = ul_element.xpath('.//li')

    # Iterate over each li element and extract its URL and season number
    seasons_list = [{'season_num': int(li.xpath('.//span/text()').get().strip().split()[-1]),
                     'season_url': li.xpath('.//a/@href').get()} 
                    for li in li_elements]

    return seasons_list

seasons_data = get_season_links(base_url)
print(seasons_data)

[{'season_num': 1, 'season_url': 'https://gossipgirl.fandom.com/wiki/Season_1'}, {'season_num': 2, 'season_url': 'https://gossipgirl.fandom.com/wiki/Season_2'}, {'season_num': 3, 'season_url': 'https://gossipgirl.fandom.com/wiki/Season_3'}, {'season_num': 4, 'season_url': 'https://gossipgirl.fandom.com/wiki/Season_4'}, {'season_num': 5, 'season_url': 'https://gossipgirl.fandom.com/wiki/Season_5'}, {'season_num': 6, 'season_url': 'https://gossipgirl.fandom.com/wiki/Season_6'}]


In [92]:
import requests
from scrapy import Selector
import pandas as pd

def scrape_links_from_season_page(season_url, tv_show, season_num):
    links_data = []

    try:
        # Send a GET request to the season URL
        response = requests.get(season_url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Initialize a Selector instance with the response body
        sel = Selector(text=response.text)

        # Find the starting point for scraping text
        start_tag = sel.xpath('//b').get()
        start_index = response.text.find(start_tag) + len(start_tag)

        # Create a new Selector starting from the <b> tag
        sel = Selector(text=response.text[start_index:])

        # Select all paragraph elements containing hyperlinks
        paragraphs = sel.xpath('//p')

        # Iterate over each paragraph element and extract link information
        for paragraph_id, paragraph in enumerate(paragraphs, start=1):
            # Select all hyperlinks within the paragraph
            links = paragraph.xpath('.//a')

            # Iterate over each link in the paragraph
            for link in links:
                # Extract link title using adjusted XPath expression
                link_title = link.xpath('./text() | ./descendant-or-self::*/text()').get()
                link_url = link.xpath('./@href').get()
                link_url = base_url + link_url

                # Append link information to the list
                links_data.append({
                    'tv_show': tv_show,
                    'season_num': season_num,
                    'paragraph_id': paragraph_id,
                    'link_title': link_title.strip() if link_title else None,
                    'link_url': link_url
                })

    except Exception as e:
        print(f"Error scraping links from {season_url}: {e}")

    return links_data

# Initialize an empty list to store links data for all seasons
all_links_data = []
for season_data in seasons_data:
    season_url = season_data['season_url']
    season_num = season_data['season_num']
    
    # Call the scrape_links_from_season_page function with the current season data
    links_data = scrape_links_from_season_page(season_url, tv_show, season_num)
    
    all_links_data.extend(links_data)

# Create a DataFrame from the accumulated links data
df_links = pd.DataFrame(all_links_data)

# Save the DataFrame to a CSV file
file_path = '../task2.csv'
df_links.to_csv(file_path, index=False)



In [86]:
def get_episode_data(season_url):
    all_episodes = {
        'episode_num': [],
        'episode_url': [],
        'episode_title': [],
        'air_date': []
    }
    
    try:
        tables = pd.read_html(season_url, extract_links='body')
        for table in tables:
            if 'Title' in table.columns and 'Airdate' in table.columns:
                skip_next_row = False
                for index, row in table.iterrows():
                    if skip_next_row:
                        skip_next_row = False
                        continue
                    if row['Title'] == 'Title':
                        skip_next_row = True
                        continue
                    episode_num = int(row['#'][0])
                    episode_title = row['Title'][0] 
                    air_date_str = row['Airdate'][0] 
                    if re.match(r"^\w+\s\d{1,2},\s\d{4}$", air_date_str):
                        air_date = datetime.strptime(air_date_str, '%B %d, %Y').date()
                    else:
                        air_date = air_date_str
                    episode_url = row['Title'][1] 
                    if episode_url is not None:
                        episode_url = base_url + episode_url
                    all_episodes['episode_num'].append(episode_num)
                    all_episodes['episode_url'].append(episode_url)
                    all_episodes['episode_title'].append(episode_title)
                    all_episodes['air_date'].append(air_date)
                all_episodes = {key: value[::2] for key, value in all_episodes.items()}
    except Exception as e:
        print(f"Error getting episode data for {season_url}: {e}")
    
    return all_episodes



In [89]:
# Create an initial data frame with the seasons' links
df = pd.DataFrame.from_dict(get_season_links(base_url))

# Add the TV show name to the data frame
tv_show = 'gossip girl'
df['tv_show'] = tv_show


# Create a new column with all episode information
df['episode_data'] = df['season_url'].apply(get_episode_data)


df = (
    pd.json_normalize(df['episode_data'])
    .join(df.drop(columns='episode_data'))
    .explode(['episode_num', 'episode_url', 'episode_title', 'air_date'])
)

# Re-order the columns
ordered_columns = ['tv_show', 'season_num', 'season_url',
                   'episode_num', 'episode_url', 
                   'episode_title', 'air_date']
df = df[ordered_columns].copy()
display(df)
print(df)

file_path = '../data.frame.csv'
df.to_csv(file_path, index=False)

Error getting episode data for https://gossipgirl.fandom.com/wiki/Season_1: module 'datetime' has no attribute 'strptime'
Error getting episode data for https://gossipgirl.fandom.com/wiki/Season_2: module 'datetime' has no attribute 'strptime'
Error getting episode data for https://gossipgirl.fandom.com/wiki/Season_3: module 'datetime' has no attribute 'strptime'
Error getting episode data for https://gossipgirl.fandom.com/wiki/Season_4: module 'datetime' has no attribute 'strptime'
Error getting episode data for https://gossipgirl.fandom.com/wiki/Season_5: module 'datetime' has no attribute 'strptime'
Error getting episode data for https://gossipgirl.fandom.com/wiki/Season_6: module 'datetime' has no attribute 'strptime'


Unnamed: 0,tv_show,season_num,season_url,episode_num,episode_url,episode_title,air_date
0,gossip girl,1,https://gossipgirl.fandom.com/wiki/Season_1,,,,
1,gossip girl,2,https://gossipgirl.fandom.com/wiki/Season_2,,,,
2,gossip girl,3,https://gossipgirl.fandom.com/wiki/Season_3,,,,
3,gossip girl,4,https://gossipgirl.fandom.com/wiki/Season_4,,,,
4,gossip girl,5,https://gossipgirl.fandom.com/wiki/Season_5,,,,
5,gossip girl,6,https://gossipgirl.fandom.com/wiki/Season_6,,,,


       tv_show  season_num                                   season_url  \
0  gossip girl           1  https://gossipgirl.fandom.com/wiki/Season_1   
1  gossip girl           2  https://gossipgirl.fandom.com/wiki/Season_2   
2  gossip girl           3  https://gossipgirl.fandom.com/wiki/Season_3   
3  gossip girl           4  https://gossipgirl.fandom.com/wiki/Season_4   
4  gossip girl           5  https://gossipgirl.fandom.com/wiki/Season_5   
5  gossip girl           6  https://gossipgirl.fandom.com/wiki/Season_6   

  episode_num episode_url episode_title air_date  
0         NaN         NaN           NaN      NaN  
1         NaN         NaN           NaN      NaN  
2         NaN         NaN           NaN      NaN  
3         NaN         NaN           NaN      NaN  
4         NaN         NaN           NaN      NaN  
5         NaN         NaN           NaN      NaN  


 #  Task 2: Collecting important info from episodes

###    note: the task asks to scrape important info from each synopsis, however my page only contains links in its 'season synopses' - as advised by staff, my task 2 scrapes these links

In [78]:
import requests
from scrapy import Selector
import pandas as pd

def scrape_links_from_season_page(season_url, tv_show, season_num):
    links_data = []

    try:
        # Send a GET request to the season URL
        response = requests.get(season_url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Initialize a Selector instance with the response body
        sel = Selector(text=response.text)

        # Find the starting point for scraping text
        start_tag = sel.xpath('//b').get()
        start_index = response.text.find(start_tag) + len(start_tag)

        # Create a new Selector starting from the <b> tag
        sel = Selector(text=response.text[start_index:])

        # Select all paragraph elements containing hyperlinks
        paragraphs = sel.xpath('//p')

        # Iterate over each paragraph element and extract link information
        for paragraph_id, paragraph in enumerate(paragraphs, start=1):
            # Select all hyperlinks within the paragraph
            links = paragraph.xpath('.//a')

            # Iterate over each link in the paragraph
            for link in links:
                # Extract link title using adjusted XPath expression
                link_title = link.xpath('./text() | ./descendant-or-self::*/text()').get()
                link_url = link.xpath('./@href').get()
                link_url = base_url + link_url

                # Append link information to the list
                links_data.append({
                    'tv_show': tv_show,
                    'season_num': season_num,
                    'paragraph_id': paragraph_id,
                    'link_title': link_title.strip() if link_title else None,
                    'link_url': link_url
                })

    except Exception as e:
        print(f"Error scraping links from {season_url}: {e}")

    return links_data

# Initialize an empty list to store links data for all seasons
all_links_data = []
for season_data in seasons_data:
    season_url = season_data['season_url']
    season_num = season_data['season_num']
    
    links_data = scrape_links_from_season_page(season_url, tv_show, season_num)
    
    all_links_data.extend(links_data)

# Create a DataFrame from the accumulated links data
df_links = pd.DataFrame(all_links_data)

# Save the DataFrame to a CSV file
file_path = '//Users/amimai875/Documents/ds105/DS105W/ds105w-2024-w08-summative-amimai875/task2.csv'
df_links.to_csv(file_path, index=False)

