In [37]:
import pandas as pd
from lxml import html
import requests

# Function to extract data from a single page
def extract_data_from_page(page_number, timeout=30):
    """Scrape one page of the latlong.net TV-series index.

    Parameters
    ----------
    page_number : int
        Value substituted into the ``page`` query parameter of the index URL.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without a timeout a stalled connection would block
        the notebook indefinitely.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(names, full_urls)``. Both lists have the same length and entry
        ``i`` of each comes from the same ``<a>`` element. Both are empty
        when the page contains no matching links.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (connection error, timeout, ...).
    """
    page_url = 'https://www.latlong.net/tv-series-locations?page={}'
    base_url = 'https://www.latlong.net'

    # NOTE: the HTTP status is deliberately not checked here. The caller's
    # pagination loop stops when a page yields no matching anchors, so an
    # error page is parsed like any other page.
    response = requests.get(page_url.format(page_number), timeout=timeout)
    tree = html.fromstring(response.content)

    # Select the anchor elements once and read both fields from each element.
    # Querying `a/@href` and `a/text()` separately can return lists of
    # different lengths (an anchor without text, or with several text nodes),
    # which would silently pair show names with the wrong URLs.
    anchors = tree.xpath('/html/body/main/div[3]/div/div/a[@href]')

    names = []
    full_urls = []
    for anchor in anchors:
        names.append(anchor.text_content())
        # Construct the full URL from the site root and the anchor's href
        full_urls.append(f"{base_url}{anchor.get('href')}")

    return names, full_urls

# Function to extract location details from each TV show page
def extract_location_details(url):
    """Return the first HTML table found at ``url`` as a DataFrame.

    Parameters
    ----------
    url : str
        Address of a TV show page; passed straight to ``pandas.read_html``.

    Returns
    -------
    pandas.DataFrame
        The first table on the page, or an empty DataFrame when the page
        contains no tables.

    Raises
    ------
    ValueError
        Any ``ValueError`` from ``pandas.read_html`` other than the
        "No tables found" case is re-raised unchanged.
    """
    try:
        tables = pd.read_html(url)
    except ValueError as exc:
        # pandas reports a page without any <table> by raising
        # ValueError("No tables found") instead of returning an empty list.
        # Translate only that case into the empty frame this function
        # promises, so one table-less page does not abort a whole scrape.
        if 'No tables found' not in str(exc):
            raise
        return pd.DataFrame()

    return tables[0] if tables else pd.DataFrame()

# Main script to loop through pages and gather data
all_names = []
all_urls = []
seen_urls = set()  # every listing URL collected so far, used to detect repeated pages

page_number = 1
while True:
    names, urls = extract_data_from_page(page_number)

    # If no more names or urls are found, break the loop
    if not names or not urls:
        break

    # Safety stop: a page that contributes no URL we have not already seen
    # means the site is repeating content for this page number. Without this
    # check the loop would never terminate in that situation.
    if seen_urls.issuperset(urls):
        break
    seen_urls.update(urls)

    all_names.extend(names)
    all_urls.extend(urls)

    page_number += 1

# Correct the URLs
corrected_urls = [url.replace('tv-series-locations/location', 'location') for url in all_urls]

# Create a DataFrame with the basic information
data = {'TV Show Name': all_names, 'TV Show Page Link': corrected_urls}
df = pd.DataFrame(data)

# Loop through each TV show page and extract location details
all_location_data = []

# Iterate over the two columns directly; building a Series per row with
# iterrows() is unnecessary when only these two values are needed.
for tv_show_name, tv_show_url in zip(df['TV Show Name'], df['TV Show Page Link']):
    location_df = extract_location_details(tv_show_url)

    # A show page without a table contributes no rows; skip it so that
    # empty frames are not fed into the concatenation below.
    if location_df.empty:
        continue

    # Tag every location row with the show it belongs to
    location_df['TV Show Name'] = tv_show_name
    location_df['TV Show Page Link'] = tv_show_url

    all_location_data.append(location_df)

# Combine all location data into a single DataFrame.
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame that still carries the two show columns when nothing was collected.
if all_location_data:
    final_df = pd.concat(all_location_data, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=['TV Show Name', 'TV Show Page Link'])

# Display the final DataFrame
print(final_df)


                         Location Name   Latitude   Longitude  \
0                         1320 Ohio St  38.107109 -122.244179   
1                       231 Bayview St  37.967632 -122.534531   
2                      415 Virginia St  38.101749 -122.256142   
3                       419 Georgia St  38.100761 -122.255974   
4     Analy High School (Liberty High)  38.407238 -122.826286   
...                                ...        ...         ...   
5977                       North Beach  37.805542 -122.411461   
5978                          Richmond  49.165783 -123.136826   
5979                   The Embarcadero  37.796528 -122.403854   
5980                         Vancouver  49.278507 -123.126572   
5981         Whites Studios Copperwood  49.124706 -123.102264   

                       TV Show Name  \
0                    13 Reasons Why   
1                    13 Reasons Why   
2                    13 Reasons Why   
3                    13 Reasons Why   
4                    13 

In [39]:
# Save the combined location table as CSV. The path is relative, so the file
# is created in the kernel's current working directory. The DataFrame index
# is written as well (to_csv default), as the first, unnamed column.
final_df.to_csv(path_or_buf='tv_show_locations.csv')