In [1]:
import pandas as pd
from lxml import html
import requests

# Site root (prepended to scraped hrefs) and the paginated listing URL;
# the '{}' placeholder in page_url is filled with the page number.
base_url = 'https://www.latlong.net'
page_url = 'https://www.latlong.net/movies-locations?page={}'

# Accumulators filled while paging through the listing
all_names, all_urls = [], []

# Function to extract data from a single page
def extract_data_from_page(page_number, timeout=30):
    """Fetch one listing page and return its movie names and absolute URLs.

    Args:
        page_number: Value substituted into the module-level ``page_url``
            template.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of hanging the crawl indefinitely.

    Returns:
        A ``(names, full_urls)`` tuple of two lists of equal length; entry
        ``i`` of each list comes from the same ``<a>`` element. Both lists
        are empty when no matching links are found on the page.
    """
    response = requests.get(page_url.format(page_number), timeout=timeout)
    tree = html.fromstring(response.content)

    names = []
    full_urls = []
    # Read the name and href from the same anchor element. Querying
    # 'a/@href' and 'a/text()' separately can yield lists of different
    # lengths (anchors with nested markup, no text, or several text nodes),
    # which would mispair names with URLs.
    for anchor in tree.xpath('/html/body/main/div[3]/div/div/a'):
        href = anchor.get('href')
        name = anchor.text_content().strip()
        if not href or not name:
            # Skip anchors that cannot provide both halves of the pair.
            continue
        names.append(name)
        # Construct the full URL by prefixing the site root.
        full_urls.append(f"{base_url}{href}")

    return names, full_urls

# Walk the paginated listing from page 1 and keep going for as long as a
# page yields both names and URLs; the first page missing either ends the walk.
page_number = 1
names, urls = extract_data_from_page(page_number)
while names and urls:
    all_names.extend(names)
    all_urls.extend(urls)

    page_number += 1
    names, urls = extract_data_from_page(page_number)

# Rewrite any 'movies-locations/location' segment in a URL to 'location'
corrected_urls = list(
    map(lambda link: link.replace('movies-locations/location', 'location'), all_urls)
)

# Base table: one row per movie with its name and page link
data = {'Movie Name': all_names, 'Movie Page Link': corrected_urls}
df = pd.DataFrame(data)

# Function to extract location details from each movie page
def extract_location_details(url):
    """Return the first HTML table found at ``url`` as a DataFrame.

    Any exception raised while reading is reported on stdout and an empty
    DataFrame is returned instead, so callers can test ``.empty`` rather
    than handle errors themselves.
    """
    try:
        parsed_tables = pd.read_html(url)
    except Exception as err:
        print(f"Error reading {url}: {err}")
        return pd.DataFrame()

    # Only the first table on the page is used.
    return parsed_tables[0] if parsed_tables else pd.DataFrame()

# Visit every movie page and collect its location table, tagging each row
# with the movie it came from
all_location_data = []

for movie_name, movie_url in zip(df['Movie Name'], df['Movie Page Link']):
    location_df = extract_location_details(movie_url)
    if location_df.empty:
        # Nothing usable came back for this page; move on to the next movie.
        continue

    location_df['Movie Name'] = movie_name
    location_df['Movie Page Link'] = movie_url
    all_location_data.append(location_df)

# Stack the per-movie tables into one frame, or fall back to an empty frame
# with the expected columns when nothing was collected
if not all_location_data:
    final_df = pd.DataFrame(columns=['Location Name', 'Latitude', 'Longitude', 'Movie Name', 'Movie Page Link'])
else:
    final_df = pd.concat(all_location_data, ignore_index=True)

# Show the combined result
print(final_df)


  from pandas.core import (


                                       Location Name   Latitude   Longitude  \
0                                          Hahnville  29.976858  -90.410561   
1                              Home Place Plantation  29.971119  -90.407745   
2                                        New Orleans  29.950888  -90.076546   
3                   1117 Broadway (Gil's Music Shop)  47.252495 -122.439644   
4      2715 North Junett St (Kat and Bianca's House)  47.272591 -122.474480   
...                                              ...        ...         ...   
13045              Ziegler's Hardware & Supply, Inc.  33.943634 -118.202255   
13046                                        Atlanta  33.748333  -84.396515   
13047                                        Jackson  33.294323  -83.968224   
13048                                          Macon  32.840534  -83.637505   
13049                              North DeKalb Mall  33.808346  -84.278275   

                       Movie Name  \
0             

In [None]:
# Save the combined location table. An empty path makes pandas try to open ''
# and fail, so write to a named file; index=False leaves the synthetic row
# numbers out of the CSV.
final_df.to_csv('movie_locations.csv', index=False)