In [1]:
import pandas as pd
import wikipedia as wp
from bs4 import BeautifulSoup

html = "https://en.wikipedia.org/wiki/List_of_lost_films"

try:
    tables = pd.read_html(html)
    
    if len(tables) > 1:
        df = tables[3]
        df1 = tables[4]
        df2 = tables[5]
        df3 = tables[6]
        df4 = tables[7]
    elif tables:
        df = tables[0]
    else:
        raise ValueError("No tables found on the Wikipedia page.")
        
except Exception as e:
    print(f"An error occurred: {e}")
    df = pd.DataFrame()

# Display the DataFrame with the new 'Hyperlink' column
print(df.to_string())
print(df1.to_string())
print(df2.to_string())
print(df3.to_string())
print(df4.to_string())



    Year                                Film                              Director                                                                                                           Cast                                                                                                                                                                                                                                                                                                                                                                              Notes       Ref Unnamed: 6
0   1930                   An Elastic Affair                      Alfred Hitchcock                                                                                                            NaN                                                                                                                                                                                                                             

In [2]:
dfs = [df, df1, df2, df3, df4]

combined_df = pd.concat(dfs, ignore_index=True)

combined_df.head()


Unnamed: 0,Year,Film,Director,Cast,Notes,Ref,Unnamed: 6
0,1930,An Elastic Affair,Alfred Hitchcock,,Short film made by Hitchcock for an awards cer...,[65],
1,1930,The Big Party,John G. Blystone,"Sue Carol, Dixie Lee",,[50],
2,1930,Cock o' the Walk,Walter Lang,"Joseph Schildkraut, Myrna Loy",,,
3,1930,Noli Me Tángere,Jose Nepumuceno,,The 1930 version of Noli Me Tángere was direct...,,
4,1930,Cameo Kirby,Irving Cummings,"J. Harold Murray, Norma Terris",,[50],


In [3]:
combined_df.drop(['Ref', 'Unnamed: 6'], axis=1, inplace=True)
combined_df.drop(1, inplace=True)

combined_df.head()

Unnamed: 0,Year,Film,Director,Cast,Notes
0,1930,An Elastic Affair,Alfred Hitchcock,,Short film made by Hitchcock for an awards cer...
2,1930,Cock o' the Walk,Walter Lang,"Joseph Schildkraut, Myrna Loy",
3,1930,Noli Me Tángere,Jose Nepumuceno,,The 1930 version of Noli Me Tángere was direct...
4,1930,Cameo Kirby,Irving Cummings,"J. Harold Murray, Norma Terris",
5,1930,The Cave of the Silken Web II,Dan Duyu,Yin Mingzhu,Silent. Chinese film. Original title: 续盘丝洞 (Xù...


In [4]:
import requests

response = requests.get(html)
soup = BeautifulSoup(response.text, 'html.parser')

url_list = []

for a in soup.find_all('a', href=True):
    url_list.append(a['href'])

df_urls = pd.DataFrame({'URLs': url_list})

# Display the DataFrame with the found URLs
print(df_urls.to_string())


                                                                                                                                                                       URLs
0                                                                                                                                                              #bodyContent
1                                                                                                                                                           /wiki/Main_Page
2                                                                                                                                                  /wiki/Wikipedia:Contents
3                                                                                                                                               /wiki/Portal:Current_events
4                                                                                                                                           

In [5]:
# Create a new column 'ref' and fill it based on matching values
combined_df['ref'] = None  # Initialize the 'ref' column with None

for index, row in combined_df.iterrows():
    film_name = row['Film']
    
    # Check for a match in url_list
    matching_refs = [ref for ref in url_list if all(word.lower() in ref.lower() for word in film_name.split())]
    
    # If there is a match, store the first matching reference in the 'ref' column
    if matching_refs:
        combined_df.at[index, 'ref'] = matching_refs[0]


In [6]:
combined_df.head(10)

Unnamed: 0,Year,Film,Director,Cast,Notes,ref
0,1930,An Elastic Affair,Alfred Hitchcock,,Short film made by Hitchcock for an awards cer...,/wiki/An_Elastic_Affair
2,1930,Cock o' the Walk,Walter Lang,"Joseph Schildkraut, Myrna Loy",,
3,1930,Noli Me Tángere,Jose Nepumuceno,,The 1930 version of Noli Me Tángere was direct...,
4,1930,Cameo Kirby,Irving Cummings,"J. Harold Murray, Norma Terris",,/wiki/Cameo_Kirby_(1930_film)
5,1930,The Cave of the Silken Web II,Dan Duyu,Yin Mingzhu,Silent. Chinese film. Original title: 续盘丝洞 (Xù...,/w/index.php?title=The_Cave_of_the_Silken_Web_...
6,1930,College Lovers,John G. Adolfi,"Marion Nixon, Jack Whiting",Musical comedy. Six Vitaphone discs containing...,/wiki/College_Lovers
7,1930,Fellers,"Austin Fay, Arthur Higgins","Arthur Tauchert, Les Coney",An Australian comedy,/wiki/Fellers_(1930_film)
8,1930,Kismet,John Francis Dillon,"Otis Skinner, Loretta Young",A lavish costume drama in the early widescreen...,/wiki/Kismet_(1930_film)
9,1930,Let's Go Places,Frank R. Strayer,"Frank Richardson, Dixie Lee",,
10,1930,Lord Richard in the Pantry,Walter Forde,"Richard Cooper, Dorothy Seacombe","Included on the British Film Institute's ""75 M...",/wiki/Lord_Richard_in_the_Pantry


In [7]:
combined_df = combined_df.dropna(subset=['ref']).reset_index()
combined_df.head()

Unnamed: 0,index,Year,Film,Director,Cast,Notes,ref
0,0,1930,An Elastic Affair,Alfred Hitchcock,,Short film made by Hitchcock for an awards cer...,/wiki/An_Elastic_Affair
1,4,1930,Cameo Kirby,Irving Cummings,"J. Harold Murray, Norma Terris",,/wiki/Cameo_Kirby_(1930_film)
2,5,1930,The Cave of the Silken Web II,Dan Duyu,Yin Mingzhu,Silent. Chinese film. Original title: 续盘丝洞 (Xù...,/w/index.php?title=The_Cave_of_the_Silken_Web_...
3,6,1930,College Lovers,John G. Adolfi,"Marion Nixon, Jack Whiting",Musical comedy. Six Vitaphone discs containing...,/wiki/College_Lovers
4,7,1930,Fellers,"Austin Fay, Arthur Higgins","Arthur Tauchert, Les Coney",An Australian comedy,/wiki/Fellers_(1930_film)


In [12]:
combined_df

Unnamed: 0,index,Year,Film,Director,Cast,Notes,ref,image_url
0,0,1930,An Elastic Affair,Alfred Hitchcock,,Short film made by Hitchcock for an awards cer...,/wiki/An_Elastic_Affair,
1,4,1930,Cameo Kirby,Irving Cummings,"J. Harold Murray, Norma Terris",,/wiki/Cameo_Kirby_(1930_film),
2,5,1930,The Cave of the Silken Web II,Dan Duyu,Yin Mingzhu,Silent. Chinese film. Original title: 续盘丝洞 (Xù...,/w/index.php?title=The_Cave_of_the_Silken_Web_...,
3,6,1930,College Lovers,John G. Adolfi,"Marion Nixon, Jack Whiting",Musical comedy. Six Vitaphone discs containing...,/wiki/College_Lovers,
4,7,1930,Fellers,"Austin Fay, Arthur Higgins","Arthur Tauchert, Les Coney",An Australian comedy,/wiki/Fellers_(1930_film),
...,...,...,...,...,...,...,...,...
64,74,1975,Levi & Leather,Mother Goose,,Also known as Levi's N' Leather. A fetish-them...,/w/index.php?title=Levi_%26_Leather&action=edi...,
65,75,1977,Kissa Kursi Ka,Amrit Nahata,"Shabana Azmi, Utpal Dutt",The plot revolved around a corrupt and evil po...,/wiki/Kissa_Kursi_Ka,
66,76,1979,Njattadi,Bharath Gopi,"Bharat Murali, K.N. Sreenivasan, Sunil, Girija...",The film is based on the life of the protagoni...,/wiki/Njattadi,
67,77,1982,Milagro sa Porta Vaga,Florencio Orbeta,Julie Vega,Religious epic film about Our Lady of Porta Va...,/wiki/Milagro_sa_Porta_Vaga,


In [23]:
from bs4 import BeautifulSoup
import regex as re
import requests

title = combined_df['ref']

def get_main_wiki_image(title):
  person_url = []
  urlpage =  'https://en.wikipedia.org/' + title
  # query the website and return the html to the variable 'page'
  page = requests.get(urlpage).text
  # parse the html using beautiful soup and store in variable 'soup'
  soup = BeautifulSoup(page, 'html.parser')
  for raw_img in soup.find_all('img'):
   link = raw_img.get('src')
   # The first image on the page with the URL strucutre below is usually 
   # the image inside the infobox. We exlcude any .svg images, as they are 
   # vector graphics common to all Wikipedia pages
   if re.search('wikipedia/.*/thumb/', link) and not re.search('.svg', link):
     person_url = [title, link]
     # Once the first image has been found, we break out of the loop and search the next page
     break
  return person_url

title_urls = []
for title in combined_df['ref']:
  person_url = get_main_wiki_image(title)
  if person_url:
    title_urls.append(person_url)
    
# Save the results to a dataframe
title_urls_df = pd.DataFrame(title_urls, columns = ['ref', 'url'])


In [22]:
title_urls_df

Unnamed: 0,title,url
0,/wiki/Cameo_Kirby_(1930_film),//upload.wikimedia.org/wikipedia/commons/thumb...
1,/wiki/College_Lovers,//upload.wikimedia.org/wikipedia/en/thumb/0/0f...
2,/wiki/Kismet_(1930_film),//upload.wikimedia.org/wikipedia/en/thumb/4/4e...
3,/wiki/One_Mad_Kiss,//upload.wikimedia.org/wikipedia/en/thumb/6/61...
4,/wiki/Song_of_the_Flame_(film),//upload.wikimedia.org/wikipedia/en/thumb/0/05...
5,/wiki/Alam_Ara,//upload.wikimedia.org/wikipedia/commons/thumb...
6,/wiki/Kalidas_(film),//upload.wikimedia.org/wikipedia/commons/thumb...
7,/wiki/Two_Crowded_Hours,//upload.wikimedia.org/wikipedia/en/thumb/a/ab...
8,/wiki/Men_of_Tomorrow_(1932_film),//upload.wikimedia.org/wikipedia/en/thumb/f/f5...
9,/wiki/The_Night_of_Decision_(1931_film),//upload.wikimedia.org/wikipedia/en/thumb/3/30...


In [24]:
merged_df = pd.merge(combined_df, title_urls_df, on='ref', how='inner')
merged_df
merged_df.to_csv('lost_films_images.csv')

In [25]:
combined_df.to_csv('lost_films.csv')

In [27]:
title_urls_df['url'].to_csv('images.csv')