In [45]:
import glob 
import pandas as pd 
from datetime import datetime 
import os
import requests
from bs4 import BeautifulSoup
import sqlite3


In [46]:
# --- Configuration ---------------------------------------------------------
# web.archive.org snapshot of the page that holds the film ranking table.
url = 'https://web.archive.org/web/20230902185655/https://en.everybodywiki.com/100_Most_Highly-Ranked_Films'

# Output targets: the SQLite database file, the table written inside it, and the CSV file.
db_name, table_name = 'Movies.db', 'Top_50'
csv_path = 'top_50_films.csv'

# Number of rows collected so far, and an empty frame with the three columns to collect.
count = 0
df = pd.DataFrame(columns=["Average Rank", "Film", "Year"])

In [47]:

# Download the page. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error instead
# of silently parsing an error page.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.text

# Parse the HTML text with the standard-library-backed parser.
data = BeautifulSoup(response, 'html.parser')



- When inspecting the page, we can see that the table has the class "wikitable". 
- The tbody tag contains the body of the table. 
- The tr tags indicate the table rows. 
- The td tags hold the actual cell data.

In [48]:
# Start at the <tbody> level and move down to its children from there.
tables = data.find_all('tbody')
# The first <tbody> is taken to be the ranking table; its <tr> children are the rows.
rows = tables[0].find_all('tr')

# Only the first 50 data rows are of interest, and only three fields per row:
# average rank, film title and year of release.
# Rows are gathered in a local list and the DataFrame is built once at the end.
# This avoids re-copying the frame on every pd.concat, and makes the cell safe to
# re-run: it no longer depends on the state of `count`/`df` left by another cell.
records = []
for row in rows:
    if len(records) >= 50:
        break
    col = row.find_all('td')
    # Rows without any <td> cell (presumably the header row) carry no data; skip them.
    if len(col) != 0:
        records.append({"Average Rank": col[0].contents[0],
                        "Film": col[1].contents[0],
                        "Year": col[2].contents[0]})

# Passing `columns` keeps the expected schema even when no rows were found.
df = pd.DataFrame(records, columns=["Average Rank", "Film", "Year"])
# Kept so that any later code reading `count` still sees the number of rows collected.
count = len(records)

In [49]:
# Display the scraped frame: Average Rank, Film and Year for up to 50 films.
print(df)

   Average Rank                                           Film  Year
0             1                                  The Godfather  1972
1             2                                   Citizen Kane  1941
2             3                                     Casablanca  1942
3             4                         The Godfather, Part II  1974
4             5                            Singin' in the Rain  1952
5             6                                         Psycho  1960
6             7                                    Rear Window  1954
7             8                                 Apocalypse Now  1979
8             9                          2001: A Space Odyssey  1968
9            10                                  Seven Samurai  1954
10           11                                        Vertigo  1958
11           12                                    Sunset Blvd  1950
12           13                                   Modern Times  1936
13           14                   

##### Filter the output to print only the films released in the 2000s (year 2000 included).

In [50]:
# Goal: collect the films released from 2000 to 2009.

# The Year values were scraped as text, so they must be converted to numbers
# before they can be compared against integer bounds.
# Print the element type before and after the conversion to confirm the change.
print(type(df.loc[0, 'Year']))
df['Year'] = pd.to_numeric(df['Year'])
print(type(df.loc[0, 'Year']))


<class 'bs4.element.NavigableString'>
<class 'numpy.int64'>


In [51]:
# Keep only the rows whose Year lies in 2000-2009; between() includes both bounds.
films_2000s = df[df['Year'].between(2000, 2009)]

In [52]:
# Show the films whose release year falls in 2000-2009.
print(films_2000s)

   Average Rank                                           Film  Year
18           19  Lord of the Rings: The Fellowship of the Ring  2001
36           37                                The Dark Knight  2008
42           43          Lord of the Rings: Return of the King  2003
48           49              Lord of the Rings: The Two Towers  2002


In [53]:
# Write the full table to CSV. index=False leaves out the positional row index,
# which would otherwise appear as an unnamed first column, and keeps the file
# consistent with the SQL export, which also omits the index.
df.to_csv(csv_path, index=False)

In [54]:
# Store our data in a SQLite database.
conn = sqlite3.connect(db_name)
try:
    # if_exists='replace' overwrites any existing table of the same name, so
    # re-running the notebook neither fails nor duplicates rows.
    df.to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    # Close the connection even if the write raises, so the database file is released.
    conn.close()

The database can now be inspected from the terminal by running `sqlite3 Movies.db` and using standard SQL commands.