In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests
from bs4 import BeautifulSoup
import csv
import re

In [None]:

# Scrape the Rotten Tomatoes "100 Best Classic Movies" guide and save
# title / year / rating / cast / director for each movie to a CSV file.
url = 'https://editorial.rottentomatoes.com/guide/100-best-classic-movies/'
csv_filename = 'movie_data_combined.csv'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# The last few <h2> tags on the page are skipped.
# NOTE(review): presumably these are non-movie headings - verify against the page.
TRAILING_H2_TO_SKIP = 4

# Heading text is parsed as "<title> (<4-digit year>) <rating text>".
heading_pattern = re.compile(r'^(.*?)\s*\((\d{4})\)\s*(.+)$')


def _linked_names(div):
    """Return the text of every <a> tag inside ``div``, joined with ', '."""
    return ', '.join(a_tag.get_text(strip=True) for a_tag in div.find_all('a'))


def _attach_by_position(movies, divs, field):
    """Store the linked names of the i-th div under ``field`` of the i-th movie.

    The page offers no key to join on here, so pairing is purely positional.
    Raises ValueError when the counts differ, because positional pairing
    would then attach people to the wrong movie (or run past the list).
    """
    if len(divs) != len(movies):
        raise ValueError(
            f"Found {len(divs)} '{field}' blocks for {len(movies)} parsed movies; "
            "refusing to pair them by position."
        )
    for movie, div in zip(movies, divs):
        movie[field] = _linked_names(div)


response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect heading text from every <h2> except the trailing ones.
    h2_tags = soup.find_all('h2')
    h2_titles = [h2_tag.get_text(strip=True) for h2_tag in h2_tags[:-TRAILING_H2_TO_SKIP]]

    # Keep only headings that match the "<title> (<year>) <rating>" shape.
    movie_data = []
    for text_from_h2 in h2_titles:
        match = heading_pattern.match(text_from_h2)
        if match:
            title, year, rating = match.groups()
            movie_data.append({'Title': title.strip(), 'Year': year.strip(), 'Rating': rating.strip()})

    # Cast and director blocks are matched to movies by their order on the page.
    cast_divs = soup.find_all('div', class_='info cast')
    _attach_by_position(movie_data, cast_divs, 'Cast')

    director_divs = soup.find_all('div', class_='info director')
    _attach_by_position(movie_data, director_divs, 'Director')

    # 'utf-8-sig' writes a BOM at the start of the file.
    with open(csv_filename, 'w', newline='', encoding='utf-8-sig') as csv_file:
        fieldnames = ['Title', 'Year', 'Rating', 'Cast', 'Director']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(movie_data)

    print(f"Data has been saved to {csv_filename}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Data has been saved to movie_data_combined.csv


In [None]:
# Load the scraped movies back from disk. The file was written with the
# 'utf-8-sig' codec, so it is read with the same one. pandas is already
# imported in the notebook's first cell.
df = pd.read_csv('movie_data_combined.csv', encoding='utf-8-sig')
df.head()

Unnamed: 0,Title,Year,Rating,Cast,Director
0,The Philadelphia Story,1940,100%,"Cary Grant, Katharine Hepburn, James Stewart, ...",George Cukor
1,Seven Samurai,1954,100%,"Toshiro Mifune, Takashi Shimura, Yoshio Inaba,...",Akira Kurosawa
2,Meet Me in St. Louis,1944,100%,"Judy Garland, Margaret O'Brien, Leon Ames, Luc...",Vincente Minnelli
3,Singin' in the Rain,1952,100%,"Gene Kelly, Debbie Reynolds, Donald O'Connor, ...","Stanley Donen, Gene Kelly"
4,Laura,1944,100%,"Gene Tierney, Dana Andrews, Clifton Webb, Vinc...",Otto Preminger


In [None]:
# Column dtypes and non-null counts, followed by the number of missing values per column.
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     100 non-null    object
 1   Year      100 non-null    int64 
 2   Rating    100 non-null    object
 3   Cast      100 non-null    object
 4   Director  100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


Title       0
Year        0
Rating      0
Cast        0
Director    0
dtype: int64

In [None]:
# Turn "100%"-style strings into numbers. Casting to str first keeps this cell
# re-runnable: once Rating is numeric, a bare `.str` accessor raises
# AttributeError. regex=False treats "%" as a literal character.
df['Rating'] = pd.to_numeric(df['Rating'].astype(str).str.replace('%', '', regex=False))

In [None]:
# Re-check the column dtypes and preview the first rows after the Rating conversion.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     100 non-null    object
 1   Year      100 non-null    int64 
 2   Rating    100 non-null    int64 
 3   Cast      100 non-null    object
 4   Director  100 non-null    object
dtypes: int64(2), object(3)
memory usage: 4.0+ KB


Unnamed: 0,Title,Year,Rating,Cast,Director
0,The Philadelphia Story,1940,100,"Cary Grant, Katharine Hepburn, James Stewart, ...",George Cukor
1,Seven Samurai,1954,100,"Toshiro Mifune, Takashi Shimura, Yoshio Inaba,...",Akira Kurosawa
2,Meet Me in St. Louis,1944,100,"Judy Garland, Margaret O'Brien, Leon Ames, Luc...",Vincente Minnelli
3,Singin' in the Rain,1952,100,"Gene Kelly, Debbie Reynolds, Donald O'Connor, ...","Stanley Donen, Gene Kelly"
4,Laura,1944,100,"Gene Tierney, Dana Andrews, Clifton Webb, Vinc...",Otto Preminger


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Year,Rating
count,100.0,100.0
mean,1949.45,97.46
std,11.985998,1.966487
min,1922.0,88.0
25%,1940.0,96.0
50%,1951.0,98.0
75%,1960.0,99.0
max,1968.0,100.0


In [None]:
# Show the column labels of the frame.
df.columns

Index(['Title', 'Year', 'Rating', 'Cast', 'Director'], dtype='object')

In [None]:

# Count how many of the listed films each director is credited on and plot it.
# A film credited to several directors ("A, B") counts once for each of them.

# Pixels of figure height allotted to each director's bar.
PIXELS_PER_DIRECTOR = 20

# One entry per (film, director) credit: split on commas and strip whitespace
# so "A,B" and "A, B" give the same names, then drop missing or empty entries.
director_names = (
    df['Director']
    .str.split(',')
    .explode()
    .str.strip()
    .dropna()
    .loc[lambda names: names != '']
)
df1 = director_names.to_frame(name='Director').reset_index(drop=True)

# agg_df has one row per distinct director, so its length drives the figure height.
agg_df = df1.groupby('Director').size().reset_index(name='Count')
height_value = PIXELS_PER_DIRECTOR * len(agg_df)

figure1 = px.bar(agg_df, y='Director', x='Count', title="Number of Films Directed")
figure1.update_layout(
    xaxis_title="Number of Films",
    yaxis_title="Director",
    margin=dict(t=50),
    height=height_value,
    width=600,
    yaxis=dict(categoryorder='total ascending'),
)
figure1.show()

In [None]:
# Toy demonstration of splitting a comma-separated 'Director' column into one
# row per director. It uses its own variable names so the movie DataFrame `df`
# is not overwritten by sample data; pandas is already imported in the first cell.
sample_raw = pd.DataFrame(
    {'Director': ['John Doe, Jane Smith', 'Bob Johnson', 'Alice Williams, Charlie Brown, Bob Johnson']}
)

# Split on commas, give each name its own row, and strip surrounding whitespace.
# Repeated names are kept (nothing is de-duplicated).
sample_directors = (
    sample_raw['Director']
    .str.split(',')
    .explode()
    .str.strip()
    .to_frame(name='Director')
    .reset_index(drop=True)
)

print(sample_directors)


         Director
0        John Doe
1      Jane Smith
2     Bob Johnson
3  Alice Williams
4   Charlie Brown
5     Bob Johnson
