I’ll scrape data about movies from a website, parse it with BeautifulSoup, clean it with pandas, and draw conclusions from the resulting movie dataset.

### TRIED EXTRACTING DATA FROM A WIKIPEDIA WEBPAGE

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Define the URL
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'

# Identify this client explicitly; some sites reject the library's default
# User-Agent string.
request_headers = {'User-Agent': 'movie-analysis-notebook/0.1 (educational project)'}

# Send an HTTP request to the URL. The timeout stops the cell from hanging
# forever, and raise_for_status() fails loudly on 4xx/5xx responses instead of
# letting an error page be parsed as if it were the article.
response = requests.get(url, headers=request_headers, timeout=30)
response.raise_for_status()

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find the movie data within the HTML (the first table with class "wikitable")
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")

table_rows = table.find_all('tr')
if not table_rows:
    raise ValueError(f"The 'wikitable' table at {url} contains no rows")

# Locate the columns by header text rather than by hard-coded position, so the
# right cells are read even if the page adds, removes or reorders columns.
header_labels = [
    cell.get_text(strip=True).lower()
    for cell in table_rows[0].find_all(['td', 'th'])
]


def _column_index(label):
    """Return the position of the first header starting with `label` (lower-case)."""
    for position, text in enumerate(header_labels):
        if text.startswith(label):
            return position
    raise ValueError(f"Column {label!r} not found in table header: {header_labels}")


title_idx = _column_index('title')
year_idx = _column_index('year')
min_cells = max(title_idx, year_idx) + 1

rows = table_rows[1:]  # Skip the header row

# Extract movie details and create a list of dictionaries
# NOTE(review): assumes data rows have one cell per header (no rowspan/colspan
# shifting cells sideways) -- confirm against the live page.
movies = []
for row in rows:
    columns = row.find_all(['td', 'th'])
    if len(columns) < min_cells:
        # Separator or malformed row: nothing reliable to extract.
        continue
    movies.append({
        'Title': columns[title_idx].get_text(strip=True),
        'Year': columns[year_idx].get_text(strip=True),
    })

# Create a data frame from the list of dictionaries. Passing `columns` keeps
# the expected schema even when no rows were extracted.
df = pd.DataFrame(movies, columns=['Title', 'Year'])

# Display the data frame
print(df)




### TRIED IMDbPY WITH MOVIE IDs

In [None]:
# Install the IMDbPY library using %pip
%pip install IMDbPY

from imdb import IMDb
import pandas as pd

# Create an instance of the IMDb class
ia = IMDb()

# IMDb movie IDs to look up (you can extend this list).
# Each ID should appear only once: a repeated ID triggers a redundant network
# request and produces a duplicate row. dict.fromkeys() drops any repeats that
# slip in later while preserving the listed order.
# NOTE(review): presumably Avatar, Titanic and Avengers: Endgame -- confirm.
movie_ids = list(dict.fromkeys([
    '0499549',
    '0120338',
    '4154796',
]))

# Fetch movie data using IMDb IDs.
# .get() yields None for a missing field instead of raising KeyError, so
# incomplete records reach the cleaning step below rather than aborting the loop.
movies = []
for movie_id in movie_ids:
    movie_info = ia.get_movie(movie_id)
    movies.append({
        'Title': movie_info.get('title'),
        'Year': movie_info.get('year'),
    })

# Create a data frame from the list of dictionaries (schema fixed even if empty)
df = pd.DataFrame(movies, columns=['Title', 'Year'])

# Clean the data in one non-mutating chain. Using .assign() on the result of
# dropna() avoids the chained-assignment pattern that can raise
# SettingWithCopyWarning, and leaves `df` untouched so the cell can be re-run.
df_cleaned = (
    df
    .dropna()  # Remove any rows with missing values
    .assign(Year=lambda frame: pd.to_numeric(frame['Year'], errors='coerce'))  # Non-numeric years become NaN
    .dropna(subset=['Year'])  # Drop rows whose 'Year' could not be parsed
    .astype({'Year': 'int64'})  # Years are whole numbers; avoid a float column after coercion
    .reset_index(drop=True)  # Reset the index
)

# Display the cleaned data frame
print(df_cleaned)




## DATA ANALYSIS

In [None]:
import matplotlib.pyplot as plt

# Relies on `df_cleaned`, `movie_ids` and `ia` created by the earlier IMDbPY cell.

# Display basic statistics of the data
print(df_cleaned.describe())

# Collect each movie's rating together with its title.
# Titles and ratings are gathered in the same loop so the two lists always
# have equal length and matching order. Movies without a rating are skipped as
# a pair, which prevents the bar chart from failing with a length mismatch or
# attaching a rating to the wrong title.
# dict.fromkeys() visits each ID once, in order, even if `movie_ids` has repeats.
rated_titles = []
ratings = []
for movie_id in dict.fromkeys(movie_ids):
    movie_info = ia.get_movie(movie_id)
    rating = movie_info.get('rating')
    if rating is None:
        continue
    # Fall back to the ID as a label if the title is unavailable.
    rated_titles.append(movie_info.get('title', movie_id))
    ratings.append(rating)

if not ratings:
    # Nothing to plot; say so instead of drawing empty figures.
    print('No ratings were available for the selected movies; skipping plots.')
else:
    # Plot a bar chart for average ratings
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(rated_titles, ratings, color='skyblue')
    ax.set_xlabel('Movies')
    ax.set_ylabel('Average Rating')
    ax.set_title('Average Ratings of Movies')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

    # Plot the distribution of ratings of movies
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(ratings, bins=20, color='lightgreen', edgecolor='black')
    ax.set_xlabel('Rating')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution of Ratings of Movies')
    plt.show()