In [None]:
# Import pandas library
import pandas as pd

# Step 1: Create the Movie Dataset
# Eight hand-entered movies; each list below is one column and all lists are
# the same length so they line up row-by-row in the DataFrame.
data = {
    'Movie Name': [
        'Sky Warriors', 'Love Algorithm', 'The Quantum Heist', 'Jungle Secrets',
        'Echoes of Silence', 'RoboChef', 'Legends Rise', 'Dust and Gold'
    ],
    'Director': [
        'Ava Chen', 'Rami Noor', 'Sophie Lin', 'Daniel Okafor',
        'Nina Petrova', 'Max Turner', 'Ava Chen', 'Rami Noor'
    ],
    'Release Year': [2022, 2019, 2024, 2021, 2020, 2023, 2025, 2018],
    'Genre': [
        'Action', 'Romance', 'Sci-Fi', 'Adventure',
        'Drama', 'Comedy', 'Action', 'Western'
    ],
    'Rating': [8.5, 7.2, 9.1, 7.8, 8.3, 6.5, 8.9, 7.0],
    'Budget': [150, 30, 200, 80, 50, 20, 180, 40],  # in millions of dollars
    'Box Office Revenue': [600, 100, 950, 320, 450, 90, 880, 150]  # in millions of dollars
}

# Convert the dictionary to a DataFrame
movies_df = pd.DataFrame(data)

# Step 2: Data Exploration and Analysis

# 1. Basic Exploration
print("First few rows of the dataset:")
print(movies_df.head(), '\n')

print("Dataset info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would emit a stray "None" line. Call it directly and
# follow with an empty print() for the blank separator line.
movies_df.info()
print()

print("Summary statistics for numerical columns:")
print(movies_df[['Rating', 'Budget', 'Box Office Revenue']].describe(), '\n')

# 2. Filtering (boolean masks select the matching rows)
print("Movies with rating > 8.0:")
print(movies_df[movies_df['Rating'] > 8.0], '\n')

print("Movies with box office revenue > $500 million:")
print(movies_df[movies_df['Box Office Revenue'] > 500], '\n')

# 3. Sorting (sort_values returns a sorted copy; movies_df itself is unchanged)
print("Movies sorted by Release Year (most recent first):")
print(movies_df.sort_values(by='Release Year', ascending=False), '\n')

print("Movies sorted by Box Office Revenue (descending):")
print(movies_df.sort_values(by='Box Office Revenue', ascending=False), '\n')

# 4. Aggregation
avg_rating = movies_df['Rating'].mean()
total_budget = movies_df['Budget'].sum()
# idxmax() gives the index label of the largest revenue; .loc turns that label
# into the full row (a Series holding every column for that movie).
highest_grossing_movie = movies_df.loc[movies_df['Box Office Revenue'].idxmax()]

print(f"Average movie rating: {avg_rating:.2f}")
print(f"Total combined budget: ${total_budget} million")
print("Highest-grossing movie:")
print(highest_grossing_movie, '\n')

# 5. Grouping
print("Average rating by genre:")
print(movies_df.groupby('Genre')['Rating'].mean(), '\n')

print("Total box office revenue by director:")
print(movies_df.groupby('Director')['Box Office Revenue'].sum(), '\n')

# Step 3: Extra Challenge - Calculate Profit
# Profit is revenue minus budget, so it is in the same units (millions).
# Note: this adds a new 'Profit' column to movies_df in place.
movies_df['Profit'] = movies_df['Box Office Revenue'] - movies_df['Budget']
most_profitable_movie = movies_df.loc[movies_df['Profit'].idxmax()]

print("Dataset with Profit column added:")
print(movies_df[['Movie Name', 'Profit']], '\n')

print("Movie with the highest profit:")
print(most_profitable_movie, '\n')

# Step 4: Reflection
reflection = """
Reflection:
From the dataset, action and sci-fi movies like 'The Quantum Heist' and 'Legends Rise' tend to have higher ratings and box office revenues.
One interesting observation was that 'Echoes of Silence', a drama, performed surprisingly well in both rating and revenue despite a mid-range budget.
To improve this dataset, I could add more features like duration, country, or streaming platform availability.
Adding more movies would also help reveal stronger statistical trends.
"""
print(reflection)


First few rows of the dataset:
          Movie Name       Director  Release Year      Genre  Rating  Budget  \
0       Sky Warriors       Ava Chen          2022     Action     8.5     150   
1     Love Algorithm      Rami Noor          2019    Romance     7.2      30   
2  The Quantum Heist     Sophie Lin          2024     Sci-Fi     9.1     200   
3     Jungle Secrets  Daniel Okafor          2021  Adventure     7.8      80   
4  Echoes of Silence   Nina Petrova          2020      Drama     8.3      50   

   Box Office Revenue  
0                 600  
1                 100  
2                 950  
3                 320  
4                 450   

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Movie Name          8 non-null      object 
 1   Director            8 non-null      object 
 2   Release Year        8 non-n