In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Create a sample movie dataset
n_movies = 200

# Generate random data
titles = [f"Movie {i}" for i in range(1, n_movies + 1)]
years = np.random.randint(1990, 2023, n_movies)
genres = np.random.choice(['Action', 'Comedy', 'Drama', 'Sci-Fi', 'Horror', 'Thriller', 'Romance'], n_movies)
runtimes = np.random.randint(75, 180, n_movies)
budgets = np.round(np.random.uniform(5, 250, n_movies), 1)  # In millions USD
box_offices = np.round(budgets * np.random.uniform(0.5, 4, n_movies), 1)  # In millions USD

directors = []
for _ in range(n_movies):
    first_names = ['James', 'Steven', 'Christopher', 'Martin', 'Quentin', 'David', 'Ridley', 'Sofia', 'Greta', 'Kathryn']
    last_names = ['Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor']
    directors.append(f"{np.random.choice(first_names)} {np.random.choice(last_names)}")

ratings = np.round(np.random.uniform(3, 9.5, n_movies), 1)
countries = np.random.choice(['USA', 'UK', 'France', 'Japan', 'South Korea', 'India', 'Canada', 'Germany'], n_movies)

# Create the DataFrame
movies_data = pd.DataFrame({
    'Title': titles,
    'Year': years,
    'Genre': genres,
    'Runtime': runtimes,
    'Budget': budgets,
    'BoxOffice': box_offices,
    'Director': directors,
    'Rating': ratings,
    'Country': countries
})

# Introduce some missing values
for col in ['Runtime', 'Budget', 'BoxOffice', 'Rating']:
    missing_indices = np.random.choice(n_movies, size=int(n_movies * 0.05), replace=False)
    movies_data.loc[missing_indices, col] = np.nan

# Display the DataFrame
print(movies_data.head())

     Title  Year     Genre  Runtime  Budget  BoxOffice      Director  Rating  \
0  Movie 1  2018    Sci-Fi     75.0   147.4      507.4  James Miller     5.2   
1  Movie 2  2004     Drama    108.0   216.5      175.8   James Jones     4.6   
2  Movie 3  1997    Action    170.0   220.7      523.9   Greta Smith     9.0   
3  Movie 4  2010    Sci-Fi    122.0    63.0       82.9  Martin Davis     3.7   
4  Movie 5  2008  Thriller    163.0   227.4      386.6  David Wilson     7.5   

   Country  
0   France  
1  Germany  
2      USA  
3    Japan  
4       UK  


In [7]:
print("\nTask 1: Basic Data Exploration")
print("\n1. First 5 rows of the DataFrame:")
print(movies_data.head())

print("\n2. Shape of the DataFrame:")
print(movies_data.shape)

print("\n3. Descriptive statistics for numerical columns:")
print(movies_data.describe())

print("\n4. Missing values in each column:")
print(movies_data.isna().sum())


Task 1: Basic Data Exploration

1. First 5 rows of the DataFrame:
     Title  Year     Genre  Runtime  Budget  BoxOffice      Director  Rating  \
0  Movie 1  2018    Sci-Fi     75.0   147.4      507.4  James Miller     5.2   
1  Movie 2  2004     Drama    108.0   216.5      175.8   James Jones     4.6   
2  Movie 3  1997    Action    170.0   220.7      523.9   Greta Smith     9.0   
3  Movie 4  2010    Sci-Fi    122.0    63.0       82.9  Martin Davis     3.7   
4  Movie 5  2008  Thriller    163.0   227.4      386.6  David Wilson     7.5   

   Country  
0   France  
1  Germany  
2      USA  
3    Japan  
4       UK  

2. Shape of the DataFrame:
(200, 9)

3. Descriptive statistics for numerical columns:
             Year     Runtime      Budget   BoxOffice      Rating
count   200.00000  190.000000  190.000000  190.000000  190.000000
mean   2005.99500  127.557895  125.166316  289.196842    6.337895
std      10.10224   31.736586   69.435439  218.797221    1.914892
min    1990.00000   75.

In [12]:
print("\nTask 2: Data Filtering and Selection")
print("\n1. Movies released after 2010:")
movies_after_2010 = movies_data[movies_data['Year'] > 2010]
print(movies_after_2010[['Title', 'Year']].head())

print("\n2. Movies with a rating above 8.0:")
high_rated_movies = movies_data[movies_data['Rating'] > 8.0]
print(high_rated_movies[['Title', 'Rating']].head())

print("\n3. Movies in the 'Action' or Comedy genre:")
action_comedy_movies = movies_data[(movies_data['Genre'] == 'Action') | (movies_data['Genre'] == 'Comedy')]
print(action_comedy_movies[['Title', 'Genre']].head())

print("\n4. Movies that had a box office greater than twice their budget:")
high_box_office_movies = movies_data[movies_data['BoxOffice'] > movies_data['Budget'] * 2]
print(high_box_office_movies[['Title', 'Budget', 'BoxOffice']].head())

print("\n5. Movies directed by 'Steven Spielberg' or Christopher Nolan:")
steven_christopher_movies = movies_data[movies_data['Director'].str.contains('Steven Spielberg|Christopher Nolan', na=False)]
print(steven_christopher_movies[['Title', 'Director']].head())





Task 2: Data Filtering and Selection

1. Movies released after 2010:
       Title  Year
0    Movie 1  2018
5    Movie 6  2012
8    Movie 9  2013
9   Movie 10  2013
11  Movie 12  2011

2. Movies with a rating above 8.0:
       Title  Rating
2    Movie 3     9.0
6    Movie 7     8.7
9   Movie 10     8.7
11  Movie 12     8.3
13  Movie 14     8.9

3. Movies in the 'Action' or Comedy genre:
       Title   Genre
2    Movie 3  Action
9   Movie 10  Action
11  Movie 12  Action
13  Movie 14  Comedy
15  Movie 16  Comedy

4. Movies that had a box office greater than twice their budget:
     Title  Budget  BoxOffice
0  Movie 1   147.4      507.4
2  Movie 3   220.7      523.9
5  Movie 6   150.0      323.8
7  Movie 8   178.5      494.6
8  Movie 9   123.0      268.0

5. Movies directed by 'Steven Spielberg' or Christopher Nolan:
Empty DataFrame
Columns: [Title, Director]
Index: []
