In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [106]:
df = pd.read_csv('movie_data_scrape.csv')
df.head()

Unnamed: 0,Title,Year,Worldwide gross,Rank,Peak
0,Avatar,2009,2923706000.0,1,1
1,Avengers: Endgame,2019,2797501000.0,2,1
2,Avatar: The Way of Water,2022,2320250000.0,3,3
3,Titanic,1997,2257845000.0,4,1
4,Star Wars: The Force Awakens,2015,2068224000.0,5,3


# Naming the rows and columns

In [107]:
df.columns

Index(['Title', 'Year', 'Worldwide gross', 'Rank', 'Peak'], dtype='object')

# Display overview of the datafraame

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title            50 non-null     object 
 1   Year             50 non-null     int64  
 2   Worldwide gross  50 non-null     float64
 3   Rank             50 non-null     int64  
 4   Peak             50 non-null     object 
dtypes: float64(1), int64(2), object(2)
memory usage: 2.1+ KB


# Summary of numeric coulmns

In [109]:
df.describe()

Unnamed: 0,Year,Worldwide gross,Rank
count,50.0,50.0,50.0
mean,2015.4,3001399000.0,25.5
std,6.758215,11298910000.0,14.57738
min,1993.0,1034800000.0,1.0
25%,2012.25,1086891000.0,13.25
50%,2017.0,1276761000.0,25.5
75%,2019.0,1517947000.0,37.75
max,2025.0,81238760000.0,50.0


# Chekcing for null data

In [110]:
df.isnull().values.any()

False

# Changing the Worldwide gross colum from object to float

In [111]:
df["Worldwide gross"] = df["Worldwide gross"].astype(str).str.replace(r"[$,]", "", regex=True)
df['Worldwide gross']


0      2923706026.0
1      2797501328.0
2      2320250281.0
3      2257844554.0
4      2068223624.0
5      2048359754.0
6      1922598800.0
7      1698863816.0
8      1671537444.0
9      1656943394.0
10     1631940000.0
11     1518815515.0
12     1515341399.0
13     1495696292.0
14     1450026933.0
15     1447038421.0
16     1402809540.0
17     1362566989.0
18     1347280838.0
19     1342139727.0
20     1338073645.0
21     1332539889.0
22     1308473425.0
23     1290000000.0
24     1263521126.0
25     1242805359.0
26    81238764765.0
27     1214811252.0
28     1159444662.0
29     1153337496.0
30     1148528393.0
31     1147997407.0
32     1132679685.0
33     1128274794.0
34     1123794079.0
35     1108594137.0
36     1104054072.0
37     1081169825.0
38     1074458282.0
39     1074144248.0
40     1073394593.0
41     1066970811.0
42     1066179747.0
43     1057420387.0
44     1050693953.0
45     1046515409.0
46     1045751565.0
47     1045713802.0
48     1037535230.0
49     1034800131.0


# Top ten highest grossing movies

In [112]:
top_ten = df.sort_values(by='Worldwide gross', ascending=False).head(10)
top_ten

Unnamed: 0,Title,Year,Worldwide gross,Rank,Peak
26,The Fate of the Furious,2017,81238764765.0,27,11
0,Avatar,2009,2923706026.0,1,1
1,Avengers: Endgame,2019,2797501328.0,2,1
2,Avatar: The Way of Water,2022,2320250281.0,3,3
3,Titanic,1997,2257844554.0,4,1
4,Star Wars: The Force Awakens,2015,2068223624.0,5,3
5,Avengers: Infinity War,2018,2048359754.0,6,4
6,Spider-Man: No Way Home,2021,1922598800.0,7,6
7,Inside Out 2,2024,1698863816.0,8,8
8,Jurassic World,2015,1671537444.0,9,3


# How many movies were released per year

In [113]:
movies_per_year = df['Year'].value_counts().sort_index()
movies_per_year

Year
1993    1
1997    1
1999    1
2003    1
2006    1
2009    1
2010    1
2011    3
2012    3
2013    2
2014    1
2015    5
2016    2
2017    4
2018    5
2019    9
2021    1
2022    2
2023    2
2024    3
2025    1
Name: count, dtype: int64

# Which year had the highest total box office revenue


In [114]:
highest_revenue_per_year = df.groupby('Year')['Worldwide gross'].max().sort_values(ascending=False)
highest_revenue_per_year

Year
2017    81238764765.0
2009     2923706026.0
2019     2797501328.0
2022     2320250281.0
1997     2257844554.0
2015     2068223624.0
2018     2048359754.0
2021     1922598800.0
2024     1698863816.0
2025     1631940000.0
2012     1518815515.0
2023     1447038421.0
2011     1342139727.0
2013     1290000000.0
2016     1153337496.0
2003     1147997407.0
2014     1104054072.0
2010     1066970811.0
2006     1066179747.0
1999     1046515409.0
1993     1037535230.0
Name: Worldwide gross, dtype: object

# Which movie had the lowest worldiwde gross

In [115]:
lowest_revenue_per_year = df.groupby('Year')['Worldwide gross'].min().sort_values(ascending=True)
lowest_revenue_per_year

Year
2017    1034800131.0
1993    1037535230.0
2011    1045713802.0
2024    1045751565.0
1999    1046515409.0
2019    1050693953.0
2016    1057420387.0
2006    1066179747.0
2010    1066970811.0
2012    1081169825.0
2014    1104054072.0
2003    1147997407.0
2018    1148528393.0
2015    1159444662.0
2013    1214811252.0
2023    1362566989.0
2022    1495696292.0
2025    1631940000.0
2021    1922598800.0
1997    2257844554.0
2009    2923706026.0
Name: Worldwide gross, dtype: object