In [1]:
# Import required libraries
import gzip
import shutil
import os
import pandas as pd
import numpy as np

import plotly.express as px

In [2]:
# Extract the movies.csv.gz file
with gzip.open('data/movies.csv.gz', 'rb') as f_in:
    with open('data/movies.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [3]:
# Importing Cleaned Movie data
movies = pd.read_csv('data/movies.csv')

In [4]:
movies.head()

Unnamed: 0,Title,Year,Genres,Rating,Votes
0,Miss Jerry,1894,Romance,5.4,223
1,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",5.3,555
2,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.0,979
3,L'enfant prodigue,1907,Drama,5.6,31
4,Robbery Under Arms,1907,Drama,4.3,28


In [5]:
len(movies)

364637

In [6]:
# Check datatypes
data_types = movies.dtypes
print(data_types)

Title      object
Year        int64
Genres     object
Rating    float64
Votes       int64
dtype: object


In [7]:
# Convert 'Genres' column to string
movies['Genres'] = movies['Genres'].astype(str)

In [8]:
# Count unique entries in the 'Genres' column
unique_genres_count = movies['Genres'].nunique()
print(unique_genres_count)

1257


In [9]:
# Exploding the 'genres' column
genres_exploded = movies['Genres'].str.split(',').explode().reset_index(drop=True)

genres_exploded.head()

0        Romance
1    Documentary
2           News
3          Sport
4         Action
Name: Genres, dtype: object

In [10]:
# Count unique entries in Genres column, print count
unique_genres_count = genres_exploded.nunique()
print(unique_genres_count)


27


In [11]:
# Print list as txt file called genres_list
with open('data/genres_list.txt', 'w') as f:
    for genre in genres_exploded.unique():
        f.write(f"{genre}\n")

In [12]:
# DataFrame movies in column genres count how many times each entry from unique_genres_count exists, print results
genre_counts = movies['Genres'].str.get_dummies(sep=',').sum().reset_index()
genre_counts.columns = ['Genre', 'Count']
print(genre_counts)

          Genre   Count
0        Action   35141
1         Adult       1
2     Adventure   21508
3     Animation    7173
4     Biography   12877
5        Comedy   89496
6         Crime   32169
7   Documentary   67529
8         Drama  165242
9        Family   15602
10      Fantasy   11183
11    Film-Noir     864
12    Game-Show      17
13      History   11354
14       Horror   25755
15        Music   10449
16      Musical    7815
17      Mystery   14754
18         News     710
19   Reality-TV     185
20      Romance   39813
21       Sci-Fi    8642
22        Sport    4793
23    Talk-Show      43
24     Thriller   30853
25          War    7369
26      Western    5709


In [13]:
# Print list as txt file called genres_list
genre_counts.to_csv('data/genre_counts.txt', index=False, sep='\t')

In [14]:
# from genre_counts print list of the 10 Genres using count column
top_genres = genre_counts.nlargest(10, 'Count')
print(top_genres)
with open('data/top_genres.txt', 'w') as f:
    for index, row in top_genres.iterrows():
        f.write(f"{row['Genre']}: {row['Count']}\n")
        # save top_genres as csv
top_genres.to_csv('data/top_genres.csv', index=False)

          Genre   Count
8         Drama  165242
5        Comedy   89496
7   Documentary   67529
20      Romance   39813
0        Action   35141
6         Crime   32169
24     Thriller   30853
14       Horror   25755
2     Adventure   21508
9        Family   15602


In [15]:
# use plotly to make a graph
#fig = px.bar(top_genres, x='Genre', y='Count', title='Top 10 Genres')
#fig.show()

In [16]:
# use plotly to make pie chart
fig_pie = px.pie(top_genres, names='Genre', values='Count', title='Top 10 Genres Distribution')
fig_pie.update_layout(title_x=0.48)  # Adjust the title placement
fig_pie.show()

In [None]:
# Find 10 highest rated 'Movie' titles
top_rating = movies.nlargest(10, 'Rating')[['Title', 'Rating']]
# Print list
print(top_rating.to_string(index=False))

                                            Title  Year  Rating
                 Auf den Spuren des Hans im Glück  2006    10.0
                                 Broadway Legends  2002    10.0
                                          Kaputol  2019    10.0
                                       D on Dance  2019    10.0
                              Rainy in Glenageary  2019    10.0
                                It's a Love Thang  2019    10.0
Love Live! Series 9th Anniversary LOVE LIVE! FEST  2020    10.0
                               Olu Bliss: Dive In  2018    10.0
                            Tetonica Castro: Home  2011    10.0
                                             Ixel  2014    10.0


In [18]:
# Find the average rating amongst all titles
average_rating = movies['Rating'].mean()
print("Average Rating:", average_rating)

Average Rating: 6.232101788902386


In [None]:
# Find 10 highest voted 'Movie' titles
top_vote = movies.nlargest(10, 'Votes')[['Title', 'Votes']]
top_vote.insert(1, ' ', ' ')
# Print list
print(top_vote.to_string(index=False))

                                            Title  Year   Votes
                         The Shawshank Redemption  1994 3018863
                                  The Dark Knight  2008 2995152
                                        Inception  2010 2662142
                                       Fight Club  1999 2441331
                                     Forrest Gump  1994 2360083
                                     Pulp Fiction  1994 2316350
                                     Interstellar  2014 2309781
                                       The Matrix  1999 2139191
                                    The Godfather  1972 2106844
The Lord of the Rings: The Fellowship of the Ring  2001 2091781


In [21]:
# Find the average vote count amongst all titles
average_votes = movies['Votes'].mean()
print("Average Vote Count:", average_rating)

Average Vote Count: 6.232101788902386


In [22]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Drama'
drama = movies[movies['Genres'].str.contains('Drama', na=False)].reset_index(drop=True)
len(drama)
drama.head(25)
# Write the 'Drama' Movies DataFrame to a CSV file
drama.to_csv('data/drama_mov.csv', index=False) 

In [23]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Comedy'
comedy = movies[movies['Genres'].str.contains('Comedy', na=False)].reset_index(drop=True)
len(comedy)
# Write the 'Comdey' Movies DataFrame to a CSV file
comedy.to_csv('data/comedy_mov.csv', index=False) 

In [24]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Documentary'
documentary = movies[movies['Genres'].str.contains('Documentary', na=False)].reset_index(drop=True)
len(documentary)
# Write the 'Documentary' Movies DataFrame to a CSV file
documentary.to_csv('data/documentary_mov.csv', index=False) 

In [25]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Romance'
romance = movies[movies['Genres'].str.contains('Romance', na=False)].reset_index(drop=True)
len(romance)
# Write the 'Romance' Movies DataFrame to a CSV file
romance.to_csv('data/romance_mov.csv', index=False) 

In [26]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Action'
action = movies[movies['Genres'].str.contains('Action', na=False)].reset_index(drop=True)
len(action)
# Write the 'Action' Movies DataFrame to a CSV file
action.to_csv('data/action_mov.csv', index=False) 

In [27]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Crime'
crime = movies[movies['Genres'].str.contains('Crime', na=False)].reset_index(drop=True)
len(crime)
# Write the 'Crime' Movies DataFrame to a CSV file
crime.to_csv('data/crime_mov.csv', index=False) 

In [28]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Thriller'
thriller = movies[movies['Genres'].str.contains('Thriller', na=False)].reset_index(drop=True)
len(thriller)
# Write the 'Thriller' Movies DataFrame to a CSV file
thriller.to_csv('data/thriller_mov.csv', index=False) 

In [29]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Horror'
horror = movies[movies['Genres'].str.contains('Horror', na=False)].reset_index(drop=True)
len(horror)
# Write the 'Horror' Movies DataFrame to a CSV file
horror.to_csv('data/horror_mov.csv', index=False) 

In [30]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Adventure'
adventure = movies[movies['Genres'].str.contains('Adventure', na=False)].reset_index(drop=True)
len(adventure)
# Write the 'adventure' Movies DataFrame to a CSV file
adventure.to_csv('data/adventure_mov.csv', index=False) 

In [31]:
# Create a new DataFrame 'drama' from 'movies' DataFrame where 'Genres' contains 'Family'
family = movies[movies['Genres'].str.contains('Family', na=False)].reset_index(drop=True)
len(family)
# Write the 'Family' Movies DataFrame to a CSV file
family.to_csv('data/family_mov.csv', index=False) 