In [None]:
!pip3 install pandas numpy matplotlib plotly

In [None]:
# import necessary libs
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Read ./imdb_top_1000.csv, then print summary statistics and the column
# dtypes, separated by a 30-character '#' rule.
df = pd.read_csv("./imdb_top_1000.csv")
print(df.describe(), "#" * 30, df.dtypes, sep="\n")

In [None]:
# Peek at five random rows (title, year, genre, director and the four stars).
# random_state pins the draw so Restart Kernel -> Run All always shows the
# same rows instead of a different sample on every execution.
sampled_data = df.sample(5, random_state=42)
print(sampled_data[['Series_Title', 'Released_Year', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']])

In [None]:
# Released_Year is read in as text. Calling pd.to_numeric() on it directly
# raises ValueError because at least one entry is not a year, and a raising
# cell breaks Restart Kernel -> Run All.
# So do not convert yet: coerce unparseable entries to NaN on a temporary
# result (df itself is left untouched) and list the raw values that failed.
unparseable_year = pd.to_numeric(df["Released_Year"], errors="coerce").isna()
print(df.loc[unparseable_year, "Released_Year"].value_counts(dropna=False))

In [None]:
# Show every row whose Released_Year holds the string "PG" instead of a year
# (exactly one such row was counted when this was first explored).
print(df[df["Released_Year"].eq("PG")])

In [None]:
# Overwrite the year of the row labelled 966 (presumably the "PG" row located
# above -- verify against that output). The value is written as a string
# because the column has not been converted to numbers yet.
df.at[966, "Released_Year"] = "1995"
print(df.loc[[966], :])

In [None]:
# With the stray value repaired, parse the year column into numbers and
# confirm the new dtype.
df["Released_Year"] = df["Released_Year"].pipe(pd.to_numeric)
print(df.dtypes)

In [None]:
# Keep only movies released from 1990 through 2020 (both ends inclusive).
filtered_movies = df[df['Released_Year'].between(1990, 2020)]
# Preview positional rows 1-9 of the result (the very first row is skipped).
print(filtered_movies.iloc[1:10])

In [None]:
# Count rows for every (year, genre string) pair, pivot the Genre level into
# columns, and replace the NaN of never-occurring combinations with 0.
# Genre is still the raw comma-separated string here, so each distinct
# combination of genres becomes its own column.
grouped = (
    filtered_movies
    .groupby(['Released_Year', 'Genre'])
    .size()
    .unstack('Genre')
    .fillna(0)
)

In [None]:
# Stacked bar chart: one bar per year, one segment per genre column.
grouped.plot.bar(stacked=True, figsize=(12, 8)).set(
    title='Stacked Bar Chart of Movie Genres per Year',
    xlabel='Year',
    ylabel='Number of Movies',
)
# Anchor the legend just outside the right edge of the axes so it does not
# cover the bars.
plt.legend(title='Genres', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()


In [None]:
# Pick one movie (index label 69) to try the genre split on before applying
# it to the whole dataset. Selecting a single label with .loc yields a Series.
single_row = df.loc[69]
print(single_row)

In [None]:
# single_row is a pd.Series; .to_frame() turns it into a one-column DataFrame
# and .T transposes that into a one-row DataFrame.
# The Genre string is then split on ", " into a list and explode() emits one
# row per list element. The lambda reads Genre from the one-row frame itself
# rather than passing the whole df['Genre'] column and relying on index
# alignment to pick out the matching entry.
updated_row = (
    single_row.to_frame().T
    .assign(Genre=lambda row_df: row_df['Genre'].str.split(', '))
    .explode('Genre')
)
print(updated_row)

In [None]:
# Same split-and-explode, now on the full dataset: each movie is repeated
# once per genre listed in its Genre string. Print the resulting row count.
exploded_df = df.assign(Genre=lambda frame: frame['Genre'].str.split(', ')).explode('Genre')
print(exploded_df.shape[0])

In [None]:
# Restrict the exploded rows to 1990-2020 (inclusive), count rows per
# (year, single genre) pair and pivot genres into columns, with 0 for
# combinations that never occur.
exploded_n_grouped = (
    exploded_df[exploded_df['Released_Year'].between(1990, 2020)]
    .groupby(['Released_Year', 'Genre'])
    .size()
    .unstack('Genre')
    .fillna(0)
)

In [None]:
# Same stacked bar chart as before, this time drawn from the exploded
# (one genre per row) counts.
exploded_n_grouped.plot.bar(stacked=True, figsize=(12, 8)).set(
    title='Stacked Bar Chart of Movie Genres per Year',
    xlabel='Year',
    ylabel='Number of Movies',
)
# Legend sits outside the plotting area, to the right of the axes.
plt.legend(title='Genres', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()

In [None]:
# Palette used for the per-genre bar segments.
colors = mpl.colormaps['tab20']
# Short alias for the exploded-and-grouped counts; its columns are the genres.
eng = exploded_n_grouped
genre_count = len(eng.columns)
print(genre_count)

In [None]:
def plot_data(dataframe,
              title='Stacked Bar Chart of Movie Genres per Year (1990 - 2020)',
              ylabel='Number of Movies'):
    """Draw a stacked bar chart with one bar per row and one segment per column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each row becomes one bar positioned at its index value; each column
        contributes one labelled segment stacked on top of the previous ones.
    title : str, optional
        Figure title. Defaults to the original hard-coded title.
    ylabel : str, optional
        Y-axis label. Defaults to the original hard-coded label.

    Notes
    -----
    Segment colours come from the notebook-level ``colors`` colormap. The
    figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    # Running height of each bar so the next segment starts where the last ended.
    bottom = np.zeros(len(dataframe.index))
    for i, (colname, col) in enumerate(dataframe.items()):
        # x positions come from the frame being plotted, not from the unrelated
        # global `grouped`, so any frame can be passed in.
        # The modulo cycles through the palette instead of clipping every
        # column past the last colour to the same shade.
        ax.bar(dataframe.index, col, bottom=bottom, label=colname, color=colors(i % colors.N))
        bottom += col.to_numpy()

    ax.set_title(title)
    ax.set_xlabel('Year')
    ax.set_ylabel(ylabel)
    # Place the legend outside the axes, to the right.
    ax.legend(title='Genres', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()

plot_data(eng)

In [None]:
# Normalize per year (the denominator is the year's movie count, not a genre total):
# the provided dataset is thin for certain years (e.g. < 20 movies in 2020), so raw counts are hard to compare across years.
# To normalize the dataset, divide the # of movies in a given genre by the # of movies in that year.

# Number of (non-exploded) movies per release year, indexed by year.
movie_count_per_year = filtered_movies['Released_Year'].value_counts()
print(movie_count_per_year)

In [None]:
# Divide each year's genre counts by that year's movie total; axis=0 matches
# movie_count_per_year to the rows (years) by index label.
# A movie is counted once per genre it lists, so a year's shares can add up
# to more than 1.
normalized_grouped = eng.div(movie_count_per_year, axis=0)
# Fixed random_state so the previewed rows are identical on every run.
print(normalized_grouped.sample(5, random_state=42))

In [None]:
# Redraw the stacked chart from the per-year normalized values.
# NOTE(review): the chart's y-axis label still reads 'Number of Movies',
# although the values plotted here are shares of each year's movie count.
plot_data(normalized_grouped)

In [None]:
# Title, director and rating of every movie released in 2009, selected with a
# single .loc call (row mask + column list) instead of two chained lookups.
df.loc[df["Released_Year"] == 2009, ["Series_Title", "Director", "IMDB_Rating"]]