In [None]:
from netflix_data_reader import NetflixReader

# Location of the Netflix CSV, relative to the notebook's working directory.
netflix_csv_path = "./data/netflix_data.csv"

# Build the reader, hand it the CSV path, then run its preprocessing step.
# Later cells access the resulting frame through `reader.netflix_data`.
reader = NetflixReader()
reader.read_netflix_data(file_path=netflix_csv_path)
reader.preprocess()

### imdb_score and imdb_votes mean and standard deviation

In [None]:
# Bind the two IMDB columns once so each statistic reads off a short name.
imdb_scores = reader.netflix_data['imdb_score']
imdb_votes = reader.netflix_data['imdb_votes']

# Report mean and standard deviation for the scores, then for the vote counts.
print(f"IMDB Score (mean): {imdb_scores.mean()}")
print(f"IMDB Score (std): {imdb_scores.std()}")
print(f"IMDB Votes (mean): {imdb_votes.mean()}")
print(f"IMDB Votes (std): {imdb_votes.std()}")

In [None]:
# Preview only the first rows instead of emitting the whole frame as cell output;
# this is enough to check the column layout produced by the preprocessing step.
reader.netflix_data.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# create histograms of imdb_score and imdb_votes

# Select the style BEFORE any figure is created. plt.style.use only updates
# rcParams, which are consulted when figures and axes are built, so calling it
# after plotting (as this cell used to) left the first histogram in the default
# style on a fresh kernel and only styled the second one.
# The style stays active for the cells that follow, exactly as it did before.
plt.style.use('Solarize_Light2')


def plot_imdb_histogram(values, title, xlabel, ylabel, log=False, xtick_rotation=None):
    """Draw and show a 50-bin histogram with the notebook's shared formatting.

    Parameters
    ----------
    values : array-like
        Data passed straight to ``Axes.hist``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    log : bool, default False
        Forwarded to ``Axes.hist``; puts the count axis on a log scale.
    xtick_rotation : float or None, default None
        Rotation (degrees) applied to the x tick labels; None keeps the default.
    """
    fig, ax = plt.subplots()
    ax.hist(values, bins=50, log=log, alpha=0.5, color='#8da0cb', edgecolor='#4a4a4a')
    ax.set_title(title, fontsize=18, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=14, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=14, fontweight='bold')
    ax.tick_params(axis='both', labelsize=12)
    if xtick_rotation is not None:
        # Rotate the x-axis labels for better readability
        ax.tick_params(axis='x', labelrotation=xtick_rotation)
    ax.grid(True, linestyle='--', alpha=0.5)
    plt.show()


# histogram of imdb_score
plot_imdb_histogram(
    reader.netflix_data['imdb_score'],
    title='IMDB Score Distribution',
    xlabel='IMDB Score',
    ylabel='Frequency',
)

# histogram of imdb_votes (log-scaled counts; it previously had no title)
plot_imdb_histogram(
    reader.netflix_data['imdb_votes'],
    title='IMDB Votes Distribution',
    xlabel='IMDB Votes',
    ylabel='Frequency (log scale)',
    log=True,
    xtick_rotation=90,
)


In [None]:
# Scatter plots of each IMDB metric against release_year.
# Each entry is (column to plot on the y-axis, figure title, y-axis label).
release_years = reader.netflix_data['release_year']
scatter_specs = [
    ('imdb_score', 'IMDB Score vs. Release Year', 'IMDB Score'),
    ('imdb_votes', 'IMDB Votes vs. Release Year', 'IMDB Votes'),
]

for y_column, plot_title, y_label in scatter_specs:
    # One figure per metric, drawn and shown before moving on to the next.
    plt.scatter(release_years, reader.netflix_data[y_column], alpha=0.3, color='b')
    plt.title(plot_title)
    plt.xlabel('Release Year')
    plt.ylabel(y_label)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()


In [None]:
# Work on the frame held by the reader.
df = reader.netflix_data

# Sum each of the trailing 19 columns and order the totals from largest to smallest.
# NOTE(review): presumably these 19 columns are the per-genre indicator columns
# added during preprocessing — confirm against NetflixReader.
genre_columns = df.iloc[:, -19:]
genre_freq = genre_columns.sum().sort_values(ascending=False)

# Bar chart of the totals, one bar per column.
fig, ax = plt.subplots()
ax.bar(genre_freq.index, genre_freq.values)
ax.set_title('Genre Frequency in Netflix Dataset')
ax.set_xlabel('Genre')
ax.set_ylabel('Frequency')

# Turn the x tick labels vertical so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)

plt.show()

### Box plot of IMDB score by genre

In [None]:
# TODO: General improvements

import pandas as pd
import matplotlib.pyplot as plt

# Work on the frame already held by the reader (nothing is re-read here).
df = reader.netflix_data

# Select the trailing 19 columns and the IMDB scores.
# NOTE(review): the positional slice assumes the last 19 columns are exactly the
# per-genre indicator columns — confirm against NetflixReader.preprocess().
df_genres = df.iloc[:, -19:]
df_imdb = df['imdb_score']
genre_names = list(df_genres.columns)

# Reshape to long format (one row per title/genre pair) and keep only the rows
# where the title belongs to the genre. `.eq(True)` is the same comparison as
# `== True`, so it keeps working for both boolean and 0/1 indicator columns.
df_genre_scores = (
    pd.concat([df_imdb, df_genres], axis=1)
    .melt(id_vars=['imdb_score'], var_name='genre', value_name='in_genre')
    .loc[lambda long_form: long_form['in_genre'].eq(True)]
)

# Split the scores per genre in a single pass instead of re-filtering the whole
# long frame once per genre. Missing scores are dropped because boxplot does not
# skip NaN values: one NaN would leave that genre's box empty.
scores_by_genre = {
    genre: scores.dropna()
    for genre, scores in df_genre_scores.groupby('genre')['imdb_score']
}
# A genre with no member rows still gets an (empty) slot so the boxes stay
# aligned with the tick labels below.
no_scores = df_imdb.iloc[:0]
box_data = [scores_by_genre.get(genre, no_scores) for genre in genre_names]

# create the boxplot, one box per genre in the original column order
plt.figure(figsize=(10, 6))
plt.boxplot(box_data)
plt.title('IMDB Scores by Genre')
plt.xlabel('Genre')
plt.ylabel('IMDB Score')
# boxplot places the boxes at positions 1..n, so label those positions explicitly
plt.xticks(range(1, len(genre_names) + 1), genre_names, rotation=90)
plt.show()
