In [None]:
import pandas as pd
import pandas as pd
import numpy as np
from datetime import datetime
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:

# Add ../utils to the import search path so that `import functions` below can
# resolve (presumably functions.py lives in ../utils — confirm). The path is
# relative, so it is interpreted against the notebook's working directory.
import sys
sys.path.append('../utils')
import functions

In [None]:
# Read the raw export into `df` and preview the first rows.
raw_csv_path = '../data/local/raw/doesthedogdie.com.csv'
df = pd.read_csv(raw_csv_path)
df.head()

In [None]:
# Overview of the raw frame via the project helper (what exactly it reports is
# defined in the `functions` module, not visible here).
functions.show_basic_info(df)

Round 'yes_ratio' to 3 decimal places

In [None]:
# Keep three decimal places in the ratio column.
df['yes_ratio'] = df['yes_ratio'].round(decimals=3)

Check if there are duplicate titles

In [None]:
# Flag every repeat of a title (the first occurrence is not flagged).
is_repeated_title = df['title'].duplicated()

# Rows that repeat an earlier title, and how many of them there are.
duplicates = df[is_repeated_title]
duplicate_count = is_repeated_title.sum()

print(f"Number of duplicates in 'title': {duplicate_count}")

### Check the media-type prefixes in the 'title' column

In [None]:
# The part of each title before the first underscore, de-duplicated.
title_prefixes = df['title'].str.split('_').str[0]
unique_titles = title_prefixes.unique()

print(unique_titles)

Extract media type and remove it from title column

In [None]:
# Split on the FIRST underscore only (n=1). Splitting on every underscore and
# keeping element [1] would silently truncate any title that itself contains
# an underscore (e.g. 'Movie_A_B' would become 'A' instead of 'A_B').
title_parts = df['title'].str.split('_', n=1)

df['media_type'] = title_parts.str[0]  # prefix before the first underscore
df['title'] = title_parts.str[1]       # everything after it (NaN if no underscore)

df.head()

In [None]:
# List the distinct media-type prefixes extracted from the titles.
print(df['media_type'].unique())

#### Create new df with media_type 'Movie' only

In [None]:
# Take an explicit copy of the 'Movie' rows: later cells add and drop columns
# on this frame, and doing that on a boolean-indexed slice of `df` triggers
# pandas' SettingWithCopyWarning.
doesthe_movies = df[df['media_type'] == 'Movie'].copy()

# Preview only; printing the whole frame produces a wall of plain text.
doesthe_movies.head()

In [None]:
# Export the movie-only subset without the index column. At this point the
# frame still contains 'media_type' and has no 'clean_title' yet (both change
# in later cells).
# NOTE(review): this writes a derived file into the raw/ folder — confirm that is intended.
doesthe_movies.to_csv('../data/local/raw/doesthedog_movies.csv', index=False)

In [None]:
# Use double quotes for the f-string so the single-quoted column key inside the
# braces is valid on Python < 3.12 (reusing the outer quote type is a
# SyntaxError before PEP 701).
print(f"Unique titles in doesthe_movies: {doesthe_movies['title'].nunique()}")

In [None]:
# Build the normalised title column with the project helper and attach it via
# assign(), which returns a new frame. This avoids writing a column into what
# may be a slice of `df` (SettingWithCopyWarning) and keeps the cell re-runnable.
doesthe_movies = doesthe_movies.assign(
    clean_title=functions.prepare_clean_titles(doesthe_movies, 'title')
)
doesthe_movies.head()


In [None]:
# Overview of the movie-only frame via the project helper (output format is
# defined in the `functions` module, not visible here).
functions.show_basic_info(doesthe_movies)

Drop media_type column

In [None]:
# Every remaining row is a movie, so the column carries no information.
# Reassign instead of using inplace=True (no in-place mutation of a possible
# slice), and pass errors='ignore' so re-running the cell does not raise
# KeyError once the column is already gone.
doesthe_movies = doesthe_movies.drop(columns=['media_type'], errors='ignore')

#### Extract category_name unique values

In [None]:
# Distinct, non-missing category names in alphabetical order.
present_categories = doesthe_movies['category_name'].dropna()
unique_values = sorted(present_categories.unique())
print(f'Unique values in category column: {unique_values}')

In [None]:
# Double-quoted f-string: nesting the same quote type inside the braces is a
# SyntaxError on Python < 3.12 (allowed only since PEP 701).
print(f"Unique titles in doesthe_movies: {doesthe_movies['title'].nunique()}")

### Keep rows where 'yes' is at least as large as 'no'
Rows where 'no' outnumbers 'yes' are dropped, so only events that the votes say do take place remain (ties are kept).

In [None]:
# Keep only rows whose 'yes' count is not smaller than the 'no' count, as an
# independent copy so later edits do not touch doesthe_movies.
yes_not_outvoted = doesthe_movies['yes'] >= doesthe_movies['no']
events_df = doesthe_movies.loc[yes_not_outvoted].copy()

Group categories in new column

In [None]:
# For every row, store the comma-separated list of all categories recorded for
# the same clean_title. transform() keeps one row per original (title, category)
# row, so the joined string is repeated on each of them.
# Missing category names are skipped: ', '.join() raises TypeError on NaN.
events_df['events'] = (
    events_df
    .groupby('clean_title')['category_name']
    .transform(lambda names: ', '.join(names.dropna().astype(str)))
)

# The per-row category is now folded into 'events'.
events_df = events_df.drop(columns=['category_name'])

In [None]:
# Preview the first ten rows, including the new 'events' column.
events_df.head(10)

In [None]:
# Display the frame itself (the notebook shows a truncated view for large frames).
events_df

### Check and remove duplicates

In [None]:
# Report duplicates in events_df via the project helper (which columns it
# compares is defined in the `functions` module — not visible here).
functions.check_for_duplicates(events_df)

In [None]:
# NOTE(review): the return value is discarded. If this helper returns a new
# frame instead of modifying events_df in place, the de-duplication has no
# effect on the data used below — confirm against the `functions` module.
functions.remove_duplicates(events_df)

In [None]:
# Renumber the rows from 0 and discard the old index.
events_df = events_df.reset_index(drop=True)
# The CSV export is currently disabled:
# events_df.to_csv('../data/local/clean/doesthe_movie_events_df.csv', index=False)

Match the titles against the IMDb data and add the movie information

In [None]:
# Read the cleaned IMDb titles/ratings table and preview it.
imdb_csv_path = '../data/local/clean/imdb_titles_ratings.csv'
imdb_df = pd.read_csv(imdb_csv_path)
imdb_df.head()

In [None]:
# Inner join on 'clean_title': only titles present in both frames survive, and
# the key comparison is exact (case-sensitive).
combined_df = events_df.merge(imdb_df, how='inner', on='clean_title')


In [None]:
# Overview of the merged frame via the project helper (output format is
# defined in the `functions` module, not visible here).
functions.show_basic_info(combined_df)

In [None]:
# Preview the merged frame; overlapping 'title' columns carry _x/_y suffixes.
combined_df.head()

In [None]:
# Drop the per-category vote columns and the two suffixed title columns.
# events_df carries one row per (movie, category) — 'events' was built with
# transform(), which does not reduce rows — so once the per-category columns
# are gone those rows are exact copies of each other. De-duplicate so each
# matched movie is counted once in the statistics and plots below.
# NOTE(review): a no-op if functions.remove_duplicates already collapsed them.
combined_df = (
    combined_df
    .drop(columns=['title_x', 'yes', 'no', 'yes_ratio', 'title_y'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Inspect a larger sample (first 50 rows) of the trimmed frame.
combined_df.head(50)


In [None]:
# Basic statistics summary for numerical columns
# (describe() without arguments covers numeric columns only).
summary_stats = combined_df.describe()
print(summary_stats)


In [None]:
# Include both numeric and object columns in the summary
# (include='all' adds count/unique/top/freq for non-numeric columns).
summary_stats_all = combined_df.describe(include='all')
print(summary_stats_all)


In [None]:
# Break each movie's 'events' string on commas, giving one list per row.
events_per_movie = combined_df['events'].str.split(',')

# One entry per individual event, with surrounding whitespace removed.
all_events = events_per_movie.explode().str.strip()

# Occurrence count of every event, most frequent first.
event_stats = all_events.value_counts()

print(event_stats)


In [None]:
# How many distinct events occur, and which one occurs most often.
num_unique_events = all_events.nunique()
most_frequent_event = event_stats.idxmax()

print(f"Number of unique events: {num_unique_events}")
print(f"Most frequent event: {most_frequent_event}")


In [None]:
# Top 20 most frequent events (event_stats is already sorted by count).
top_events = event_stats.head(20)

# Horizontal bar chart: frequency on x, event name on y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_events.values, y=top_events.index, palette='viridis', ax=ax)
ax.set_title('Top 20 Most Frequent Events')
ax.set_xlabel('Frequency')
ax.set_ylabel('Event')
plt.show()


In [None]:

# # Create a word cloud for the most frequent events
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(event_stats)

# # Plot the word cloud
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.title('Word Cloud of Most Frequent Events')
# plt.axis('off')
# plt.show()

In [None]:
# Re-check the columns available before the numeric analysis below.
combined_df.head()

In [None]:
# Columns that take part in the correlation analysis.
numeric_columns = ['year', 'runtime', 'rating', 'votes']

# Pairwise correlations between those columns.
correlation_matrix = combined_df.loc[:, numeric_columns].corr()

# Annotated heatmap with the colour scale fixed to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numeric Columns')
plt.show()

In [None]:
# Create a pairplot to visualize relationships between numeric columns
# (uses numeric_columns from the correlation cell).
sns.pairplot(combined_df[numeric_columns])
# The title goes on the current figure; y=1.02 places it just above the grid.
plt.suptitle('Pairplot of Numeric Features', y=1.02)
plt.show()

In [None]:
# Turn the comma-separated 'genre' string into a list per movie (stored on
# combined_df itself as 'genre_list').
combined_df['genre_list'] = combined_df['genre'].str.split(',')

# One row per (movie, genre), with whitespace around genre names removed.
exploded_genre_df = (
    combined_df
    .explode('genre_list')
    .assign(genre_list=lambda frame: frame['genre_list'].str.strip())
)

# Mean rating of each genre, highest first.
rating_by_genre = exploded_genre_df.groupby('genre_list')['rating']
genre_rating_avg = rating_by_genre.mean().sort_values(ascending=False)

# Horizontal bar chart of the averages.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=genre_rating_avg.values, y=genre_rating_avg.index, palette='viridis', ax=ax)
ax.set_title('Average Rating by Genre')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Genre')
plt.show()

In [None]:
# Number of events per movie = number of comma-separated parts in 'events'.
event_lists = combined_df['events'].str.split(',')
combined_df['num_events'] = event_lists.map(len)

# Scatter of event count against rating.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=combined_df, x='num_events', y='rating', ax=ax)
ax.set_title('Number of Events vs. Rating')
ax.set_xlabel('Number of Events')
ax.set_ylabel('Rating')
plt.show()

In [None]:
# Relationship between how many votes a movie received and its rating.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=combined_df, x='votes', y='rating', ax=ax)
ax.set_title('Votes vs. Rating')
ax.set_xlabel('Number of Votes')
ax.set_ylabel('Rating')
plt.show()