## Import libraries and set up plotting defaults

In [None]:
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
# Apply the ggplot theme first, then make 12x8 the default figure size
plt.style.use('ggplot')
matplotlib.rc('figure', figsize=(12, 8))

In [None]:
# Read data from the csv
# A forward slash is accepted as a path separator on Windows as well as on
# Linux/macOS, whereas a backslash only resolves on Windows.
df = pd.read_csv('dataset/movies.csv')

In [None]:
# Preview the first five rows of the dataframe
df.head(n=5)

## Display the percentage of missing (null) values per column

In [None]:
# Report, for every column, how much of its data is missing.
for col in df.columns:
    # isnull() yields a boolean mask whose mean is the missing *fraction*
    # (0..1); scale it by 100 so the number matches the '%' sign printed.
    percentage_missing = np.mean(df[col].isnull()) * 100
    print('{} - {:.2f}%'.format(col, percentage_missing))

## Display data types for all the columns

In [None]:
# Show the dtype pandas assigned to each column of the dataframe
df.dtypes

## Clean the data

In [None]:
# Missing amounts become 0, after which both money columns are stored as int64
for money_column in ('budget', 'gross'):
    filled_values = df[money_column].fillna(0)
    df[money_column] = filled_values.astype('int64')
df.head()

In [None]:
# Clean date objects to convert them to dates
# Removes a trailing parenthesised suffix from each 'released' value
# (presumably the country name, going by the variable name).
country_name_pattern = r'\s*\(.*\)'
# Formats are tried in this order; the first one that parses wins.
accepted_formats = ['%B %d, %Y', '%B %Y', '%Y']

cleaned_string_dates = df['released'].str.replace(country_name_pattern, '', regex=True)

# Verify if dates are in the expected format
parsed_dates = []
for date_item in cleaned_string_dates:
    converted_item = None
    for date_format in accepted_formats:
        try:
            converted_item = pd.to_datetime(date_item, format=date_format)
        except (ValueError, TypeError):
            # Only a parsing failure means "try the next format". Anything
            # else (e.g. KeyboardInterrupt) must propagate, not be swallowed.
            continue
        break

    if converted_item is None:
        print(f'Date {date_item} could not be converted to formats: {accepted_formats}')
    parsed_dates.append(converted_item)

# Reuse the index of the source column so the assignment into df below aligns
# row-for-row even when df is not in its original 0..n-1 order.
cleaned_released_dates = pd.Series(
    parsed_dates, index=cleaned_string_dates.index, dtype='object'
)

# Create correct year column
cleaned_released_dates = pd.to_datetime(cleaned_released_dates, errors='coerce')
# Int64 (nullable) keeps years as whole numbers despite missing dates; the
# final cast stores the year as text.
df['yearcorrect'] = cleaned_released_dates.dt.year.astype('Int64').astype(str)
df.head()

## Sort the data and display plots

In [None]:
# Order the rows from the highest 'gross' value to the lowest
df = df.sort_values('gross', ascending=False)
# Keep one entry per company and show them in descending order
distinct_companies = df['company'].drop_duplicates()
distinct_companies.sort_values(ascending=False)

In [None]:
# Scatter plot with the budget and the gross
plt.scatter(x=df['budget'], y=df['gross'])
plt.title('Budget VS Gross Earnings')
# Labels must follow the data: budget is on the x-axis, gross on the y-axis.
plt.xlabel('Budget For Film')
plt.ylabel('Gross Earnings')
plt.show()

In [None]:
# Seaborn regression plot of gross against budget: green points, blue fit line
point_style = {'color': 'green'}
fit_line_style = {'color': 'blue'}
sns.regplot(data=df, x='budget', y='gross', scatter_kws=point_style, line_kws=fit_line_style)

In [None]:
# Restrict to numeric columns, then show their pairwise correlations
df_correlations = df.select_dtypes(include='number')
df_correlations.corr()

In [None]:
# Looking at a heatmap of the dataframe
correlation_matrix = df_correlations.corr()
# annot=True writes each correlation value inside its cell.
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix For Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [None]:
# Categorizing the columns that are not numerical
# Work on a copy: a plain assignment would only alias `df`, so replacing the
# text columns with integer codes below would overwrite the original data.
df_catagorized = df.copy()

for col_name in df_catagorized.columns:
    if df_catagorized[col_name].dtype == 'object':
        # Every distinct value is mapped to an integer code; pandas uses -1
        # as the code for missing values.
        df_catagorized[col_name] = df_catagorized[col_name].astype('category').cat.codes

df_catagorized.head()

In [None]:
# Looking at a heatmap with the non-numerical data categorized
# annot=True writes each correlation value inside its cell.
sns.heatmap(df_catagorized.corr(), annot=True)
plt.title('Correlation Matrix For Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [None]:
# Convert the correlation matrix from a dataframe into a series of correlation pairs.
# unstack() on a DataFrame returns a Series with a two-level index, so each
# entry is the correlation for one (column, column) pair.
correlation_mat = df_catagorized.corr()
corr_pairs = correlation_mat.unstack()
corr_pairs

In [None]:
# Order the pairs from the lowest correlation value to the highest
sorted_values = corr_pairs.sort_values(ascending=True)
sorted_values

In [None]:
# Keep only the pairs whose correlation is strictly above 0.5
# NOTE(review): pairs of a column with itself are not excluded here - confirm
# that is intended.
is_strong = sorted_values.gt(0.5)
high_correlation = sorted_values[is_strong]
high_correlation