In [None]:
#Setting up the project

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

pd.options.mode.chained_assignment = None

df = pd.read_csv(r'D:\Data Analytics\Portfolio Alex\archive\movies.csv')

In [None]:
# Taking a quick look

df.head()

In [None]:
#Searching for missing data

for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}'.format(col, pct_missing))

In [None]:
#Making sure the column types are accurate.

df.dtypes

In [None]:
#Changing the data type to remove the unnecessary decimals polluting the table.
df['budget'] = df['budget'].astype('int64')
df['gross'] = df['gross'].astype('int64')
#Update.26.09.2023
#This database keeps getting updated, so this piece of code doesn't work
#unless you clean the data every time.

In [None]:
#Quickly taking a look to see the changes
df

In [None]:
#The two different year related collumns are slightly different.
#We'll create a third column from where we'll pull the proper year, when necessary.
df['yearcorrect'] = df['released'].astype(str).str[:4]
df

In [None]:
df.sort_values(by=['gross'], inplace=False, ascending=False)

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
#Using a scatterplot to find any corellation between the 'budget' and 'gross' columns.
plt.scatter(x=df['budget'], y=df['gross'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Gross Earnings')
plt.ylabel('Budget for Film')
plt.show()

In [None]:
df.head()

In [None]:
#Applying Seaborn to the scatterplot and increasing the visibility for the different elements.
sns.regplot(x='budget', y='gross', data=df, scatter_kws={"color": "red"}, line_kws={"color":"blue"})

In [None]:
#Digging deeper into the correlation. For now it seems if the budget increases, so does the gross.

In [None]:
df.corr(method='pearson') 
#I could have also used different methods, like Kendall's or Spearmen's, but 
#for this project it doesn't matter too much.

In [None]:
#There is indeed high correlation between 'budget' and 'gross'.
#The original theory seems good.

In [None]:
#For more visual oriented results:
corellation_matrix = df.corr(method='pearson')
sns.heatmap(corellation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [None]:
#Moving on to the 'Company' column
df.head()

In [None]:
#Applying nummerical values to the different companies, 
#so the companies' names can be applied to the matrix
df_numerized = df
for col_name in df_numerized.columns:
    if(df_numerized[col_name].dtype == 'object'):
        df_numerized[col_name] = df_numerized[col_name].astype('category')
        df_numerized[col_name] = df_numerized[col_name].cat.codes
        
df_numerized


In [None]:
#Checking if it worked
df

In [None]:
#Same matrix with all the newly numerized columns
corellation_matrix = df.corr(method='pearson')
sns.heatmap(corellation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [None]:
df_numerized.corr()

In [None]:
#Sorting and linearizing the matrix into simple columns for even better visibility

correlation_mat = df.apply(lambda x: x.factorize()[0]).corr()

corr_pairs = correlation_mat.unstack()

print(corr_pairs)


In [None]:
sorted_pairs = corr_pairs.sort_values()
sorted_pairs

In [51]:
high_corr = sorted_pairs[(sorted_pairs) > 0.5]

high_corr

star         company        0.527363
company      star           0.527363
writer       company        0.547126
company      writer         0.547126
director     company        0.552420
company      director       0.552420
             gross          0.587727
gross        company        0.587727
company      name           0.591834
name         company        0.591834
company      year           0.601720
year         company        0.601720
company      released       0.607607
released     company        0.607607
star         writer         0.676284
writer       star           0.676284
star         director       0.682358
director     star           0.682358
name         star           0.731509
star         name           0.731509
             gross          0.735612
gross        star           0.735612
name         director       0.745905
director     name           0.745905
writer       director       0.748875
director     writer         0.748875
             gross          0.750911
g

In [None]:
#End Result
#Votes and Budgets have the highest correlation to Gross Earnings

#The creator (company) matters, but way less than other things, 
#at least concerning the gross earnings. 