## Importing libs and setting up plots  

In [None]:
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
# Use the ggplot theme and make every figure 12x8 inches by default.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)

## Extracting data from CSV

In [None]:
# Forward slashes are accepted on Windows, Linux and macOS alike, whereas a
# hard-coded backslash separator only resolves on Windows.
df = pd.read_csv('dataset/movies.csv')

## Displaying information about the data read in the CSV

In [None]:
# Preview the first rows of the dataframe.
df.head()

In [None]:
# Report, for every column, the share of missing values as a real percentage.
for col in df.columns:
    # The mean of the boolean null-mask is a 0-1 fraction; scale it to 0-100
    # so that the number matches the '%' sign printed after it.
    percentage_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, percentage_missing))

In [None]:
# Show the dtype of each column.
df.dtypes

## Adjusting the **budget**, **gross**, and **released** columns

In [None]:
# Missing monetary values are replaced by 0 so both columns can be stored as
# 64-bit integers.
for money_column in ('budget', 'gross'):
    df[money_column] = df[money_column].fillna(0).astype('int64')
df.head()

In [None]:
# Date layouts tried in order, from most to least specific.
accepted_formats = ['%B %d, %Y', '%B %Y', '%Y']

# Drop the parenthesised suffix (and the whitespace before it) from each
# 'released' value so that only the date text remains.
cleaned_string_dates = df['released'].str.replace(r'\s*\(.*\)', '', regex=True)

parsed_dates = []
for date_item in cleaned_string_dates:
    # Stays NaT when no format matches.
    converted_item = pd.NaT
    for date_format in accepted_formats:
        try:
            converted_item = pd.to_datetime(date_item, format=date_format)
            break
        except (ValueError, TypeError):
            # This layout does not fit the value; try the next one. Only
            # parsing failures are swallowed, so unrelated errors (and
            # KeyboardInterrupt) still surface.
            continue
    parsed_dates.append(converted_item)

# Build the series on df's own index so the assignment below lines up row by
# row even when df does not carry a default 0..n-1 index.
cleaned_released_dates = pd.to_datetime(
    pd.Series(parsed_dates, index=df.index), errors='coerce'
)
# Nullable Int64 keeps missing years as <NA> instead of forcing floats; the
# year is then stored as text.
df['yearcorrect'] = cleaned_released_dates.dt.year.astype('Int64').astype(str)
df.head()

## Sorting by the **gross** column and listing the unique **company** values

In [None]:
# Order the rows from highest to lowest gross; sort_values returns a new frame.
df = df.sort_values('gross', ascending=False)
# Display each distinct company value once, sorted in descending order.
df['company'].drop_duplicates().sort_values(ascending=False)

## Scatter plot with the **budget** and the **gross** columns

In [None]:
# Budget is drawn on the x-axis and gross on the y-axis, so the axis labels
# have to name them in that same order.
plt.scatter(x=df['budget'], y=df['gross'])
plt.title('Budget VS Gross Earnings')
plt.xlabel('Budget For Film')
plt.ylabel('Gross Earnings')
plt.show()

## Regression plot of **budget** vs **gross** earnings

In [None]:
# Scatter of budget against gross (green points) with the fitted regression
# line drawn in blue.
sns.regplot(
    data=df,
    x='budget',
    y='gross',
    scatter_kws=dict(color='green'),
    line_kws=dict(color='blue'),
)

## Displaying correlations in the dataframe

In [None]:
# Restrict the frame to its numeric columns, then show their pairwise
# correlations.
df_correlations = df.select_dtypes(include='number')
df_correlations.corr()

## Displaying correlations as a heatmap of the dataframe

In [None]:
# Draw the numeric-column correlations as an annotated heatmap.
correlation_matrix = df_correlations.corr()
sns.heatmap(correlation_matrix, annot=True)
# Title spelling corrected ("Matric" -> "Matrix").
plt.title('Correlation Matrix For Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

## Categorizing the columns that are not numerical

In [None]:
# Work on a copy: plain assignment would only alias `df`, so encoding the
# object columns below would overwrite the original values in `df` as well.
df_catagorized = df.copy()
for col_name in df_catagorized.columns:
    if df_catagorized[col_name].dtype == 'object':
        # Replace every distinct value with its integer category code
        # (pandas uses -1 as the code for missing values).
        df_catagorized[col_name] = df_catagorized[col_name].astype('category').cat.codes
df_catagorized.head()

## Showing heatmap with the non-numerical data categorized

In [None]:
# Annotated heatmap of the correlations across every column of the encoded
# frame.
sns.heatmap(df_catagorized.corr(), annot=True)
# Title spelling corrected ("Matric" -> "Matrix").
plt.title('Correlation Matrix For Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

## Converting the correlation matrix from dataframe into a series of correlation pairs.

In [None]:
# Compute the correlation matrix, then flatten it into a Series whose index
# holds the (column, column) pair each value belongs to.
correlation_mat = df_catagorized.corr(method='pearson')
corr_pairs = correlation_mat.unstack()
corr_pairs

## Sorting correlation pairs in ascending order based on their correlation values

In [None]:
# Rank the pairs from the lowest correlation value to the highest.
sorted_values = corr_pairs.sort_values(ascending=True)
sorted_values

## Obtaining only the highest correlations

In [None]:
# Keep only the pairs whose correlation is strictly greater than 0.5.
# NOTE: pairs of a column with itself are not filtered out here.
high_correlation = sorted_values.loc[sorted_values > 0.5]
high_correlation