In [2]:
# Import necessary dependencies
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [3]:
# Apply the ggplot theme, then make 12 x 8 the default (width, height) for every figure
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Read the movies dataset from the CSV file.
# A forward slash is used as the separator because it is accepted on
# Windows, Linux and macOS alike (a backslash only works on Windows).
df = pd.read_csv('Dataset/movies.csv')
df

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# Report the percentage of missing values in each column.
# isnull().mean() yields the fraction of missing entries (0..1) per column,
# so it is scaled by 100 to match the '%' sign printed next to it.
missing_percentages = df.isnull().mean() * 100
for col, percentage_missing in missing_percentages.items():
    print('{} - {:.2f}%'.format(col, percentage_missing))

In [None]:
# Display the data type pandas assigned to each column of df
df.dtypes

In [None]:
# Change the type of columns
df['budget'] = df['budget'].fillna(0).astype('int64')
df['gross'] = df['gross'].fillna(0).astype('int64')
df.head()

In [None]:
# Clean the 'released' strings and convert them to real dates
accepted_formats = ['%B %d, %Y', '%B %Y', '%Y']
country_name_pattern = r'\s*\(.*\)'
# Strip the parenthesised suffix so that only the date text remains
released_clean = df['released'].str.replace(country_name_pattern, '', regex=True)

# Parse the whole column once per format instead of row by row.
# The result shares df's index, so the later column assignment aligns
# correctly even when df does not have a default 0..n-1 index.
released_dates = pd.Series(pd.NaT, index=released_clean.index, dtype='datetime64[ns]')
for date_format in accepted_formats:
    # errors='coerce' turns non-matching strings into NaT instead of raising,
    # which replaces the need for a catch-all exception handler.
    parsed_with_format = pd.to_datetime(released_clean, format=date_format, errors='coerce')
    # Only fill rows that are still unparsed: earlier formats keep priority
    released_dates = released_dates.fillna(parsed_with_format)

# Report values that are present but match none of the accepted formats
# (rows whose 'released' value is missing are skipped silently)
unconverted_items = released_clean[released_dates.isna() & released_clean.notna()]
for date_item in unconverted_items:
    print(f'Date {date_item} could not be converted to formats: {accepted_formats}')

# Create correct year column
# NOTE: rows without a parsable date end up as the string '<NA>' here
df['yearcorrect'] = released_dates.dt.year.astype('Int64').astype(str)
df.head()

In [None]:
# Show the movies ordered from highest to lowest gross (df itself is left unchanged)
df.sort_values('gross', ascending=False)