In [1]:
# Import necessary dependencies
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
# Plot configuration: use the 'ggplot' style sheet and give every matplotlib
# figure a default size of 12 x 8 (width, height).
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Read data from the csv.
# The path uses a forward slash so the same cell works on Windows, Linux and
# macOS (a hard-coded backslash only resolves on Windows).
df = pd.read_csv('Dataset/movies.csv')
df

In [None]:
# Preview the first five rows of the dataset
df.head(5)

In [None]:
# Report the share of missing values in every column.
# isnull().mean() yields a fraction in [0, 1]; it is scaled by 100 so the
# number printed next to the '%' sign really is a percentage.
missing_percentages = df.isnull().mean() * 100
for col, percentage_missing in missing_percentages.items():
    print('{} - {:.2f}%'.format(col, percentage_missing))

In [None]:
# Display the data type of each column; the result is the cell's last
# expression, so the notebook renders it as the cell output.
df.dtypes

In [None]:
# Convert the monetary columns to 64-bit integers.
# Missing values are replaced with 0 first, because int64 cannot hold NaN.
for money_column in ('budget', 'gross'):
    df[money_column] = df[money_column].fillna(0).astype('int64')
df.head()

In [44]:
# Clean the 'released' strings and convert them to dates.
# Formats are tried in order; for each row the first format that parses wins.
accepted_formats = ['%B %d, %Y', '%B %Y', '%Y']
# Matches the trailing "(Country)" suffix together with any whitespace before it.
country_name_pattern = r'\s*\(.*\)'

released_clean = df['released'].str.replace(country_name_pattern, '', regex=True)

# Parse the whole column once per format instead of row by row.
# errors='coerce' turns non-matching values into NaT (no exception to swallow),
# and fillna only fills rows that every earlier format left unparsed.
# The result keeps df's own index, so the assignment below stays row-aligned.
released_dates = pd.Series(pd.NaT, index=released_clean.index, dtype='datetime64[ns]')
for date_format in accepted_formats:
    parsed_with_format = pd.to_datetime(released_clean, format=date_format, errors='coerce')
    released_dates = released_dates.fillna(parsed_with_format)

# Report values that were present but matched none of the accepted formats
# (rows whose 'released' value is missing are not reported).
unconverted_items = released_clean[released_dates.isna() & released_clean.notna()]
for date_item in unconverted_items:
    print(f'Date {date_item} could not be converted to formats: {accepted_formats}')

# Create correct year column.
# Int64 (nullable) keeps the years integral even when some dates are NaT;
# after the cast to str, rows without a parsed date hold the text '<NA>'.
df['yearcorrect'] = released_dates.dt.year.astype('Int64').astype(str)
df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,correctyear,yearcorrect
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772,Warner Bros.,146.0,1980,1980
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106,Columbia Pictures,104.0,1980,1980
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067,Lucasfilm,124.0,1980,1980
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539,Paramount Pictures,88.0,1980,1980
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344,Orion Pictures,98.0,1980,1980
