In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress

These dependencies support the initial analysis of a limited dataset on the US movie industry over the last 20 years.
Future work may extend the analysis to the broader global movie industry using other data sources, add regression analysis, and make projections about the future of the industry.

In [2]:
# Load the cleaned IMDB movies/ratings export into a DataFrame
IMDB_movies_df = pd.read_csv("Resources/IMDB_movies_and_ratings.csv")

# The CSV was saved together with its old index, which pandas reads back as
# a column named "Unnamed: 0"; discard that column.
IMDB_movies_df = IMDB_movies_df.drop(columns=["Unnamed: 0"])

# Preview the first few rows
IMDB_movies_df.head()

FileNotFoundError: [Errno 2] File b'Resources/IMDB_movies_and_ratings.csv' does not exist: b'Resources/IMDB_movies_and_ratings.csv'

A separate CSV file was created from the preliminary output of the merged files and the initial cleaning process.
The duplicate index that the CSV file stores under the column "Unnamed: 0" needed to be removed.

In [None]:
# Assemble the report (header, one line per column name, separator, and the
# record-count caption) and emit it with a single print call.
report_lines = [
    "Column titles:  ",
    " ",
    *(str(column_name) for column_name in IMDB_movies_df.columns),
    " ",
    "-----------------------",
    " ",
    "Number of movies in this dataset:  ",
]
print("\n".join(report_lines))

# Record count, shown as the cell's output value
len(IMDB_movies_df)

In [None]:
This step prints the column headings and the record count of the working file to gauge the scope and size of the dataset.

In [None]:
# Restrict the dataset to USA-produced movies released from 1997 through 2017.
# Each boolean mask is built from the same frame it is applied to, so the
# filters do not depend on implicit index alignment between different frames.

# Keep movies made in 1997 or later
movies_made_after_1997 = IMDB_movies_df.loc[IMDB_movies_df["year"] >= 1997]

# Drop movies made after 2017 (later years are excluded from the study window)
movies_made_before_2018 = movies_made_after_1997.loc[
    movies_made_after_1997["year"] <= 2017
]

# Keep only movies made in the USA and renumber the rows from 0. The result of
# reset_index must be assigned: it returns a new frame rather than modifying
# the original, and drop=True avoids adding the old index as a data column.
USA_movies = movies_made_before_2018.loc[
    movies_made_before_2018["country"] == "USA"
].reset_index(drop=True)

USA_movies

In [None]:
Data preparation and cleaning: the data is filtered to include only the records relevant to the study.
Only movies made in the USA between 1997 and 2017 (inclusive) are kept.

In [None]:
# Aggregate the USA movies by release year, summing every numeric column.
# numeric_only=True keeps text columns (e.g. "country") out of the sum;
# without it, recent pandas versions try to add the strings together instead
# of silently dropping those columns.
USA_movies_by_year = USA_movies.groupby("year").sum(numeric_only=True)

# Save the yearly totals to a new CSV file for further analysis
USA_movies_by_year.to_csv("Resources/USA_movies_by_year_from_1997.csv")

# Show the data
USA_movies_by_year

In [None]:
The final preparation step in the Gender-Age Group analysis is to group the data by year and total the counts, so that the visualizations can show trends over time.

In [None]:
# Line chart: total male vs. female counts per year, 1997 through 2017.

# Make "year" an ordinary column so it can be used as the x-axis. The guard
# keeps this cell idempotent: re-running it will not reset the index a second
# time and add a stray "index" column.
if "year" not in USA_movies_by_year.columns:
    USA_movies_by_year.reset_index(inplace=True)

# Apply the poster style BEFORE the figure is created so it affects this
# chart too. The style was renamed to "seaborn-v0_8-poster" in matplotlib 3.6
# and the old name was later removed, so use whichever name is available.
poster_style = (
    "seaborn-v0_8-poster"
    if "seaborn-v0_8-poster" in plt.style.available
    else "seaborn-poster"
)
plt.style.use(poster_style)

fig, ax = plt.subplots()

# Title, axis labels and one tick per year
# NOTE(review): the plotted values are the raw yearly column sums; nothing in
# this cell scales them, so the "(in millions)" unit should be confirmed.
ax.set_title("Count of Movie Goers By Gender from 1997 through 2017", fontsize=20)
ax.set_xlabel("Years", fontsize=10)
ax.set_ylabel("Number of Movie Goers (in millions)", fontsize=10)
ax.set_xticks(list(range(1997, 2018)))
ax.tick_params(axis="x", labelrotation=90)
ax.grid(axis="y")

# Plot one line per gender
male_viewers, = ax.plot(USA_movies_by_year['year'], USA_movies_by_year['males_allages_votes'],
                        color='b', marker="^", markersize=7, label="Total Male Movie Goers")

female_viewers, = ax.plot(USA_movies_by_year['year'], USA_movies_by_year['females_allages_votes'],
                          color='r', marker="o", markersize=7, label="Total Female Movie Goers")

# Trailing semicolon suppresses the Legend object's repr in the cell output
ax.legend(handles=[male_viewers, female_viewers], fontsize=14, fancybox=True, shadow=True);

In [None]:
The graph shows movie attendance by gender, using the number of IMDB votes cast by each group as a proxy for attendance.
The disparity between the sexes suggests that male movie goers are more inclined to choose the movies as an entertainment option.

In [None]:
# Create Graph of Male Movie Goers by Age Group
# Expects USA_movies_by_year to expose "year" as a regular column.

# Apply the poster style BEFORE the figure is created so it affects this
# chart. The style was renamed to "seaborn-v0_8-poster" in matplotlib 3.6
# and the old name was later removed, so use whichever name is available.
poster_style = (
    "seaborn-v0_8-poster"
    if "seaborn-v0_8-poster" in plt.style.available
    else "seaborn-poster"
)
plt.style.use(poster_style)

fig, ax = plt.subplots()

# Title, axis labels and one tick per year
ax.set_title("Count of Male Movie Goers By Age Group from 1997 through 2017", fontsize=20)
ax.set_xlabel("Years", fontsize=14)
ax.set_ylabel("Number of Male Movie Goers", fontsize=14)
ax.set_xticks(list(range(1997, 2018)))
ax.tick_params(axis="x", labelrotation=90)
ax.grid(axis="y")

# Plot one line per male age group
years = USA_movies_by_year['year']

male_viewers_under18, = ax.plot(years, USA_movies_by_year['males_0age_votes'],
                                color='r', marker="^", markersize=7,
                                label="Male Movie Goers under 18 yrs old")

male_viewers_18_30, = ax.plot(years, USA_movies_by_year['males_18age_votes'],
                              color='g', marker="o", markersize=7,
                              label="Male Movie Goers with ages between 18 & 29 yrs old")

male_viewers_30_45, = ax.plot(years, USA_movies_by_year['males_30age_votes'],
                              color='b', marker="s", markersize=7,
                              label="Male Movie Goers with ages between 30 & 44 yrs old")

male_viewers_over45, = ax.plot(years, USA_movies_by_year['males_45age_votes'],
                               color='k', marker="d", markersize=7,
                               label="Male Movie Goers with Ages 45 yrs old & Over")

# Trailing semicolon suppresses the Legend object's repr in the cell output
ax.legend(handles=[male_viewers_under18, male_viewers_18_30, male_viewers_30_45, male_viewers_over45],
          fontsize=11, fancybox=True, shadow=True);

In [None]:
The graph shows male movie attendance by age group.
Male movie goers between 30 and 44 years of age
    have consistently outpaced all other age groups in terms of attendance.

In [None]:
# Create Graph of Female Movie Goers by Age Group
# Expects USA_movies_by_year to expose "year" as a regular column.

# Apply the poster style BEFORE the figure is created so it affects this
# chart. The style was renamed to "seaborn-v0_8-poster" in matplotlib 3.6
# and the old name was later removed, so use whichever name is available.
poster_style = (
    "seaborn-v0_8-poster"
    if "seaborn-v0_8-poster" in plt.style.available
    else "seaborn-poster"
)
plt.style.use(poster_style)

fig, ax = plt.subplots()

# Title, axis labels and one tick per year
ax.set_title("Count of Female Movie Goers By Age Group from 1997 through 2017", fontsize=20)
ax.set_xlabel("Years", fontsize=14)
ax.set_ylabel("Number of Female Movie Goers", fontsize=14)
ax.set_xticks(list(range(1997, 2018)))
ax.tick_params(axis="x", labelrotation=90)
ax.grid(axis="y")

# Plot one line per female age group
years = USA_movies_by_year['year']

female_viewers_under18, = ax.plot(years, USA_movies_by_year['females_0age_votes'],
                                  color='r', marker="^", markersize=7,
                                  label="Female Movie Goers under 18 yrs old")

female_viewers_18_30, = ax.plot(years, USA_movies_by_year['females_18age_votes'],
                                color='g', marker="o", markersize=7,
                                label="Female Movie Goers with ages between 18 & 29 yrs old")

female_viewers_30_45, = ax.plot(years, USA_movies_by_year['females_30age_votes'],
                                color='b', marker="s", markersize=7,
                                label="Female Movie Goers with ages between 30 & 44 yrs old")

female_viewers_over45, = ax.plot(years, USA_movies_by_year['females_45age_votes'],
                                 color='k', marker="d", markersize=7,
                                 label="Female Movie Goers with Ages 45 yrs old & Over")

# Trailing semicolon suppresses the Legend object's repr in the cell output
ax.legend(handles=[female_viewers_under18, female_viewers_18_30, female_viewers_30_45, female_viewers_over45],
          fontsize=12, fancybox=True, shadow=True);

In [None]:
The graph shows female movie attendance by age group.
Female movie goers between 30 and 44 years of age show high attendance in the first half of the study timeframe.
The younger group of female movie goers, between 18 and 29 years of age, shows growing interest from 2009 onward.
A review of the movies and movie genres produced within this timeframe might hold clues to the resurgent interest of this age group.