# MOVIE POPULARITY PREDICTION USING SEMMA METHODOLOGY

## 1. SAMPLE

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv('Animation_Movies.csv')
data.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages
0,150540,Inside Out,7.922,19463,Released,2015-06-09,857611174,95,False,/j29ekbcLpBvxnGk6LjdTc2EI5SA.jpg,...,en,Inside Out,"Growing up can be a bumpy road, and it's no ex...",107.292,/2H1TmgdfNtsKlU9jKdeNyYL5y8T.jpg,Meet the little voices inside your head.,"Animation, Family, Adventure, Drama, Comedy","Pixar, Walt Disney Pictures",United States of America,English
1,14160,Up,7.949,18857,Released,2009-05-28,735099082,96,False,/hGGC9gKo7CFE3fW07RA587e5kol.jpg,...,en,Up,Carl Fredricksen spent his entire life dreamin...,90.968,/vpbaStTMt8qqXaEgnOR2EE4DNJk.jpg,The greatest adventure is just getting back home.,"Animation, Comedy, Family, Adventure",Pixar,United States of America,English
2,12,Finding Nemo,7.824,18061,Released,2003-05-30,940335536,100,False,/h3uqFk7sZRJvLZDdLiFB9qwbL07.jpg,...,en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",55.456,/ggQ6o8X5984OCh3kZi2UIJQJY5y.jpg,There are 3.7 trillion fish in the ocean. They...,"Animation, Family",Pixar,United States of America,English
3,354912,Coco,8.222,17742,Released,2017-10-27,800526015,105,False,/askg3SMvhqEl4OL52YuvdtY40Yb.jpg,...,en,Coco,Despite his family’s baffling generations-old ...,166.578,/gGEsBPAijhVUFoiNpgZXqRVWJt2.jpg,The celebration of a lifetime,"Family, Animation, Fantasy, Music, Comedy, Adv...","Pixar, Walt Disney Pictures",United States of America,"English, Spanish"
4,10681,WALL·E,8.078,17446,Released,2008-06-22,521311860,98,False,/fK5ssgvtI43z19FoWigdlqgpLRE.jpg,...,en,WALL·E,What if mankind had to leave Earth and somebod...,58.517,/hbhFnRzzg6ZDmm8YAmxBnQpQIPh.jpg,After 700 years of doing what he was built for...,"Animation, Family, Science Fiction","Pixar, Walt Disney Pictures",United States of America,English


In [2]:
# Dataset details
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51945 entries, 0 to 51944
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    51945 non-null  int64  
 1   title                 51944 non-null  object 
 2   vote_average          51945 non-null  float64
 3   vote_count            51945 non-null  int64  
 4   status                51945 non-null  object 
 5   release_date          49808 non-null  object 
 6   revenue               51945 non-null  int64  
 7   runtime               51945 non-null  int64  
 8   adult                 51945 non-null  bool   
 9   backdrop_path         15835 non-null  object 
 10  budget                51945 non-null  int64  
 11  homepage              8253 non-null   object 
 12  imdb_id               29552 non-null  object 
 13  original_language     51945 non-null  object 
 14  original_title        51944 non-null  object 
 15  overview           

In [3]:
# Number of rows and columns
rows, columns = data.shape
print(f"The dataset has {rows} rows and {columns} columns.")

The dataset has 51945 rows and 23 columns.


In [4]:
print(data['popularity'].value_counts())

popularity
0.600     32393
0.000       963
1.400       566
0.840       420
0.841       180
          ...  
5.953         1
4.329         1
7.345         1
6.436         1
10.739        1
Name: count, Length: 5840, dtype: int64


In [5]:
# Create bins for popularity scores
bins = [0, 2, 5, 8, 11]  # Define the bin edges
labels = ['0-2', '3-5', '6-8', '9-11']  # Define the bin labels
data['popularity_group'] = pd.cut(data['popularity'], bins=bins, labels=labels, right=False)

In [6]:
# Inspect the grouped data
print(data['popularity_group'].value_counts())

popularity_group
0-2     45931
3-5      3102
6-8       885
9-11      515
Name: count, dtype: int64


In [7]:
# Check for missing values in the binned groups
print(data['popularity_group'].isnull().sum())

1512


In [8]:
# Remove rows with missing 'popularity_group' values
data = data.dropna(subset=['popularity_group'])

In [9]:
# Inspect the grouped data
print(data['popularity_group'].value_counts())

popularity_group
0-2     45931
3-5      3102
6-8       885
9-11      515
Name: count, dtype: int64


In [10]:
sample_size = 0.2  # 20% of the data

In [11]:
from sklearn.model_selection import train_test_split
# Perform stratified sampling
stratified_sample, _ = train_test_split(
    data,
    test_size=1 - sample_size,
    stratify=data['popularity_group'],  # Ensure stratified sampling based on the new groups
    random_state=42
)

In [12]:
# Save the sampled data to a new file
stratified_sample.to_csv('stratified_sample.csv', index=False)

## 2. EXPLORE

In [13]:
# Check the distribution in the stratified sample
print(stratified_sample['popularity_group'].value_counts())

popularity_group
0-2     9186
3-5      620
6-8      177
9-11     103
Name: count, dtype: int64


In [14]:
# Number of rows and columns in the sampled data
rows, columns = stratified_sample.shape
print(f"The sampled dataset has {rows} rows and {columns} columns.")

The sampled dataset has 10086 rows and 24 columns.


In [16]:
from ydata_profiling import ProfileReport
profile = ProfileReport(stratified_sample, title="Dataset Profile Report", explorative=True)
profile.to_notebook_iframe()


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [21]:
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()
auto_viz = AV.AutoViz(stratified_sample)

Shape of your Data Set loaded: (10086, 24)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  2
    Number of Integer-Categorical Columns =  4
    Number of String-Categorical Columns =  2
    Number of Factor-Categorical Columns =  1
    Number of String-Boolean Columns =  1
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  3
    Number of NLP String Columns =  9
    Number of Date Time Columns =  0
    Number of ID Columns =  1
    Number of Columns to Delete =  1
    24 Predictors classified...
        2 variable(s) removed since they were ID or low-information variables
        List of variables removed: ['id', 'tagline']
To fix these data quality issues in the dataset, 

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
id,int64,0.0,100.0,1260.0,1238308.0,Possible ID column: drop before modeling step.
title,object,0.009915,98.0,,,"1 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,"
vote_average,float64,0.0,,0.0,10.0,No issue
vote_count,int64,0.0,1.0,0.0,987.0,Column has 1412 outliers greater than upper bound (5.00) or lower than lower bound(-3.00). Cap them or remove them.
status,object,0.0,0.0,,,"5 rare categories: ['In Production', 'Planned', 'Post Production', 'Canceled', 'Rumored']. Group them into a single category or drop the categories."
release_date,object,4.144359,58.0,,,"418 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,"
revenue,int64,0.0,0.0,0.0,115570314.0,Column has 104 outliers greater than upper bound (0.00) or lower than lower bound(0.00). Cap them or remove them.
runtime,int64,0.0,1.0,0.0,3720.0,Column has 1640 outliers greater than upper bound (33.00) or lower than lower bound(-15.00). Cap them or remove them.
adult,bool,0.0,0.0,0.0,1.0,No issue
backdrop_path,object,71.57446,28.0,,,"7219 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: float, object,"


Number of All Scatter Plots = 3


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roam

All Plots done
Time to run AutoViz = 123 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


In [22]:
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()

# Ensure stratified_sample is passed as a DataFrame
auto_viz = AV.AutoViz(
    "",
    dfte=stratified_sample,  # Pass the DataFrame
    depVar="",  # Specify the dependent variable, if any
    verbose=2
)

Shape of your Data Set loaded: (10086, 24)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
  Printing up to 30 columns (max) in each category:
    Numeric Columns : ['vote_average', 'popularity']
    Integer-Categorical Columns: ['vote_count', 'revenue', 'runtime', 'budget']
    String-Categorical Columns: ['status', 'original_language']
    Factor-Categorical Columns: ['popularity_group']
    String-Boolean Columns: ['adult']
    Numeric-Boolean Columns: []
    Discrete String Columns: ['genres', 'production_countries', 'spoken_languages']
    NLP text Columns: ['title', 'original_title', 'release_date', 'backdrop_path', 'homepage', 'imdb_id', 'overview', 'poster_path', 'production_companies']
    Date Time Columns: []
    ID Columns: [

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
id,int64,0.0,100.0,1260.0,1238308.0,Possible ID column: drop before modeling step.
title,object,0.009915,98.0,,,"1 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,"
vote_average,float64,0.0,,0.0,10.0,No issue
vote_count,int64,0.0,1.0,0.0,987.0,Column has 1412 outliers greater than upper bound (5.00) or lower than lower bound(-3.00). Cap them or remove them.
status,object,0.0,0.0,,,"5 rare categories: ['In Production', 'Planned', 'Post Production', 'Canceled', 'Rumored']. Group them into a single category or drop the categories."
release_date,object,4.144359,58.0,,,"418 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,"
revenue,int64,0.0,0.0,0.0,115570314.0,Column has 104 outliers greater than upper bound (0.00) or lower than lower bound(0.00). Cap them or remove them.
runtime,int64,0.0,1.0,0.0,3720.0,Column has 1640 outliers greater than upper bound (33.00) or lower than lower bound(-15.00). Cap them or remove them.
adult,bool,0.0,0.0,0.0,1.0,No issue
backdrop_path,object,71.57446,28.0,,,"7219 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: float, object,"


Number of All Scatter Plots = 3


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\Aqilah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_dat

All Plots are saved in .\AutoViz_Plots\AutoViz
Time to run AutoViz = 108 seconds 


In [23]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure these columns exist in your dataset
categorical_columns = ['status', 'genres']

# Set figure size for better readability
plt.figure(figsize=(15, 5))

# Loop through categorical columns to create bar charts
for i, col in enumerate(categorical_columns, 1):
    plt.subplot(1, len(categorical_columns), i)  # Create subplots
    value_counts = stratified_sample[col].value_counts()  # Count frequencies
    sns.barplot(x=value_counts.index, y=value_counts.values, palette='viridis')  # Create bar chart
    plt.title(f'Frequency of {col}')
    plt.xlabel(col.capitalize())
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# Adjust layout and display the plots
plt.tight_layout()
plt.show()


In [24]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure the columns exist in your dataset
numerical_columns = ['budget', 'revenue']

# Scatter plot using Matplotlib
plt.figure(figsize=(8, 6))
plt.scatter(
    stratified_sample['budget'],
    stratified_sample['revenue'],
    alpha=0.6,  # Transparency for better visualization
    c='blue',   # Color of the points
    edgecolors='k'
)
plt.title('Scatter Plot: Budget vs. Revenue')
plt.xlabel('Budget')
plt.ylabel('Revenue')
plt.grid(True)
plt.show()

# Scatter plot using Seaborn (optional for better aesthetics)
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='budget',
    y='revenue',
    data=stratified_sample,
    alpha=0.6,
    color='blue'
)
plt.title('Scatter Plot: Budget vs. Revenue')
plt.xlabel('Budget')
plt.ylabel('Revenue')
plt.grid(True)
plt.show()


In [25]:
import matplotlib.pyplot as plt
import seaborn as sns

# List of numerical features to visualize
numerical_features = ['budget', 'revenue', 'popularity']

# Set the figure size
plt.figure(figsize=(15, 5))

# Loop through the numerical features to create box plots
for i, feature in enumerate(numerical_features, 1):
    plt.subplot(1, len(numerical_features), i)  # Create subplots
    sns.boxplot(data=stratified_sample[feature], color='skyblue', width=0.5)
    plt.title(f'Box Plot of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Value')

# Adjust layout for better spacing
plt.tight_layout()
plt.show()


In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure the 'genres' column exists in your dataset
if 'genres' in stratified_sample.columns:
    # Step 1: Split the combined genres into individual genres
    # Drop missing values in 'genres' first
    stratified_sample['genres'] = stratified_sample['genres'].fillna("")
    split_genres = stratified_sample['genres'].str.split(',').explode()

    # Step 2: Count the frequency of each genre
    genre_counts = split_genres.value_counts()

    # Step 3: Plot the bar chart
    plt.figure(figsize=(10, 6))
    sns.barplot(x=genre_counts.index, y=genre_counts.values, palette="viridis")
    plt.title('Frequency of Individual Genres')
    plt.xlabel('Genres')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')  # Rotate for better readability
    plt.tight_layout()
    plt.show()
else:
    print("The 'genres' column is not found in the dataset.")


In [27]:
# Ensure the 'production_companies' column exists in your dataset
if 'production_companies' in stratified_sample.columns:
    # Step 1: Split the combined production companies into individual companies
    # Drop missing values in 'production_companies' first
    stratified_sample['production_companies'] = stratified_sample['production_companies'].fillna("")
    split_companies = stratified_sample['production_companies'].str.split(',').explode()

    # Step 2: Count the frequency of each production company
    company_counts = split_companies.value_counts().head(20)  # Display top 20 companies for readability

    # Step 3: Plot the bar chart
    plt.figure(figsize=(12, 6))
    sns.barplot(x=company_counts.values, y=company_counts.index, palette="viridis", orient='h')
    plt.title('Frequency of Production Companies (Top 20)')
    plt.xlabel('Frequency')
    plt.ylabel('Production Companies')
    plt.tight_layout()
    plt.show()
else:
    print("The 'production_companies' column is not found in the dataset.")
