# Movie Genre Classification Using NLP

## by Andrew Alarcon

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import statistics as stats
import imdb

### Reading in the Data

In [43]:
data = pd.read_csv('data/movies.csv')

In [44]:
data.head()

Unnamed: 0,1,Oscar et la dame rose (2009),drama,"Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue."
0,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
1,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
2,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
3,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
4,6,Quality Control (2011),documentary,Quality Control consists of a series of 16mm ...


### Data Cleaning

Looking at the first 5 rows, we see that the first row is being treated as the column names. Let's fix this by setting it as the first row and then correctly naming the columns.

In [45]:
#using vstack to stack the column names of data and set it to the
#first row of dataframe
data = pd.DataFrame(np.vstack([data.columns, data]))

In [46]:
data.head()

Unnamed: 0,0,1,2,3
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


Let's appropriately name the columns.

In [47]:
data.rename(columns = {1:'Title', 2:'Genre', 3: 'Description'}, inplace = True)


In [48]:
data.head()

Unnamed: 0,0,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


Now, we will create a Year column based of the years in the parenthesis in the Title column. Let's drop the index column first.

In [49]:
data = data.drop(columns=[0], axis=1)
data.head()

Unnamed: 0,Title,Genre,Description
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [50]:
#Using regex to correctly gather the year information in Title
#source: https://stackoverflow.com/questions/13807207/regex-find-a-number-between-parentheses

#myre = re.compile(".*\(([0-9]+)\).*")
#myre = re.compile("/^(19|20)[\d]{2,2}$/")
myre = re.compile("\([^\d]*(\d+)[^\d]*\)")
data = data.assign(Year=data['Title'].str.extract(myre))


In [51]:
data.head()

Unnamed: 0,Title,Genre,Description,Year
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,2009
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,1997
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,1980
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,1915
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,2007


In [135]:
data.shape

(54214, 4)

### Checking the amount of NAN values in each columns

In [52]:
data.isna().sum() / len(data)

Title          0.000000
Genre          0.000000
Description    0.000000
Year           0.049674
dtype: float64

In [53]:
data['Year'].isna().sum()

2693

2693 missing values in the Year column. This is about 5% of the dataset. Let's try to impute these values by grouping them into the year that had the most movies of that genre. If multiple years contain the same genre, we will set the NAN value to the mean year of that genre.

In [54]:
years_genres = data.groupby(['Year'])[['Genre']].agg(pd.Series.mode).reset_index()

In [55]:
years_genres.dtypes

Year     object
Genre    object
dtype: object

In [56]:
#Change Year col to integer, Genre column to string
years_genres['Year'] = years_genres['Year'].astype(int)

In [57]:
years_genres['Genre'] = years_genres['Genre'].astype('string')

In [58]:
years_genres.dtypes

Year      int64
Genre    string
dtype: object

In [59]:
years_genres['Genre'].unique()

<StringArray>
[     "[' documentary ' ' short ']",                    ' documentary ',
                          ' short ',                          ' drama ',
                         ' comedy ',                        ' western ',
          "[' drama ' ' western ']",           "[' comedy ' ' drama ']",
                        ' romance ',    "[' documentary ' ' fantasy ']",
 "[' documentary ' ' reality-tv ']",           "[' comedy ' ' short ']",
     "[' documentary ' ' horror ']"]
Length: 13, dtype: string

In [60]:
list(years_genres[years_genres['Genre'] == ' comedy ']['Year'])

[1927, 1941, 1943, 1955, 1959]

In [25]:
pd.set_option('display.max_rows', None)

In [62]:
data_null = data[data['Year'].isnull()].reset_index(drop=True)
data_null.head()

Unnamed: 0,Title,Genre,Description,Year
0,The Sandman (????/I),fantasy,A wizard attempting to capture Death to barga...,
1,Stealing Stradivarius (????),comedy,When Artie needs money for his daughters cons...,
2,The Wish Kin (????),adventure,Fourteen-year-old Colm Bell and his 11-year-o...,
3,Killing Grace (????),thriller,A prominent couple are returning home when th...,
4,Crooked Tree (????),thriller,The customarily docile bears in the north woo...,


In [63]:
list(years_genres[years_genres['Genre'] == ' comedy ']['Year'])

[1927, 1941, 1943, 1955, 1959]

In [64]:
data_null2 = data_null.copy()

In [65]:
data_null2['Year'][0] = 1

In [66]:
data_null2['Year'][0]

1

#### Without the drama genre
Experimenting here. Drama is the leading genre across many years, and can throw off the data quite a bit, so let's ignore years that have drama is their leading genre.
We will also check this performance with drama and compare dataframes.

In [67]:
#looping through this data frame, checking the genre for each value

for i in range(len(data_null2)):
    genre = data_null2['Genre'][i] #get the genre for each nan value
    #print(genre)
    #look at the years where this genre appeared the most
    if genre != ' drama ':
        years_w_genres = list(years_genres[years_genres['Genre'] == genre]['Year'])
        #print(years_w_genres, genre)
        #if the years_w_genres list is not empty
        if len(years_w_genres) != 0:
            ave_year = stats.mean(years_w_genres)  #then take the average of it
            #print('average year is:', ave_year)
            #Now assign ave_year to the current Year row where the value is nan.
            data_null2['Year'][i] = ave_year
            #print(data_null2['Year'][i])
        else:
            data_null2['Year'][i] = None

data_null2.isna().sum() / len(data_null2)

    
    
    

Title          0.000000
Genre          0.000000
Description    0.000000
Year           0.742295
dtype: float64

In [68]:
data_null.columns

Index(['Title', 'Genre', 'Description', 'Year'], dtype='object')

#### With Drama

In [69]:
data_null3 = data_null.copy()

In [70]:
#looping through this data frame, checking the genre for each value

for i in range(len(data_null3)):
    genre = data_null3['Genre'][i] #get the genre for each nan value
    #print(genre)
    #look at the years where this genre appeared the most
    years_w_genres = list(years_genres[years_genres['Genre'] == genre]['Year'])
    #print(years_w_genres, genre)
    #if the years_w_genres list is not empty
    if len(years_w_genres) != 0:
        ave_year = stats.mean(years_w_genres)  #then take the average of it
        #print('average year is:', ave_year)
        #Now assign ave_year to the current Year row where the value is nan.
        data_null3['Year'][i] = ave_year
        #print(data_null2['Year'][i])
    else:
        data_null3['Year'][i] = None

data_null3.isna().sum() / len(data_null3)

Title          0.000000
Genre          0.000000
Description    0.000000
Year           0.447456
dtype: float64

Not much of a difference! both values of the Year column have less than 1% of data missing.

In [71]:
print(data.isna().sum())
print(data.isna().sum() / len(data))

Title             0
Genre             0
Description       0
Year           2693
dtype: int64
Title          0.000000
Genre          0.000000
Description    0.000000
Year           0.049674
dtype: float64


In [72]:
print(data_null2.isna().sum())
print(data_null2.isna().sum() / len(data_null2))

Title             0
Genre             0
Description       0
Year           1999
dtype: int64
Title          0.000000
Genre          0.000000
Description    0.000000
Year           0.742295
dtype: float64


In [73]:
print(data_null3.isna().sum())
print(data_null3.isna().sum() / len(data_null3))

Title             0
Genre             0
Description       0
Year           1205
dtype: int64
Title          0.000000
Genre          0.000000
Description    0.000000
Year           0.447456
dtype: float64


In [74]:
#data_null3

# Ditching Old Imputation Method for Year, Now Using IMDB API

Trying to average out the years was going to throw off our data. Its better to find these movies on IMDB's API and find their years. There are some movies in this database that haven't been released yet, and will contain the phrase (in development) in their title. In this case, we set this row to "In Development" for its Year value.

In [86]:
data_null.isna().sum()

Title             0
Genre             0
Description       0
Year           2693
dtype: int64

In [77]:
data_null4 = data_null.copy()

In [50]:
moviesDB = imdb.IMDb()

#### Here we are finding the movies that are still in development

In [78]:
# Searching for a movie title
#moviesDB = imdb.IMDb()

for i in range(len(data_null4)):
#for i in range(0, 3):
    movie_title = data_null4['Title'][i]
    imdb_movies = moviesDB.search_movie(movie_title)

    print(f'Searching for {movie_title}:\n')

    development = '(in development)'

    #searching through imbd's database for current movie
    for movie in imdb_movies:
        title = movie['title']
        #if year contains '(in development)' go to next movie
        if development in title:
            print(f'{title} is still in development')
            data_null4['Year'][i] = 'In Development'
            break
        #year = movie['year']
        #print(f'{title}')
        #print(type(title))

Searching for  The Sandman (????/I) :

The Sandman  (in development) is still in development
Searching for  Stealing Stradivarius (????) :

Stealing Stradivarius (in development) is still in development
Searching for  The Wish Kin (????) :

The Wish Kin (in development) is still in development
Searching for  Killing Grace (????) :

Killing Grace (in development) is still in development
Searching for  Crooked Tree (????) :

Crooked Teeth (in development) (TV Mini Series) is still in development
Searching for  Two Women (????) :

Searching for  Operation Bannana Split (????) :

Operation Bannana Split (in development) is still in development
Searching for  The Cellar Door 2: Preymates (????) :

The Cellar Door 2: Preymates (in development) is still in development
Searching for  The Gang (????/II) :

The Gang  (in development) is still in development
Searching for  Ember: The Sapphire Empire (????) :

Ember: The Sapphire Empire (in development) is still in development
Searching for  The M

In [82]:
data_null4.Year.isna().sum()

1108

In [52]:
data_null4.Year.isna().sum()/len(data_null4)

0.41143705904196065

The number of null values were decreased by half! That leaves around 4% of the dataset empty.
We now have two options:

1) Since there are many movies that overlap with the same name and have been released on different years, we could just ignore these values for the data visualization and analysis part of this notebook. The year won't impact our machine learning model later on, so it is safe to keep these  rows because they still contain valuable information in the Description and Genre columns.

2) We could further play around with the IMDB API to try to find the years for these movies. However there is an issue. Some movies in the IMDB database simply do not even have an argument for year, and an error is thrown when trying to search for a year value.

In [134]:
data_null4.head()

Unnamed: 0.1,Unnamed: 0,Title,Genre,Description,Year
0,0,The Sandman (????/I),fantasy,A wizard attempting to capture Death to barga...,In Development
1,1,Stealing Stradivarius (????),comedy,When Artie needs money for his daughters cons...,In Development
2,2,The Wish Kin (????),adventure,Fourteen-year-old Colm Bell and his 11-year-o...,In Development
3,3,Killing Grace (????),thriller,A prominent couple are returning home when th...,In Development
4,4,Crooked Tree (????),thriller,The customarily docile bears in the north woo...,In Development


In [84]:
data_null4[data_null4['Year'].isnull()]

Unnamed: 0,Title,Genre,Description,Year
5,Two Women (????),documentary,"""We are two women, who have each grown in our...",
13,12: The Tamir Rice Story (????/I),drama,"On November 22, 2014, at 3:30 pm in Cleveland...",
15,Waiting for a Storm (????),documentary,The chars of the Brahmaputra are a curious ge...,
17,A Life to Love (????),drama,When Janie is forced to leave her home at the...,
20,"""Apocalyptic Survival"" (????)",reality-tv,Introduces our cast to a radically changed wo...,
21,The Living (????/II),horror,A cataclysmic event of Biblical proportions t...,
24,"Wheatus, You Might Die (????)",documentary,"In 2010, filmmaker Antony D. Lane joined Whea...",
26,Rice Cultivation in Bali & the System of Rice...,short,A new method of planting rice in Bali is prot...,
30,The Mako Project (????),documentary,3 artist hit the road surviving off their art...,
31,The Sacrifice of the Rose (????),drama,A New York couple in a strained marriage of n...,


In [87]:
data_null4.to_csv('PartialClean.csv') 

In [2]:
data_null4 = pd.read_csv('PartialClean.csv')

In [3]:
data_null4.isna().sum()

Unnamed: 0        0
Title             0
Genre             0
Description       0
Year           1108
dtype: int64

In [27]:
data_null5 = data_null4.copy()

In [28]:
data_null5 = data_null5[data_null5['Year'].isnull()].reset_index()

In [30]:
data_null5 = data_null5.drop(columns=['index', 'Unnamed: 0'])

In [31]:
data_null5

Unnamed: 0,Title,Genre,Description,Year
0,Two Women (????),documentary,"""We are two women, who have each grown in our...",
1,12: The Tamir Rice Story (????/I),drama,"On November 22, 2014, at 3:30 pm in Cleveland...",
2,Waiting for a Storm (????),documentary,The chars of the Brahmaputra are a curious ge...,
3,A Life to Love (????),drama,When Janie is forced to leave her home at the...,
4,"""Apocalyptic Survival"" (????)",reality-tv,Introduces our cast to a radically changed wo...,
5,The Living (????/II),horror,A cataclysmic event of Biblical proportions t...,
6,"Wheatus, You Might Die (????)",documentary,"In 2010, filmmaker Antony D. Lane joined Whea...",
7,Rice Cultivation in Bali & the System of Rice...,short,A new method of planting rice in Bali is prot...,
8,The Mako Project (????),documentary,3 artist hit the road surviving off their art...,
9,The Sacrifice of the Rose (????),drama,A New York couple in a strained marriage of n...,


In [33]:
data_null5['Year'][2]

nan

In [48]:
for i in range(0, 19):
    movie_title = data_null5['Title'][i]
    movies_db = moviesDB.search_movie(movie_title)

    print(f'Searching for {movie_title}:\n')

    development = '(in development)'

    for movie in movies_db:
        title_db = movie['title']
        print(title_db)
        if movie_title in title_db:
            year = movie['year']
            print(f'{title_db} - {year}')
        #print(year)
        print('The year is:', year)    
        data_null5['Year'][i] = year
    

Searching for  Two Women (????) :

Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Between Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women, Two Men
The year is: 2018
Two Women
The year is: 2018
Two Women, One Road
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Two Women
The year is: 2018
Searching for  12: The Tamir Rice Story (????/I) :

12: The Tamir Rice Story
The year is: 2018
Searching for  Waiting for a Storm (????) :

Waiting for a Storm
The year is: 2018
Waiting for a Stranger
The year is: 2018
Waiting for a Star
The year is: 2018
Waiting for a star
The year is: 2018
Waiting for a Sign
The year is: 2018
Searching for a Storm
The year is: 2018
Waiting 

In [49]:
#data_null5['Year'][0] = 18
data_null5

Unnamed: 0,Title,Genre,Description,Year
0,Two Women (????),documentary,"""We are two women, who have each grown in our...",2018
1,12: The Tamir Rice Story (????/I),drama,"On November 22, 2014, at 3:30 pm in Cleveland...",2018
2,Waiting for a Storm (????),documentary,The chars of the Brahmaputra are a curious ge...,2018
3,A Life to Love (????),drama,When Janie is forced to leave her home at the...,2018
4,"""Apocalyptic Survival"" (????)",reality-tv,Introduces our cast to a radically changed wo...,2018


In [35]:
data_null5

Unnamed: 0,Title,Genre,Description,Year
0,Two Women (????),documentary,"""We are two women, who have each grown in our...",
1,12: The Tamir Rice Story (????/I),drama,"On November 22, 2014, at 3:30 pm in Cleveland...",
2,Waiting for a Storm (????),documentary,The chars of the Brahmaputra are a curious ge...,
3,A Life to Love (????),drama,When Janie is forced to leave her home at the...,
4,"""Apocalyptic Survival"" (????)",reality-tv,Introduces our cast to a radically changed wo...,
5,The Living (????/II),horror,A cataclysmic event of Biblical proportions t...,
6,"Wheatus, You Might Die (????)",documentary,"In 2010, filmmaker Antony D. Lane joined Whea...",
7,Rice Cultivation in Bali & the System of Rice...,short,A new method of planting rice in Bali is prot...,
8,The Mako Project (????),documentary,3 artist hit the road surviving off their art...,
9,The Sacrifice of the Rose (????),drama,A New York couple in a strained marriage of n...,


In [16]:
data_null5[['Year']].iloc[2]

Year    NaN
Name: 15, dtype: object

Set the year now.

In [131]:
for i in range(0, 1):
    #movie_title = data_null5['Title'][i]
    movie_title = 'Saving Flora'
    imdb_movies = moviesDB.search_movie(movie_title)
    #imdb_movies = 'Saving Flora'
    print(f'Searching for {movie_title}:\n')

    development = '(in development)'

    #searching through imbd's database for current movie
    for movie in imdb_movies:
        title = movie['title']
        
        if movie_title in title: 
            data_null5['Year'][i] = movie['year'] #set the year it was released
            print(f'{movie_title} was released on {movie["year"]}')
            break

Searching for Saving Flora:

Saving Flora was released on 2018


In [132]:
data_null5

Unnamed: 0.1,Unnamed: 0,Title,Genre,Description,Year
5,5,Two Women (????),documentary,"""We are two women, who have each grown in our...",
13,13,12: The Tamir Rice Story (????/I),drama,"On November 22, 2014, at 3:30 pm in Cleveland...",
15,15,Waiting for a Storm (????),documentary,The chars of the Brahmaputra are a curious ge...,
17,17,A Life to Love (????),drama,When Janie is forced to leave her home at the...,
20,20,"""Apocalyptic Survival"" (????)",reality-tv,Introduces our cast to a radically changed wo...,
21,21,The Living (????/II),horror,A cataclysmic event of Biblical proportions t...,
24,24,"Wheatus, You Might Die (????)",documentary,"In 2010, filmmaker Antony D. Lane joined Whea...",
26,26,Rice Cultivation in Bali & the System of Rice...,short,A new method of planting rice in Bali is prot...,
30,30,The Mako Project (????),documentary,3 artist hit the road surviving off their art...,
31,31,The Sacrifice of the Rose (????),drama,A New York couple in a strained marriage of n...,
