In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


# **Movie Genre Classification With Machine Learning**

Using the dataset found [here](https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb) we will build a machine learning model which will predict the genre of a movie based off the description provided. 

# **Import Dependencies**

First things first, we must import the dependencies required to build the model. 

Here is a quick summary of what each dependency is used for: 
*      **random**: shuffle the features in the data
*      **numpy**: create np.array
*      **nltk**: tokenization & lemmatization of sentences & words
*      **tensorflow**: model creation
    

In [1]:
import random
import numpy as np

import nltk
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\verma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# **EDA**
    
Our target variable is "Genre", leaving "ID", "Title" and "Description" as our feature variables. 

ID is just the index of the movie in our dataset, meaning that it has no significance to the target variable
Title is the name of the movie, it is unlikely that it has enough information to determine the genre, so it is ruled out. 

By process of elimination the Description of the movie is determined to be the only significant feature in our dataset. 

# **Data Processing**

**Reading In Data**

First we open the datafile and read it line by line. Due to the nature of the text file each line is a single datapoint representing a movie. Splitting the line with " ::: " seperates the variables into an array with the following format: 

    [ID, Title, Genre, Description]

We then extract the genre and description into seperate arrays. 

**Note only the first thousand datapoints are processed and later used to develop the model, this is done to save time when building the model.*

In [4]:
lemmatizer = WordNetLemmatizer()
TOTAL_DATA_POINTS = 1000
genres = []
descriptions = []

with open('train_data.txt', encoding='utf-8') as f:
    # Your processing code here
    counter = 0
    for line in f:
        if counter <= TOTAL_DATA_POINTS:
            data = line.split(" ::: ")
            genres.append(data[2])
            descriptions.append(data[3])
            counter += 1
            

**Tokenization & Lemmatization**

Next the descriptions are tokenized and lemmatized and added to words[] which represents the vocabulary of our model. 

The same thing is done to the genres, and they are added to classes[]

The tokenized array of each description & its corresponding genre are added to documents[] as a tuple. Here is an example of the format of document: 

    [(['This', 'movie', 'is', 'funny'], 'comedy'), (['This', 'movie', 'is', 'scary'], 'horror')]

In [5]:
words = []
classes = []
documents = []
ignore_letters = ['?', '!', '.', ',']

for i in range(TOTAL_DATA_POINTS):
    word_list = nltk.word_tokenize(descriptions[i])
    words.extend(word_list)
    documents.append((word_list, genres[i]))
    if genres[i] not in classes:
        classes.append(genres[i])

# lemmatize words & add them to words[] if they're not in ignore_letters
words = [lemmatizer.lemmatize(word) for word in words if word not in ignore_letters]
words = sorted(set(words))          # remove any duplicates

print(documents[0])

(['Listening', 'in', 'to', 'a', 'conversation', 'between', 'his', 'doctor', 'and', 'parents', ',', '10-year-old', 'Oscar', 'learns', 'what', 'nobody', 'has', 'the', 'courage', 'to', 'tell', 'him', '.', 'He', 'only', 'has', 'a', 'few', 'weeks', 'to', 'live', '.', 'Furious', ',', 'he', 'refuses', 'to', 'speak', 'to', 'anyone', 'except', 'straight-talking', 'Rose', ',', 'the', 'lady', 'in', 'pink', 'he', 'meets', 'on', 'the', 'hospital', 'stairs', '.', 'As', 'Christmas', 'approaches', ',', 'Rose', 'uses', 'her', 'fantastical', 'experiences', 'as', 'a', 'professional', 'wrestler', ',', 'her', 'imagination', ',', 'wit', 'and', 'charm', 'to', 'allow', 'Oscar', 'to', 'live', 'life', 'and', 'love', 'to', 'the', 'full', ',', 'in', 'the', 'company', 'of', 'his', 'friends', 'Pop', 'Corn', ',', 'Einstein', ',', 'Bacon', 'and', 'childhood', 'sweetheart', 'Peggy', 'Blue', '.'], 'drama')


**Numerically Representing The Data**

We must then numerically represent our data in order to be able to use it to train our model. To do this we will convert documents[] into a [bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model). Bag of words works by representing a word as a 1 at the index it is found at in its parent (string) array. Building on the previous example: 

    documents = [(['This', 'movie', 'is', 'funny'], 'comedy'), (['This', 'movie', 'is', 'scary'], 'horror')]
    classes = ['comedy', 'horror']
    words = ['This', 'movie', 'is', 'funny', 'scary']
    
    training = [[[1, 1, 1, 1, 0], [1, 0]], [[1, 1, 1, 0, 1], [0, 1]]]   <- Bag of Words representation of documents

In [6]:
# create training data
training = []
# create empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for document in documents:
    # initialize bag of words
    bag = []
    # list of tokenized words for the pattern
    word_patterns = document[0]
    # lemmatize each word in the pattern
    word_patterns = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
    # create our bag of words array with 1 if word found in current pattern
    for word in words:
        bag.append(1) if word in word_patterns else bag.append(0)

    # output is 0 for each tag and 1 for current tag (for each pattern)
    output_row = list(output_empty)
    output_row[classes.index(document[1])] = 1
    training.append([bag, output_row])

**Shuffling and Seperating Data**

Lastly, the order of the data is shuffled, and it is then seperated into target and feature variables. 

In [7]:
# shuffle features and turn into np.array
random.shuffle(training)
training = np.array(training)

# create train lists. X - patterns, Y - intents
train_x = list(training[:, 0])
train_y = list(training[:, 1])

  training = np.array(training)


# Model Development

A sequential model is built with 3 layers. The first layer has as many neurons as words in our vocabulary, the second layer has half as many neurons, and the final output layer has as many neuorns as genres. 

The loss is calculated with 'categorical_crossentropy' since there are multiple target categories. 

The model is then saved to be used here.

In [8]:
from tensorflow.keras.optimizers.legacy import SGD
# create train lists. X - patterns, Y - intents
train_x = list(training[:, 0])
train_y = list(training[:, 1])

num_words = len(words)
num_class = len(classes)


# create model - 3 layers. First layer contains as many neurons as words, second layer contains half as many neurons as the 
# first layer and 3rd output layer contains as many neurons as classes
model = Sequential()
model.add(Dense(num_words, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense((num_words/2), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_class, activation='softmax'))

# Compile model.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# Fitting and saving the model.
hist = model.fit(np.array(train_x), np.array(train_y), epochs=11, batch_size=5, verbose=1)
model.save('GenreClassification.h5', hist)


Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


  saving_api.save_model(
