In [1]:
# First we need to import some packages
import keras
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import json
from copy import deepcopy
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Now read in the data
data=pd.read_csv('./tmdb_5000_movies.csv')

# Change data and get only what we want
data=deepcopy(data[['title','genres','overview']])

# Print out the first record
print("Data before cleaning: ")
print(data['title'][0])
print(data['genres'][0])
print(data['overview'][0], "\n\n")

# define a function to clean our genres from [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
# to [Action, Aventure, Fantasy, Science Fiction]
def clean_genres(genre_data):
    # List to hold results
    genre_list = []
    for item in genre_data:
        # Since data is json we can use the json lib
        json_data = json.loads(item)
        temp_list = []

        for dict_item in json_data:
            # Grab only the genre names
            temp_list.append(dict_item["name"])
    
        genre_list.append(temp_list)

    return genre_list

def clean_overview(overview_list):
    # List of words to remove
    words_to_remove = ["a", "and", "the", "but", "nor", "else", "or", "its", "it's", "of", "to", "in", "on", "is", "be",
                        "he", "his", "him" "she", "her", "an", "as", "for", "by", "are", "if", "it",
                        "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
                        "u", "v", "w", "x", "y", "z"
                      ]

    # List to hold results
    clean_overview_list = []
    for item in overview_list:
        # force item to be treated as a string
        item =str(item)
        
        # Split into a list of words
        words_list = item.split()
        
        temp_list = []
        # Remove any punctuation
        for word in words_list:
            # force item to be treated as a string, but first check if it is not empty
            if word:
                word = str(word)
                word = word.replace(",", "")
                word = word.replace(".", "")
                word = word.replace("\"", "")
                word = word.replace("-", "")
                word = word.replace(")", "")
                word = word.replace("(", "")
                word = word.replace("?", "")
                word = word.replace("!", "")
                word = word.replace("~", "")
                word = word.replace("`", "")
                word = word.replace("'", "")
                word = word.replace("’", "")
                word = word.replace(";", "")
                word = word.replace(":", "")
                word = word.replace("…", "")
                word = word.replace("−", "")
                temp_list.append(word)
        
        words_list = temp_list

        # Create a new list of filtered words
        filtered_words_list = [word for word in words_list if word.lower() not in words_to_remove]
        
        clean_overview_list.append(filtered_words_list)

    return clean_overview_list

data["genres"] = clean_genres(data["genres"])
data["overview"] = clean_overview(data["overview"])

# Print out the first record again
print("Data after cleaning: ")
print(data['title'][0])
print(data['genres'][0])
print(data['overview'][0], "\n\n")

Data before cleaning: 
Avatar
[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. 


Data after cleaning: 
Avatar
['Action', 'Adventure', 'Fantasy', 'Science Fiction']
['22nd', 'century', 'paraplegic', 'Marine', 'dispatched', 'moon', 'Pandora', 'unique', 'mission', 'becomes', 'torn', 'between', 'following', 'orders', 'protecting', 'alien', 'civilization'] 




In [2]:
# Next we use the Tokenizer from Keras to make our life easier. It will tokenize the text and encode (vectorize) it.
# But first let's copy our data

data_copy = deepcopy(data)

# Make one for each
overview_tokenizer = Tokenizer()
genre_tokenizer = Tokenizer()

# Feed tokenizers the data
overview_tokenizer.fit_on_texts(data_copy["overview"])
genre_tokenizer.fit_on_texts(data_copy["genres"])

# Show what the tokenizers found.
print("Overview word count:", overview_tokenizer.word_counts, "\n\n")
print("Genre word count:", genre_tokenizer.word_counts, "\n\n")
print("Overview unique word count:", len(overview_tokenizer.word_counts))
print("Genre unique word count:", len(genre_tokenizer.word_counts), "\n\n")

# Show how genre/overview are index
print("Overview index values:", overview_tokenizer.word_index, "\n\n")
print("Genre index values:", genre_tokenizer.word_index, "\n\n")



Genre word count: OrderedDict([('action', 1154), ('adventure', 790), ('fantasy', 424), ('science fiction', 535), ('crime', 696), ('drama', 2297), ('thriller', 1274), ('animation', 234), ('family', 513), ('western', 82), ('comedy', 1722), ('romance', 894), ('horror', 519), ('mystery', 348), ('history', 197), ('war', 144), ('music', 185), ('documentary', 110), ('foreign', 34), ('tv movie', 8)]) 


Overview unique word count: 23668
Genre unique word count: 20 




Genre index values: {'drama': 1, 'comedy': 2, 'thriller': 3, 'action': 4, 'romance': 5, 'adventure': 6, 'crime': 7, 'science fiction': 8, 'horror': 9, 'family': 10, 'fantasy': 11, 'mystery': 12, 'animation': 13, 'history': 14, 'music': 15, 'war': 16, 'documentary': 17, 'western': 18, 'foreign': 19, 'tv movie': 20} 




In [3]:
# Now we encode the genres and overview to create a matrix
encoded_overview = overview_tokenizer.texts_to_matrix(data_copy["overview"], mode="count")
encoded_genre = genre_tokenizer.texts_to_matrix(data_copy["genres"], mode="count")

print("Encoded overview data set:", encoded_overview, "\n\n")
print("Encoded genre data set:", encoded_genre, "\n\n")
print("Encoded overview data width:", len(encoded_overview[0]))
print("Encoded overview data height:", len(encoded_overview))
print("Encoded genre data width:", len(encoded_genre[0]))
print("Encoded genre data height:", len(encoded_genre), "\n\n")


Encoded overview data set: [[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]] 


Encoded genre data set: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] 


Encoded overview data width: 23669
Encoded overview data height: 4803
Encoded genre data width: 21
Encoded genre data height: 4803 




In [4]:
# Our data is now ready :)... I think.
print("Sample Data:", data_copy["title"][0], data_copy["genres"][0], data_copy["overview"][0])
print("Sample Data Encoded:", data_copy["title"][0], encoded_genre[0], encoded_overview[0], "\n\n")

Sample Data: Avatar ['Action', 'Adventure', 'Fantasy', 'Science Fiction'] ['22nd', 'century', 'paraplegic', 'Marine', 'dispatched', 'moon', 'Pandora', 'unique', 'mission', 'becomes', 'torn', 'between', 'following', 'orders', 'protecting', 'alien', 'civilization']
Sample Data Encoded: Avatar [0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] 




In [5]:
# Now we assign stuff back to our data pandaframe and delete data_copy

# Here we append to and assign to our data pandaframe
data["genres"] = data_copy["genres"]
data["overview"] = data_copy["overview"]
data["title"] = data_copy["title"]
data["one_hot_genre"] = list(encoded_genre)
data["one_hot_overview"] = list(encoded_overview)
data = data.append({"genre_word_index": genre_tokenizer.word_index}, ignore_index=True)
data = data.append({"overview_word_index": overview_tokenizer.word_index}, ignore_index=True)
del data_copy

# Lastly Print our updated data pandaframe
print(data)

                                         title  \
0                                       Avatar   
1     Pirates of the Caribbean: At World's End   
2                                      Spectre   
3                        The Dark Knight Rises   
4                                  John Carter   
...                                        ...   
4800                 Signed, Sealed, Delivered   
4801                          Shanghai Calling   
4802                         My Date with Drew   
4803                                       NaN   
4804                                       NaN   

                                             genres  \
0     [Action, Adventure, Fantasy, Science Fiction]   
1                      [Adventure, Fantasy, Action]   
2                        [Action, Adventure, Crime]   
3                  [Action, Crime, Drama, Thriller]   
4              [Action, Adventure, Science Fiction]   
...                                             ...   
4800          

In [14]:
# Cool, now we can begin to build our nerual network, but first we need x_train, x_test, y_train, y_test
X_train, y_train, X_test, y_test = train_test_split(data["one_hot_overview"], data["one_hot_genre"], test_size=0.20)
print("X_train size:", X_train.shape[0], "y_train size:", y_train.shape[0], "X_test size:", X_test.shape[0], "y_test size:", y_test.shape[0])

# Make the NN
my_NN = keras.Sequential()
my_NN.add(keras.layers.Dense())

X_train size: 3844 y_train size: 961 X_test size: 3844 y_test size: 961
