In [6]:
# First we need to import some packages
import keras
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import json
from copy import deepcopy
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

# Now read in the data
data=pd.read_csv('./tmdb_5000_movies.csv')

# Change data and get only what we want
data=data[['title','genres','overview']]

# Print out the first record
print("Data before cleaning: ")
print(data['title'][0])
print(data['genres'][0])
print(data['overview'][0], "\n\n")

# define a function to clean our genres from [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
# to [Action, Aventure, Fantasy, Science Fiction]
def clean_genres(genre_data):
    # List to hold results
    genre_list = []
    for item in genre_data:
        # Since data is json we can use the json lib
        json_data = json.loads(item)
        temp_list = []

        for dict_item in json_data:
            # Grab only the genre names
            temp_list.append(dict_item["name"])
    
        genre_list.append(temp_list)

    return genre_list

def clean_overview(overview_list):
    # List of words to remove
    words_to_remove = ["a", "and", "the", "but", "nor", "else", "or", "its", "it's", "of", "to", "in", "on", "is", "be",
                        "he", "his", "him" "she", "her", "an", "as", "for", "by", "are", "if", "it",
                        "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
                        "u", "v", "w", "x", "y", "z"
                      ]

    # List to hold results
    clean_overview_list = []
    for item in overview_list:
        # force item to be treated as a string
        item =str(item)
        
        # Split into a list of words
        words_list = item.split()
        
        temp_list = []
        # Remove any punctuation
        for word in words_list:
            # force item to be treated as a string
            word = str(word)
            word = word.replace(",", "")
            word = word.replace(".", "")
            word = word.replace("\"", "")
            word = word.replace("-", "")
            word = word.replace(")", "")
            word = word.replace("(", "")
            temp_list.append(word)
        
        words_list = temp_list

        # Create a new list of filtered words
        filtered_words_list = [word for word in words_list if word.lower() not in words_to_remove]
        
        clean_overview_list.append(filtered_words_list)

    return clean_overview_list

data["genres"] = clean_genres(data["genres"])
data["overview"] = clean_overview(data["overview"])

# Print out the first record again
print("Data after cleaning: ")
print(data['title'][0])
print(data['genres'][0])
print(data['overview'][0], "\n\n")

Data before cleaning: 
Avatar
[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. 


Data after cleaning: 
Avatar
['Action', 'Adventure', 'Fantasy', 'Science Fiction']
['22nd', 'century', 'paraplegic', 'Marine', 'dispatched', 'moon', 'Pandora', 'unique', 'mission', 'becomes', 'torn', 'between', 'following', 'orders', 'protecting', 'alien', 'civilization'] 




In [3]:
def find_unique_words(input_data):
    '''
        Here we need to find all unique words in the input data.
        In this case we will be past a list of words lists.
    '''
    unique_words = {}
    unique_word_index = 0
    for word_list in input_data:
        for word in word_list:
            # First check if dict is not empty
            if unique_words:
                # Not Empty, so we must check if the current word is in the unique_words dict.
                if word not in list(unique_words.values()):
                    # Increment the index
                    unique_word_index = unique_word_index + 1
                    unique_words[unique_word_index] = word
            else:
                # Empty, add fist item to dict. So index should be {1:'word'}
                # Increment the index
                unique_word_index = unique_word_index + 1
                unique_words[unique_word_index] = word
    
    # return the unique words dict and the unique word_index value
    return unique_words, unique_word_index

# Next, we need to figure out how many unique words/genres are in overview and genres
unique_words, unique_words_count = find_unique_words(data['overview'])
unique_genres, unique_genres_count = find_unique_words(data['genres'])

print("Total unique words in overview:", unique_words_count)
print("Total unique genres in genre:", unique_genres_count, "\n\n")

Total unique words in overview: 27526
Total unique genres in genre: 20 




In [5]:
# Our next step is to encode the genres and the overview columns. This should be a bit tricky...
# Let's copy our data just in case :)

def encode_data(data, size, name):
    '''
        This function needs to encode (vectorize) a given data set.
        It will return the encoded data set.
        data is the data we will encode.
        size is the number of unique words/items
        name is the name of the data inside "data".
    '''
    
    encoded_data = []
    for item in data[name]:
        temp = []
        for word in item:
            item = one_hot(word, size)
            if item:
                temp.append(item[0])
        encoded_data.append(temp)
    
    return encoded_data

data_copy = deepcopy(data)

# Now we encode the data
encoded_overview = encode_data(data_copy, unique_words_count, "overview")
encoded_genres = encode_data(data_copy, unique_genres_count, "genres")

print("First encoded overview item:", encoded_overview[0])
print("First encoded genres item:", encoded_genres[0])

First encoded overview item: [11453, 5907, 20345, 20881, 16530, 6804, 14476, 11612, 4003, 17955, 3403, 8024, 24165, 9780, 1714, 9430, 752]
First encoded genres item: [1, 15, 16, 16]
