in this notebook we will import the IMDB data and preprocess it before saving it into a new CSV that is ready to be used in our code.

In [1]:
import pandas as pd
import datetime as dt

movies = pd.read_csv('../../../Resources/IMDb_movies.csv',low_memory=False)

at first we will drop all movies with rating less than 6.5 as they are less likely to be liked with the user.

In [2]:
movies=movies[pd.notnull(movies['description'])]
movies = movies[movies['avg_vote'] > 5]
movies=movies.reset_index(drop=True)
movies['id']=movies.index

print(movies.shape)

(65754, 23)


exploring the data 

In [None]:
movies.head(5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,10))
corr = movies.corr()
sns.heatmap(data=corr, square=True , annot=True, cbar=True,linewidth=2)

In [3]:
movies['genre'] = movies['genre'].fillna('')
for i in range(movies['genre'].size):
    r=movies['genre'][i].split(",")
    movies['genre'][i] = list(map(str.strip, r))
    

print(movies['genre'][0][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['genre'][i] = list(map(str.strip, r))


In [6]:
import itertools
import numpy as np
print(movies['genre'])
genre =  np.unique([*itertools.chain.from_iterable(movies['genre'])])
print(genre)


0                          [Romance]
1          [Biography, Crime, Drama]
2                            [Drama]
3                   [Drama, History]
4        [Adventure, Drama, Fantasy]
                    ...             
65749                        [Drama]
65750                [Comedy, Drama]
65751                       [Comedy]
65752                [Comedy, Drama]
65753                        [Drama]
Name: genre, Length: 65754, dtype: object
['Action' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'Film-Noir' 'History' 'Horror'
 'Music' 'Musical' 'Mystery' 'News' 'Reality-TV' 'Romance' 'Sci-Fi'
 'Sport' 'Thriller' 'War' 'Western']


In [8]:
np.save("../../../Resources/genre.npy",genre)

we will transalte the actors/direcotrs/genres to numaric values that we can use.

In [None]:
import itertools
import numpy as np

movies['genre'] = movies['genre'].fillna('')
for i in range(movies['genre'].size):
    r=movies['genre'][i].split(",")
    movies['genre'][i] = list(map(str.strip, r))
    

print(movies['genre'][0][0])


movies['writer'] = movies['writer'].fillna('')
for i in range(movies['writer'].size):
    r=movies['writer'][i].split(",")
    movies['writer'][i] = list(map(str.strip, r))
    

print(movies['writer'][0][0])


movies['director'] = movies['director'].fillna('')
for i in range(movies['director'].size):
    r=movies['director'][i].split(",")
    movies['director'][i] = list(map(str.strip, r))
    

print(movies['director'][0][0])

movies['language'] = movies['language'].fillna('')
for i in range(movies['language'].size):
    r=movies['language'][i].split(",")
    movies['language'][i] = list(map(str.strip, r))
    

print(movies['language'][0][0])



In [None]:
print(movies['genre'])
genre =  np.unique([*itertools.chain.from_iterable(movies['genre'])])
print(genre)

print(movies['writer'])
writer =  np.unique([*itertools.chain.from_iterable(movies['writer'])])
print(writer)

print(movies['director'])
director =  np.unique([*itertools.chain.from_iterable(movies['director'])])
print(director)

print(movies['language'])
language =  np.unique([*itertools.chain.from_iterable(movies['language'])])
print(language)

In [None]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

label_encoder.fit(genre)
genre_encoded = movies["genre"].apply(lambda x:label_encoder.transform(x))
movies['encoded_genre'] = genre_encoded

label_encoder.fit(writer)
writer_encoded = movies["writer"].apply(lambda x:label_encoder.transform(x))
movies['writer_encoded'] = writer_encoded


label_encoder.fit(director)
director_encoded = movies["director"].apply(lambda x:label_encoder.transform(x))
movies['director_encoded'] = director_encoded


label_encoder.fit(language)
language_encoded = movies["language"].apply(lambda x:label_encoder.transform(x))
movies['language_encoded'] = language_encoded


In [None]:
movies.head(5)
movies.shape

In [3]:
rating = pd.read_csv('../../../Resources/IMDb_ratings.csv',low_memory=False)
movies = pd.merge(movies, rating, on="imdb_title_id")

In [4]:
rating.columns

Index(['imdb_title_id', 'weighted_average_vote', 'total_votes', 'mean_vote',
       'median_vote', 'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6',
       'votes_5', 'votes_4', 'votes_3', 'votes_2', 'votes_1',
       'allgenders_0age_avg_vote', 'allgenders_0age_votes',
       'allgenders_18age_avg_vote', 'allgenders_18age_votes',
       'allgenders_30age_avg_vote', 'allgenders_30age_votes',
       'allgenders_45age_avg_vote', 'allgenders_45age_votes',
       'males_allages_avg_vote', 'males_allages_votes', 'males_0age_avg_vote',
       'males_0age_votes', 'males_18age_avg_vote', 'males_18age_votes',
       'males_30age_avg_vote', 'males_30age_votes', 'males_45age_avg_vote',
       'males_45age_votes', 'females_allages_avg_vote',
       'females_allages_votes', 'females_0age_avg_vote', 'females_0age_votes',
       'females_18age_avg_vote', 'females_18age_votes',
       'females_30age_avg_vote', 'females_30age_votes',
       'females_45age_avg_vote', 'females_45age_votes',
       

In [None]:
movies.to_csv("../../../Resources/modified_movies.csv")