In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import os
import requests
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from PIL import Image
from io import BytesIO

In [2]:
# Working directory of the kernel; used further down as the initial folder of the
# credentials file dialog. The original followed this with os.chdir(current_path),
# which changes into the directory the process is already in (a no-op), so it was removed.
current_path = os.getcwd()

# 1. Lectura de Datos

In [3]:
# Legacy OMDB export of the Top 250 list; the first CSV column is skipped.
target_movie_df_0 = (
    pd.read_csv('Data/IMDB_Top250Engmovies2_OMDB_Detailed.csv')
    .iloc[:, 1:]
)
target_movie_df_0.head(n=2)

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True


## 1.1 Actualización de Datos

In [4]:
# Read the file holding the API credentials

# Import only the names that are used. The original wildcard `from tkinter import *`
# also brought tkinter's own `Image` class into the namespace, shadowing the
# `Image` imported from PIL in the first cell (needed later for `Image.open`).
from tkinter import Tk, filedialog

root = Tk()
root.withdraw()                      # hide the empty root window, only the dialog is wanted
root.wm_attributes('-topmost', 1)    # keep the dialog above the browser/notebook window

filepath = filedialog.askopenfilename(parent=root, initialdir=current_path)
root.destroy()                       # release the hidden window once the dialog is closed

if not filepath:
    # An empty value means the dialog was cancelled; the original then failed inside open('').
    raise FileNotFoundError('No credentials file was selected.')

# Context manager so the handle is closed (the original left the file open).
# The JSON is expected to provide the keys 'API_KEY' and 'HOST' read by movie_info.
with open(filepath, 'r') as file:
    credentials = json.load(file)

In [5]:
# Query functions for the IMDB API (RapidAPI "online-movie-database")

def _imdb_get(endpoint, params):
    """Call one endpoint of the RapidAPI IMDB service and return the decoded JSON body."""
    headers = {
        "X-RapidAPI-Key": credentials['API_KEY']
        ,"X-RapidAPI-Host": credentials['HOST']
    }
    url = "https://online-movie-database.p.rapidapi.com/" + endpoint
    return json.loads(requests.request("GET", url, headers=headers, params=params).text)


def _first(values):
    """Return the first element of `values`, or None when it is missing or empty."""
    return next(iter(values or []), None)


def _names(people):
    """Return the 'name' field of every entry of a credits list (missing list -> [])."""
    return [person.get('name') for person in (people or [])]


def _parse_release_date(raw):
    """Parse 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD...' into a datetime (missing parts -> 01).

    Returns pd.NaT when the API gave no release date.
    """
    if not raw:
        return pd.NaT
    parts = raw[:10].split('-')
    parts += ['01'] * (3 - len(parts))
    return datetime.strptime('-'.join(parts), '%Y-%m-%d')


def movie_info(ID):
    """Fetch the details of one movie as a single-row DataFrame.

    Parameters
    ----------
    ID : str
        Either an IMDB title id (a string starting with 'tt') or a free-text
        title. A free-text title is resolved through the auto-complete endpoint
        and the first hit whose type is 'movie' is used.

    Returns
    -------
    pandas.DataFrame
        One row with the columns id, title, year, rated, released, runtime,
        genre, directors, cinematographer, writers, actors, colorations, plot,
        language, country, poster, imdb_rating, imdb_count, metascore and
        meta_count. Fields the API does not provide are None (NaT for
        `released`).
    str
        The message 'Error: Solo cadenas de texto.' when `ID` is not a string.

    Raises
    ------
    IndexError
        When a title search returns no movie. The exception type is kept from
        the original implementation, which failed on an empty list lookup.

    Notes
    -----
    Reads the module-level `credentials` dict (keys 'API_KEY' and 'HOST').
    """
    # The original tested type('a'), i.e. the literal, so this guard could never fire.
    if not isinstance(ID, str):
        return('Error: Solo cadenas de texto.')

    if ID[:2] != 'tt':
        # Free-text search: resolve the title to an id, then fetch that id.
        response = _imdb_get("auto-complete", {"q": ID})
        movie_ids = [hit.get('id') for hit in response.get('d', []) if hit.get('qid') == 'movie']
        if not movie_ids:
            raise IndexError('No movie found for: ' + ID)
        return movie_info(movie_ids[0])

    meta = _imdb_get("title/get-meta-data", {"ids": ID, "region": "US"}).get(ID, {})
    full_credits = _imdb_get("title/get-full-credits", {"tconst": ID})
    versions = _imdb_get("title/get-versions", {"tconst": ID})
    plots = _imdb_get("title/get-plots", {"tconst": ID})

    title = meta.get('title', {})
    crew = full_credits.get('crew', {})
    ratings = meta.get('ratings', {})
    metacritic = meta.get('metacritic', {})

    record = {
        'id': ID,
        'title': title.get('title'),
        'year': title.get('year'),
        'rated': meta.get('certificate'),
        'released': _parse_release_date(meta.get('releaseDate')),
        'runtime': title.get('runningTimeInMinutes'),
        'genre': meta.get('genres'),
        'directors': _names(crew.get('director')),
        'cinematographer': _names(crew.get('cinematographer')),
        'writers': _names(crew.get('writer')),
        'actors': _names(full_credits.get('cast')),
        'colorations': _first(versions.get('colorations')),
        # `or {}` guards against a title without any plot entry
        'plot': (_first(plots.get('plots')) or {}).get('text'),
        'language': _first(versions.get('spokenLanguages')),
        'country': _first(versions.get('origins')),
        'poster': title.get('image', {}).get('url'),
        'imdb_rating': ratings.get('rating'),
        'imdb_count': ratings.get('ratingCount'),
        'metascore': metacritic.get('metaScore'),
        'meta_count': metacritic.get('reviewCount'),
    }

    # Built in one step: DataFrame.append (used originally) was removed in pandas 2.0.
    return pd.DataFrame([record])

In [6]:
# Load the stored Top 250 frame and discard the two columns that are not used below.
target_movie_df = (
    pd.read_pickle('Data/Top250Movies.pickle')
    .drop(columns=['cinematographer', 'plot'])
)
target_movie_df.head(n=2)

Unnamed: 0,id,title,year,rated,released,runtime,genre,directors,writers,actors,colorations,language,country,poster,imdb_rating,imdb_count,metascore,meta_count
0,tt0111161,The Shawshank Redemption,1994.0,R,1994-10-14,142.0,[Drama],[Frank Darabont],"[Stephen King, Frank Darabont]","[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",color,en,US,https://m.media-amazon.com/images/M/MV5BMDFkYT...,9.3,2663893.0,81.0,21.0
0,tt0068646,The Godfather,1972.0,R,1972-03-24,175.0,"[Crime, Drama]",[Francis Ford Coppola],"[Mario Puzo, Francis Ford Coppola, Mario Puzo]","[Marlon Brando, Al Pacino, James Caan, Richard...",color,en,US,https://m.media-amazon.com/images/M/MV5BM2MyNj...,9.2,1846042.0,100.0,16.0


## 1.2 Revisión de Nulos

In [7]:
# Share of null values per column, highest first

target_movie_df.isnull().mean().sort_values(ascending=False)

meta_count     0.064
metascore      0.064
title          0.000
imdb_count     0.000
imdb_rating    0.000
poster         0.000
country        0.000
language       0.000
colorations    0.000
id             0.000
writers        0.000
directors      0.000
genre          0.000
runtime        0.000
released       0.000
rated          0.000
year           0.000
actors         0.000
dtype: float64

In [8]:
# Estimate the missing Metacritic values (metascore, meta_count) with linear regressions

from sklearn.linear_model import LinearRegression

lr = LinearRegression()

target_movie_df_null = target_movie_df[['id', 'imdb_rating', 'imdb_count', 'year', 'metascore', 'meta_count']]

missing_metascore = target_movie_df_null['metascore'].isnull()

# Explicit copies: the predictions are assigned to testdf below, and writing onto a
# slice of another frame is the chained-assignment pattern pandas warns about.
testdf = target_movie_df_null[missing_metascore].copy()
traindf = target_movie_df_null[~missing_metascore].copy()

# Guard so the cell can be re-run once the nulls are filled; predicting on an
# empty frame raises in scikit-learn.
if not testdf.empty:

    # metascore estimated from year and IMDB rating, rounded down
    lr.fit(traindf[['year', 'imdb_rating']], traindf['metascore'])
    testdf['metascore'] = np.floor(lr.predict(testdf[['year', 'imdb_rating']]))

    # meta_count estimated from IMDB rating and vote count, rounded down
    lr.fit(traindf[['imdb_rating', 'imdb_count']], traindf['meta_count'])
    testdf['meta_count'] = np.floor(lr.predict(testdf[['imdb_rating', 'imdb_count']]))

# Bring the estimates back by id; only the two estimated columns are merged so no
# other duplicated "_y" columns have to be cleaned up afterwards.
target_movie_df = target_movie_df.merge(testdf[['id', 'metascore', 'meta_count']]
                                        , on='id'
                                        , how='left'
                                        , suffixes=('', '_y')
                                       )

# Existing values win; the estimate is used only where the original is null
target_movie_df['metascore'] = target_movie_df['metascore'].combine_first(target_movie_df['metascore_y'])
target_movie_df['meta_count'] = target_movie_df['meta_count'].combine_first(target_movie_df['meta_count_y'])

target_movie_df = target_movie_df.drop(columns=['metascore_y', 'meta_count_y'])

target_movie_df.isnull().mean().sort_values(ascending=False)

id             0.0
title          0.0
metascore      0.0
imdb_count     0.0
imdb_rating    0.0
poster         0.0
country        0.0
language       0.0
colorations    0.0
actors         0.0
writers        0.0
directors      0.0
genre          0.0
runtime        0.0
released       0.0
rated          0.0
year           0.0
meta_count     0.0
dtype: float64

## 1.3 Feature Engineering

In [9]:
# Days elapsed since the premiere

# One reference timestamp for all rows. The original called datetime.now() once per
# row, so rows processed across a day boundary could be measured against different days.
reference_now = datetime.now()

target_movie_df['days_since_premiere'] = target_movie_df['released'].apply(lambda x: (reference_now-x).days)
target_movie_df = target_movie_df.drop(columns=['released'])

In [10]:
# Indicator columns for the 5 most frequent genres

# Only the first 6 entries of each list are considered, as in the original cell.
# dict.fromkeys removes repeats so a movie counts at most once per genre.
genre_lists = target_movie_df['genre'].apply(lambda x: list(dict.fromkeys(x[:6])))

# Number of movies per genre, most frequent first.
# The original sorted the unique names and dropped the first one to get rid of the
# 'nan' placeholder, but 'nan' sorts after capitalised names, so the alphabetically
# first real genre was discarded instead. Counting the exploded lists needs no placeholder.
genre_counts = genre_lists.explode().dropna().value_counts()

genres = sorted(genre_counts.index)
top_5 = list(genre_counts.index[:5])

for i in top_5:
    target_movie_df['gen_' + i] = genre_lists.apply(lambda x: 1 if i in x else 0)

target_movie_df = target_movie_df.drop(columns=['genre'])

In [None]:
# Indicator columns for the 5 most frequent directors

# Only the first 6 entries of each list are considered, as in the original cell.
# dict.fromkeys removes repeats so a movie counts at most once per director.
director_lists = target_movie_df['directors'].apply(lambda x: list(dict.fromkeys(x[:6])))

# Number of movies per director, most frequent first.
# The original sorted the unique names and dropped the first one to get rid of the
# 'nan' placeholder, but 'nan' sorts after capitalised names, so the alphabetically
# first real director was discarded instead.
director_counts = director_lists.explode().dropna().value_counts()

directors = sorted(director_counts.index)
top_5 = list(director_counts.index[:5])

for i in top_5:
    target_movie_df['dir_' + i] = director_lists.apply(lambda x: 1 if i in x else 0)

target_movie_df = target_movie_df.drop(columns=['directors'])

In [None]:
# Indicator columns for the 5 most frequent writers

# The original extracted up to 28 positions per movie but only tested positions 1-6
# when building the indicators, so its effective behaviour (first 6) is kept here.
# NOTE(review): confirm whether every credited writer should count instead.
# dict.fromkeys removes repeats so a movie counts at most once per writer.
writer_lists = target_movie_df['writers'].apply(lambda x: list(dict.fromkeys(x[:6])))

# Number of movies per writer, most frequent first.
# The original sorted the unique names and dropped the first one to get rid of the
# 'nan' placeholder, but 'nan' sorts after capitalised names, so the alphabetically
# first real writer was discarded instead.
writer_counts = writer_lists.explode().dropna().value_counts()

writers = sorted(writer_counts.index)
top_5 = list(writer_counts.index[:5])

for i in top_5:
    target_movie_df['wri_' + i] = writer_lists.apply(lambda x: 1 if i in x else 0)

target_movie_df = target_movie_df.drop(columns=['writers'])

In [None]:
# Indicator columns for the 10 most frequent actors

# The original extracted up to 28 positions per movie but only tested positions 1-6
# when building the indicators, so its effective behaviour (first 6) is kept here.
# NOTE(review): confirm whether the full cast should count instead.
# dict.fromkeys removes repeats so a movie counts at most once per actor.
actor_lists = target_movie_df['actors'].apply(lambda x: list(dict.fromkeys(x[:6])))

# Number of movies per actor, most frequent first. Two defects of the original are avoided:
#  * it sorted the unique names and dropped the first one to remove the 'nan'
#    placeholder, but 'nan' sorts after capitalised names, so a real actor was dropped;
#  * it picked the top columns with the substring test 'a_' in name, which also
#    matches 'meta_count'; that column took one of the 10 slots and its copy was
#    then dropped, leaving 9 actor columns.
actor_counts = actor_lists.explode().dropna().value_counts()

actors = sorted(actor_counts.index)
top_10 = list(actor_counts.index[:10])

for i in top_10:
    target_movie_df['act_' + i] = actor_lists.apply(lambda x: 1 if i in x else 0)

target_movie_df = target_movie_df.drop(columns=['actors'])

# 2. Análisis Descriptivo

## 2.1 Variables Categóricas

In [None]:
# Column names, dtypes and non-null counts after feature engineering
target_movie_df.info()

In [None]:
# Summary (count, unique, top, freq) of the object-typed columns
target_movie_df.select_dtypes(include=['object']).describe()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Count plots of the four raw categorical columns on a 2x2 grid (filled row by row)
fig, ax = plt.subplots(2, 2, figsize=(24, 12))

for column, panel in zip(['rated', 'colorations', 'language', 'country'], ax.flat):
    sns.countplot(x=target_movie_df[column], palette='crest', ax=panel)
    panel.set_title(column)

plt.show()

In [None]:
# Map the legacy rating labels onto the current ones; any other label is kept as is
legacy_ratings = {'Passed': 'PG', 'Approved': 'PG-13', 'GP': 'G'}
target_movie_df['rated'] = target_movie_df['rated'].map(lambda x: legacy_ratings.get(x, x))

# Replace language, country of origin and coloration with 0/1 flags
target_movie_df['english_movie'] = target_movie_df['language'].map(lambda x: int(x == 'en'))
target_movie_df['US_movie'] = target_movie_df['country'].map(lambda x: int(x == 'US'))
target_movie_df['color'] = target_movie_df['colorations'].map(lambda x: int(x == 'color'))

target_movie_df = target_movie_df.drop(['colorations', 'language', 'country'], axis=1)

In [None]:
# Count plots of the recoded categorical columns on a 2x2 grid (filled row by row)
fig, ax = plt.subplots(2, 2, figsize=(24, 12))

for column, panel in zip(['rated', 'english_movie', 'US_movie', 'color'], ax.flat):
    sns.countplot(x=target_movie_df[column], palette='crest', ax=panel)
    panel.set_title(column)

plt.show()

In [None]:
# Box plots of the IMDB rating for each categorical column

# Relabel non-string 1/0 values as 'si'/'no' so the flags are drawn as categories.
# Computed once: the original rebuilt this whole frame inside the loop for every panel.
labelled_df = target_movie_df.applymap(lambda x: 'si' if (x == 1) & (type(x).__name__ != 'str')
                                       else 'no' if (x == 0) & (type(x).__name__ != 'str') else x)

fig, ax = plt.subplots(2, 2, figsize=(24, 12))

for column, panel in zip(['rated', 'english_movie', 'US_movie', 'color'], ax.flat):
    sns.boxplot(data=labelled_df, x='imdb_rating', y=column, palette='crest', ax=panel)
    panel.set_title(column)

plt.show()

In [None]:
# Box plots of the number of IMDB votes for each categorical column

# Relabel non-string 1/0 values as 'si'/'no' so the flags are drawn as categories.
# Computed once: the original rebuilt this whole frame inside the loop for every panel.
labelled_df = target_movie_df.applymap(lambda x: 'si' if (x == 1) & (type(x).__name__ != 'str')
                                       else 'no' if (x == 0) & (type(x).__name__ != 'str') else x)

fig, ax = plt.subplots(2, 2, figsize=(24, 12))

for column, panel in zip(['rated', 'english_movie', 'US_movie', 'color'], ax.flat):
    sns.boxplot(data=labelled_df, x='imdb_count', y=column, palette='crest', ax=panel)
    panel.set_title(column)

plt.show()

Para las cuatro variables categóricas utilizadas se observa una gran concentración de películas en inglés, así como de películas americanas. También hay una mayor concentración de películas a color, aunque este rubro no está tan concentrado. Al revisar la distribución del score de acuerdo con cada categoría, se observa una menor concentración de valoraciones en películas admitidas para públicos jóvenes con acompañamiento, mientras que las otras categorías tienen un comportamiento más similar. Para revisar la popularidad se toma el número de reviews, donde se ve un número menor para películas PG y películas en blanco y negro.

## 2.2 Variables Cuantitativas

In [None]:
# Summary statistics of the numeric columns
target_movie_df.select_dtypes(include=['float64','int']).describe()

In [None]:
# Correlation matrix of the numeric columns, coloured from cool (low) to warm (high)
target_movie_df.select_dtypes(include=['float64','int']).corr().style.background_gradient(cmap='coolwarm')

In [None]:
# Histograms of the numeric features, split by high / low IMDB rating

target_movie_df_2 = target_movie_df.copy()

# 'alto' when the rating is above 8.2, 'bajo' otherwise
target_movie_df_2['imbd_alto'] = target_movie_df_2['imdb_rating'].apply(lambda x: 'alto' if x > 8.2 else 'bajo')

# The original listed 'metascore' twice, so two of the four panels were identical.
# NOTE(review): 'meta_count' is assumed to be the intended fourth feature - confirm.
hist_columns = ['runtime', 'metascore', 'imdb_count', 'meta_count']

fig, ax = plt.subplots(2, 2, figsize=(24, 12))

for i, panel in zip(hist_columns, ax.flat):
    sns.histplot(data=target_movie_df_2
                 , x=i
                 , hue='imbd_alto'
                 , element='step'
                 , palette = 'crest'
                 , ax=panel
                 )
    # The title carries the correlation of the feature with the IMDB rating
    panel.set_title(i + ' ' + r"$\rho$" + ': ' + str(round(target_movie_df_2[i].corr(target_movie_df_2['imdb_rating']), 4)))

plt.show()

Se puede observar, para las variables numéricas del dataset, que la valoración del público tiene una fuerte correlación con la popularidad de la película. También es interesante la relación de la valoración respecto al tiempo de duración del filme, agrupándose las mejor valoradas entre hora y media y dos horas. También es interesante ver que no hay una gran relación entre lo que valora el público y lo que dice la crítica especializada (Metascore). Por último, la valoración del público en algunos casos puede estar correlacionada con algunos directores, escritores y actores.

## 2.3 Generación de Cluster

Se procede a generar clústeres para ver si hay un patrón en las películas más deseadas.

In [None]:
# Encode the rating as an integer so it can be used for clustering.
# Labels not listed here are coded as 6.
rating_codes = {'Not Rated': 1, 'G': 2, 'PG-13': 3, 'PG': 4, 'R': 5}

target_movie_df['rated'] = target_movie_df['rated'].map(lambda x: rating_codes.get(x, 6))

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column except the identifying ones (id, title, poster)
scaler = StandardScaler()
scaler.fit(target_movie_df.drop(['id', 'title', 'poster'], axis=1))

In [None]:
from sklearn.cluster import KMeans

# Scale the features once; the original repeated scaler.transform for every k
scaled_features = scaler.transform(target_movie_df.drop(columns=['id', 'title', 'poster']))

distortions = []
K = range(1,10)
for k in K:
    # Fixed seed so the elbow curve is the same on every run
    kmeanModel = KMeans(n_clusters=k, random_state=42)
    kmeanModel.fit(scaled_features)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Final cluster definition (k = 6)

# 'cluster_KMEANS' is excluded when it already exists so the cell can be re-run:
# the scaler was fitted without that column.
cluster_features = scaler.transform(
    target_movie_df.drop(columns=['id', 'title', 'poster', 'cluster_KMEANS'], errors='ignore')
)

# Fixed seed so the cluster labels are the same on every run
kmeanModel = KMeans(n_clusters=6, random_state=42)
kmeanModel.fit(cluster_features)

target_movie_df['cluster_KMEANS'] = kmeanModel.predict(cluster_features)

target_movie_df.head(3)

In [None]:
# IMDB rating distribution within each K-Means cluster
sns.boxplot(x='cluster_KMEANS', y='imdb_rating', data=target_movie_df, palette='crest')

# 3. Generación de Funcion TOP 10

In [None]:
def top_10_recomentation(movie_name):
    """Plot the posters of the 10 movies of `target_movie_df` closest to a given movie.

    The movie is fetched with `movie_info`, transformed with the same feature
    engineering applied to `target_movie_df`, assigned to a K-Means cluster and
    compared with the stored movies through a 10-nearest-neighbours search on
    the standardised features.

    Parameters
    ----------
    movie_name : str
        Title or IMDB id, passed unchanged to `movie_info`.

    Returns
    -------
    None
        The recommendations are shown as a matplotlib figure, one poster per
        row, titled with rank, title, IMDB score and Metascore.

    Notes
    -----
    Relies on the module-level `target_movie_df`, `scaler` and `kmeanModel`
    built by the earlier cells, and downloads each poster over the network.
    """
    from sklearn.neighbors import NearestNeighbors

    id_cols = ['id', 'title', 'poster']

    def _flag(values, name):
        # 1 when `name` is one of the listed values; exact match, like the
        # equality test used when the training indicators were built
        return 1 if isinstance(values, (list, tuple)) and name in values else 0

    df_test = movie_info(movie_name)
    df_test = df_test.drop(columns = ['cinematographer', 'plot'])

    now = datetime.now()
    df_test['days_since_premiere'] = df_test['released'].apply(lambda x: (now-x).days)
    df_test = df_test.drop(columns=['released'])

    # Indicator columns are located by prefix. The original used fixed positions
    # (iloc 10:15, 15:20, 20:25, 25:35), which silently pick the wrong columns as
    # soon as the column layout of target_movie_df shifts by one.
    for prefix, source in (('gen_', 'genre'), ('dir_', 'directors'), ('wri_', 'writers'), ('act_', 'actors')):
        for col in [c for c in target_movie_df.columns if c.startswith(prefix)]:
            df_test[col] = df_test[source].apply(lambda x: _flag(x, col[len(prefix):]))

    # Same recoding as applied to target_movie_df
    df_test['rated'] = df_test['rated'].apply(lambda x: 'PG' if x == 'Passed'
                                              else 'PG-13' if x == 'Approved'
                                              else 'G' if x == 'GP' else x
                                             )
    df_test['english_movie'] = df_test['language'].apply(lambda x: 1 if x == 'en' else 0)
    df_test['US_movie'] = df_test['country'].apply(lambda x: 1 if x == 'US' else 0)
    df_test['color'] = df_test['colorations'].apply(lambda x: 1 if x == 'color' else 0)

    df_test = df_test.drop(columns = ['colorations', 'language', 'country', 'genre', 'directors', 'writers', 'actors'])

    df_test['rated'] = df_test['rated'].apply(lambda x: 1 if x == 'Not Rated'
                                              else 2 if x == 'G'
                                              else 3 if x == 'PG-13'
                                              else 4 if x == 'PG'
                                              else 5 if x == 'R'
                                              else 6
                                             )
    df_test = df_test.fillna(0)

    # Select the columns explicitly, in the order of target_movie_df, so the new
    # row always lines up with what the scaler and the cluster model were fitted on.
    neighbour_cols = [c for c in target_movie_df.columns if c not in id_cols]
    feature_cols = [c for c in neighbour_cols if c != 'cluster_KMEANS']

    df_test['cluster_KMEANS'] = kmeanModel.predict(scaler.transform(df_test[feature_cols]))

    # Second scaler: this one also covers the cluster label
    scaler_2 = StandardScaler()
    scaler_2.fit(target_movie_df[neighbour_cols])

    neigh = NearestNeighbors(n_neighbors=10)
    nbrs = neigh.fit(scaler_2.transform(target_movie_df[neighbour_cols]))

    # Only the new movie is queried; the original searched neighbours for every
    # stored movie as well and then read just the last row of the result.
    distances, indices = nbrs.kneighbors(scaler_2.transform(df_test[neighbour_cols]))

    recommended = target_movie_df.iloc[list(indices[-1])]

    f, axarr = plt.subplots(10,1,figsize=(12, 36))

    rows = zip(recommended['title'], recommended['poster'], recommended['imdb_rating'], recommended['metascore'])

    for k, (title, poster, imdb_score, metascore) in enumerate(rows):

        response = requests.get(poster)
        img = Image.open(BytesIO(response.content))

        axarr[k].imshow(img.resize((240,352)))
        axarr[k].set_title(str(k+1) + '. ' + title + ' | IMDB Score: ' + str(imdb_score) + ' | MetaScore: ' + str(metascore))
        axarr[k].axis('off')

    plt.show()