# Nettoyage movies_tags

In [1]:
import pandas as pd

In [2]:
df_movies_tags = pd.read_csv('../src/data/interim/movies_tags.csv')
df_movies_tags.head()

Unnamed: 0,movieId,title,tags
0,1,Toy Story (1995),"adventure, animation, children, comedy, fantas..."
1,2,Jumanji (1995),"adventure, children, fantasy, scary, see also:..."
2,3,Grumpier Old Men (1995),"comedy, romance, moldy, walter matthau, howard..."
3,4,Waiting to Exhale (1995),"comedy, drama, romance, revenge, characters, c..."
4,5,Father of the Bride Part II (1995),"comedy, childhood classics, worst movies ever,..."


In [3]:
df_movies_tags.dtypes

movieId     int64
title      object
tags       object
dtype: object

In [4]:
df_movies_tags['tags'] = df_movies_tags['tags'].apply(lambda x: x.split(','))


df_movies_tags.head(2).values

array([[1, 'Toy Story (1995)',
        list(['adventure', ' animation', ' children', ' comedy', ' fantasy', ' buy', ' soothing', ' humorous', ' usa', ' ya boy', ' watched', ' national film registry', ' time travel', ' dolls', ' villian hurts toys', ' very good', ' first cgi film', ' engaging', ' pixar animation', ' 2009 reissue in stereoscopic 3-d', ' fanciful', ' want', ' dvd-video', ' cute', ' computer animation', ' heroic mission', ' avi', ' unlikely friendships', " erlend's dvds", ' action figure', ' rated-g', " tumey's vhs", ' kids movie', ' tã\x83â©a leoni does not star in this movie', ' ã©ë\x9câ®ã¤â¸â\x82¬ã©â¸â£', ' action figures', ' imdb top 250', ' story', ' cg animation', ' friendship', ' family', ' want to see again', ' family film', ' warm', ' animated', ' toys come to life', ' classic', ' daring rescues', ' tim allen', ' voice acting', ' 3d', ' innovative', ' lots of heart', ' tom hanks', ' kids and family', ' 55 movies every kid should see--entertainment weekly', ' cleve

In [5]:
import re

def clean_tags(tags_list):
    clean_tags_list = []
    for tag in tags_list:
        # Vérifier si le tag contient des caractères spéciaux autres que l'espace
        if not any(re.search(r'[^\w\s_-]', word) for word in tag):
            clean_tags_list.append(tag.strip())
    clean_tags_list = list(set(clean_tags_list))
    return clean_tags_list

# Appliquer la fonction clean_tags à la colonne 'tags' du DataFrame
df_movies_tags['tags'] = df_movies_tags['tags'].apply(clean_tags)

# Afficher les tags nettoyés
print(df_movies_tags.head(1).values)



[[1 'Toy Story (1995)'
  list(['national film registry', 'friendship', 'disney animated feature', 'very good', 'comedy', 'disney', 'cgi', 'tim allen', 'engaging', 'animated', 'toys', 'light', 'cartoon', 'lots of heart', 'adventure', 'rated-g', 'story', 'usa', 'want to see again', '55 movies every kid should see--entertainment weekly', 'villian hurts toys', 'the boys', 'action figure', 'heroic mission', 'avi', 'innovative', 'bright', 'john lasseter', 'classic', 'pixar animation', 'dolls', 'first cgi film', 'ya boy', 'warm', 'family film', 'time travel', '2009 reissue in stereoscopic 3-d', 'kids movie', 'fun', 'toys come to life', 'animation', 'voice acting', 'cute', 'witty', 'unlikely friendships', 'soothing', 'humorous', 'buy', '3d', 'clever', 'buzz lightyear', 'kids and family', 'buddy movie', 'children', 'fanciful', 'dvd-video', 'watched', 'almost favorite', 'pixar', 'daring rescues', 'computer animation', 'tom hanks', 'toy', 'clv', 'rousing', 'fantasy', 'family', 'action figures', '

In [6]:
df_movies_tags['tags'] = df_movies_tags['tags'].apply(lambda x: ', '.join(x))

In [7]:
df_movies_tags

Unnamed: 0,movieId,title,tags
0,1,Toy Story (1995),"national film registry, friendship, disney ani..."
1,2,Jumanji (1995),"time, bad cgi, childhood recaptured, robin wil..."
2,3,Grumpier Old Men (1995),"moldy, no_fa_ganes, romance, sequel, old, good..."
3,4,Waiting to Exhale (1995),"romance, characters, comedy, clv, drama, girli..."
4,5,Father of the Bride Part II (1995),"sequel, touching, diane keaton, good sequel, f..."
...,...,...,...
27273,131254,Kein Bund fÃ¼r's Leben (2007),comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",comedy
27275,131258,The Pirates (2014),"adventure, mutiny, pirates, korea, bandits, whale"
27276,131260,Rentun Ruusu (2001),


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

matrice_tfidf = tfidf.fit_transform(df_movies_tags['tags'])

print(matrice_tfidf.shape)

(27278, 21324)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

sim_cosinus = cosine_similarity(matrice_tfidf, matrice_tfidf)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

sim_euclidienne = 1 / (1 + euclidean_distances(matrice_tfidf))

In [11]:
indices = pd.Series(range(0,len(df_movies_tags)), index=df_movies_tags.title)

In [12]:
def recommandations(titre, cos_sim, num_recommendations=10):
    idx = indices[titre]
    scores_similarite = list(enumerate(cos_sim[idx]))
    scores_similarite = sorted(scores_similarite, key=lambda x: x[1], reverse=True)
    top_similair = scores_similarite[1:num_recommendations+1]
    res = [(indices.index[idx], score) for idx, score in top_similair]
    return res


In [15]:
print("Recommandations pour 'Interstellar' similarité cosinus: \n",recommandations("Seven (a.k.a. Se7en) (1995)", sim_cosinus))

Recommandations pour 'Interstellar' similarité cosinus: 
 [('Employee of the Month (2004)', 0.3211485751623087), ('Mother Knows Best (1997)', 0.28980797236729006), ('Kiss the Girls (1997)', 0.2825552689514304), ('Passengers (2008)', 0.2700010803190702), ('Sixth Sense, The (1999)', 0.2629931310474234), ('High Crimes (2002)', 0.2583394861676635), ('Usual Suspects, The (1995)', 0.2566326114482667), ('No Country for Old Men (2007)', 0.25467941389373927), ('Fight Club (1999)', 0.25289941371243174), ('Shutter Island (2010)', 0.24830867960706848)]
