In [210]:
import ot
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import random
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from numpy.linalg import norm

### DATASETS

In [2]:
# Movie table of the ml-20m dataset (movieId, title, genres).
movies = pd.read_csv("../DATASETS/ml-20m/movies.csv")

In [3]:
# Tag-genome scores: one relevance value per (movieId, tagId) pair.
genome_scores = pd.read_csv("../DATASETS/ml-20m/genome-scores.csv")

In [4]:
# Tag-genome vocabulary: maps each tagId to its tag text.
genome_tags = pd.read_csv("../DATASETS/ml-20m/genome-tags.csv")

In [5]:
# User ratings of the ml-20m dataset (joined to movies on movieId further down).
ratings = pd.read_csv("../DATASETS/ml-20m/ratings.csv")

In [7]:
# User tag records (userId, movieId, tag, timestamp).
tags = pd.read_csv("../DATASETS/ml-20m/tags.csv")

In [8]:
# Links table of the ml-20m dataset (its schema is not shown in this part of the notebook).
links = pd.read_csv("../DATASETS/ml-20m/links.csv")

In [50]:
# Attach each rating to its movie's metadata; the inner join keeps only
# the movieIds that appear in both tables.
movies_ratings_data = pd.merge(movies, ratings, how="inner", on="movieId")

### Binarisation des données

In [None]:
# Remove movies that have no tag

In [12]:
# Per-column flag: does the column contain at least one missing value?
tags.isna().any(axis=0)

userId       False
movieId      False
tag           True
timestamp    False
dtype: bool

In [13]:
# Discard every tag record that has a missing value in any column.
tags = tags.dropna(axis=0, how="any")

In [15]:
# Remove users without any interaction, i.e. without ratings.
# Nothing to do: they have all been removed already.

### I - Matrice de coût

In [105]:
# Preview the first five rated movies of user 1.
movies_ratings_data.loc[movies_ratings_data["userId"] == 1].head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
49695,2,Jumanji (1995),Adventure|Children|Fantasy,1,3.5,1112486027
346885,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,1,3.5,1112484676
366121,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,1,3.5,1112484819
526043,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,3.5,1112484727
582574,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,3.5,1112484580


In [117]:
# Label every (movieId, tagId, relevance) score with the text of its tag.
genome = pd.merge(genome_scores, genome_tags, how="inner", on="tagId")

In [115]:
# Display the merged genome table (pandas truncates the multi-million-row output).
genome

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.02500,007
1,2,1,0.03975,007
2,3,1,0.04350,007
3,4,1,0.03725,007
4,5,1,0.04200,007
...,...,...,...,...
11709763,130578,1128,0.01325,zombies
11709764,130840,1128,0.13375,zombies
11709765,131013,1128,0.01625,zombies
11709766,131168,1128,0.01725,zombies


In [350]:
# Sample for user 1: the first five rows of their ratings, re-indexed from 0.
user1 = (
    movies_ratings_data
    .loc[lambda frame: frame["userId"] == 1]
    .head(5)
    .reset_index(drop=True)
)

In [351]:
# Sample for user 2: the first seven rows of their ratings, re-indexed from 0.
user2 = (
    movies_ratings_data
    .loc[lambda frame: frame["userId"] == 2]
    .head(7)
    .reset_index(drop=True)
)

In [352]:
# Display the five-movie sample kept for user 1.
user1

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,2,Jumanji (1995),Adventure|Children|Fantasy,1,3.5,1112486027
1,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,1,3.5,1112484676
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,1,3.5,1112484819
3,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,3.5,1112484727
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,3.5,1112484580


In [353]:
# Display the seven-movie sample kept for user 2.
user2

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,3,Grumpier Old Men (1995),Comedy|Romance,2,4.0,974820889
1,62,Mr. Holland's Opus (1995),Drama,2,5.0,974820598
2,70,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,2,5.0,974820691
3,110,Braveheart (1995),Action|Drama|War,2,4.0,974820658
4,242,Farinelli: il castrato (1994),Drama|Musical,2,3.0,974820776
5,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,2,5.0,974821014
6,266,Legends of the Fall (1994),Drama|Romance|War|Western,2,5.0,974820748


In [363]:
def cost_matrix(ratings_a, ratings_b, scores):
    """Build the cosine-distance cost matrix between two users' movies.

    Every movie is represented by the vector of its tag-genome relevance
    scores. Entry (i, j) of the result is ``1 - cosine_similarity(v_i, v_j)``
    where ``v_i`` belongs to the i-th movie of ``ratings_a`` and ``v_j`` to
    the j-th movie of ``ratings_b``.

    Parameters
    ----------
    ratings_a, ratings_b : pandas.DataFrame
        One row per rated movie; must provide ``movieId`` and ``title``.
    scores : pandas.DataFrame
        Tag-genome scores with ``movieId``, ``tagId`` and ``relevance``.

    Returns
    -------
    pandas.DataFrame
        Shape ``(len(ratings_a), len(ratings_b))``; rows are labelled with the
        titles of ``ratings_a`` and columns with the titles of ``ratings_b``.

    Raises
    ------
    ValueError
        If at least one movie has no genome scores, since no distance can be
        computed for it.
    """
    # Read movieId by column name rather than by position in the row.
    ids_a = ratings_a["movieId"].to_numpy()
    ids_b = ratings_b["movieId"].to_numpy()
    wanted_ids = np.concatenate([ids_a, ids_b])

    # Scan the large score table once and build one vector per movie,
    # instead of filtering it twice for each of the n * s movie pairs.
    # Sorting by tagId keeps the components of all vectors aligned.
    subset = scores[scores["movieId"].isin(wanted_ids)]
    vectors = {
        movie_id: group.sort_values("tagId")["relevance"].to_numpy()
        for movie_id, group in subset.groupby("movieId")
    }

    missing = sorted(set(wanted_ids.tolist()) - set(vectors))
    if missing:
        raise ValueError(f"no genome scores for movieId(s): {missing}")

    costs = np.zeros((len(ids_a), len(ids_b)))
    for i, id_a in enumerate(ids_a):
        for j, id_b in enumerate(ids_b):
            # scipy's cosine() expects 1-D vectors and returns the cosine
            # distance, i.e. 1 - cosine similarity.
            costs[i, j] = cosine(vectors[id_a], vectors[id_b])

    return pd.DataFrame(
        data=costs,
        index=list(ratings_a["title"].values),
        columns=list(ratings_b["title"].values),
    )


# Number of movies in each user's sample.
n = len(user1)
s = len(user2)

# Cost matrix between the preferences of users 1 and 2:
# M[i, j] = 1 - cosine_similarity(v_i, v_j), where v_i and v_j are the
# tag-genome vectors of user 1's i-th movie and user 2's j-th movie.
M = cost_matrix(user1, user2, genome_scores)

In [364]:
# Display the cost matrix (rows: user 1's movies, columns: user 2's movies).
M

Unnamed: 0,Grumpier Old Men (1995),Mr. Holland's Opus (1995),From Dusk Till Dawn (1996),Braveheart (1995),Farinelli: il castrato (1994),Star Wars: Episode IV - A New Hope (1977),Legends of the Fall (1994)
Jumanji (1995),0.313482,0.347761,0.448752,0.364774,0.469818,0.301235,0.430599
"City of Lost Children, The (Cité des enfants perdus, La) (1995)",0.421913,0.423167,0.450774,0.362767,0.315885,0.288331,0.408456
Twelve Monkeys (a.k.a. 12 Monkeys) (1995),0.405523,0.419272,0.417914,0.33154,0.383743,0.279748,0.419714
Seven (a.k.a. Se7en) (1995),0.386789,0.41521,0.300731,0.299987,0.384674,0.342937,0.410744
"Usual Suspects, The (1995)",0.344711,0.352174,0.314965,0.254655,0.354001,0.275666,0.361112


### II - Plan de transport