In [1]:
import numpy as np
import pymongo
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
client = pymongo.MongoClient('localhost:27017')

db = client['movie']

movies = db['movie_json']
users = db['user_json']

In [3]:
distinct_genres = movies.aggregate([
    {
        "$project": {
            "genres": {"$split": ["$genres", "|"]}
        }
    },{
        "$unwind": "$genres"
    },
    {
        "$group": {
            "_id": "$genres",
            "count": {"$sum": 1}
        }
    },
    {
        "$sort": {
            "count": -1
        }
    }
])

[genre for genre in distinct_genres]

[{'_id': 'Drama', 'count': 1603},
 {'_id': 'Comedy', 'count': 1200},
 {'_id': 'Action', 'count': 503},
 {'_id': 'Thriller', 'count': 492},
 {'_id': 'Romance', 'count': 471},
 {'_id': 'Horror', 'count': 343},
 {'_id': 'Adventure', 'count': 283},
 {'_id': 'Sci-Fi', 'count': 276},
 {'_id': "Children's", 'count': 251},
 {'_id': 'Crime', 'count': 211},
 {'_id': 'War', 'count': 143},
 {'_id': 'Documentary', 'count': 127},
 {'_id': 'Musical', 'count': 114},
 {'_id': 'Mystery', 'count': 106},
 {'_id': 'Animation', 'count': 104},
 {'_id': 'Western', 'count': 68},
 {'_id': 'Fantasy', 'count': 68},
 {'_id': 'Film-Noir', 'count': 44}]

In [4]:
desired_genres = ['Comedy', 'Horror']
filtered_movies = movies.find({"genres": {"$regex": f".*{'|'.join(desired_genres)}.*"}})

In [5]:
list(filtered_movies)

[{'_id': 1,
  'title': 'Toy Story (1995)',
  'genres': "Animation|Children's|Comedy"},
 {'_id': 3, 'title': 'Grumpier Old Men (1995)', 'genres': 'Comedy|Romance'},
 {'_id': 4, 'title': 'Waiting to Exhale (1995)', 'genres': 'Comedy|Drama'},
 {'_id': 5, 'title': 'Father of the Bride Part II (1995)', 'genres': 'Comedy'},
 {'_id': 7, 'title': 'Sabrina (1995)', 'genres': 'Comedy|Romance'},
 {'_id': 11,
  'title': 'American President, The (1995)',
  'genres': 'Comedy|Drama|Romance'},
 {'_id': 12,
  'title': 'Dracula: Dead and Loving It (1995)',
  'genres': 'Comedy|Horror'},
 {'_id': 19,
  'title': 'Ace Ventura: When Nature Calls (1995)',
  'genres': 'Comedy'},
 {'_id': 21, 'title': 'Get Shorty (1995)', 'genres': 'Action|Comedy|Drama'},
 {'_id': 34, 'title': 'Babe (1995)', 'genres': "Children's|Comedy|Drama"},
 {'_id': 38, 'title': 'It Takes Two (1995)', 'genres': 'Comedy'},
 {'_id': 39, 'title': 'Clueless (1995)', 'genres': 'Comedy|Romance'},
 {'_id': 45, 'title': 'To Die For (1995)', 'genre

In [6]:
movies_ids = movies.find({'genres': {'$regex': 'Comedy|Horror'}}).distinct('_id')
users_ids = users.distinct('_id')

In [7]:
df = pd.DataFrame(0, index=users_ids, columns=movies_ids)
df

Unnamed: 0,1,3,4,5,7,11,12,19,21,34,...,3933,3935,3938,3939,3940,3941,3942,3943,3944,3948
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
films = ['film1', 'film2', 'film3']

In [9]:
idx = 0
for f in films:
    print(idx, f)
    idx += 1

0 film1
1 film2
2 film3


In [10]:
for idx, f in enumerate(films):
    print(idx, f)

0 film1
1 film2
2 film3


In [11]:
for u_id in users_ids:
    user = users.find_one({'_id': u_id})

    for m in user['movies']:
        
        m_id = m['movieid']
        rating = m['rating']

        if m_id in movies_ids:
            df.loc[u_id, m_id] = rating

In [12]:
df

Unnamed: 0,1,3,4,5,7,11,12,19,21,34,...,3933,3935,3938,3939,3940,3941,3942,3943,3944,3948
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,2,0,0,3,0,0,3,5,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df = df[df.sum(axis=1) > 0]
df

Unnamed: 0,1,3,4,5,7,11,12,19,21,34,...,3933,3935,3938,3939,3940,3941,3942,3943,3944,3948
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,2,0,0,3,0,0,3,5,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df_sparse = df.astype(pd.SparseDtype("float", 0))

In [15]:
nmf = NMF(n_components=2)

# Fit the model to the user-item matrix
nmf.fit(df_sparse)

U = nmf.transform(df_sparse)  # User matrix
M = nmf.components_  # Item matrix

pred_matrix = np.dot(U, M)

print(mean_squared_error(df_sparse.values, pred_matrix))

0.43060464516145647


In [16]:
movies_genres = {}

for m_id in movies_ids:
    cur_movie = movies.find_one({'_id': m_id}, {'genres': 1})
    cur_genres = cur_movie['genres']  # Utilisation de crochets [] pour accéder aux valeurs de la clé
    if 'Comedy' in cur_genres and 'Horror' in cur_genres:
        movies_genres[m_id] = 'red'
    elif 'Comedy' in cur_genres:
        movies_genres[m_id] = 'lightgreen'
    else:
        movies_genres[m_id] = 'black'

movies_genres


{1: 'lightgreen',
 3: 'lightgreen',
 4: 'lightgreen',
 5: 'lightgreen',
 7: 'lightgreen',
 11: 'lightgreen',
 12: 'red',
 19: 'lightgreen',
 21: 'lightgreen',
 34: 'lightgreen',
 38: 'lightgreen',
 39: 'lightgreen',
 45: 'lightgreen',
 52: 'lightgreen',
 54: 'lightgreen',
 63: 'lightgreen',
 64: 'lightgreen',
 65: 'lightgreen',
 68: 'lightgreen',
 69: 'lightgreen',
 70: 'red',
 72: 'lightgreen',
 75: 'lightgreen',
 84: 'lightgreen',
 87: 'lightgreen',
 88: 'lightgreen',
 93: 'lightgreen',
 96: 'lightgreen',
 101: 'lightgreen',
 102: 'lightgreen',
 104: 'lightgreen',
 106: 'lightgreen',
 107: 'lightgreen',
 109: 'lightgreen',
 115: 'lightgreen',
 118: 'lightgreen',
 119: 'lightgreen',
 122: 'lightgreen',
 125: 'lightgreen',
 129: 'lightgreen',
 133: 'lightgreen',
 135: 'lightgreen',
 141: 'lightgreen',
 144: 'lightgreen',
 152: 'black',
 153: 'lightgreen',
 156: 'lightgreen',
 157: 'lightgreen',
 166: 'lightgreen',
 171: 'lightgreen',
 174: 'lightgreen',
 176: 'lightgreen',
 177: 'black

In [18]:
# plt.scatter(M[0, :], M[1,:], c=movies_genres.values())
import matplotlib.pyplot as plt
import mplcursors

# Tracer le nuage de points avec les couleurs correspondant aux genres
scatter = plt.scatter(M[0, :], M[1, :], c=list(movies_genres.values()))

# Définir les informations à afficher lors du survol
@scatter.tooltip
def on_hover(sel):
    index = sel.target.index
    movie_id = movies_ids[index]
    movie_title = movies.find_one({'_id': movie_id}, {'title': 1})['title']
    movie_genre = list(movies_genres.values())[index]
    return f"Film : {movie_title}\nGenre : {movie_genre}"

# Afficher le graphique
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatter plot avec infos sur le hover')
mplcursors.cursor(hover=True)
plt.show()


ModuleNotFoundError: No module named 'mplcursors'