In [1]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Modèles
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

# Ce dont on a besoin
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV

In [4]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [5]:
# Base directory (relative path) under which the data files are read.
chemin_bd = r"./bd_ignore/"

In [20]:
# Load the merged recommendation table from CSV into the working frame.
df_merge_recom22 = pd.read_csv(chemin_bd+"resultat2/df_merge_recom21.csv")

In [None]:
# Drop four columns from the working frame.
# NOTE(review): 'startYear' is removed here, yet a later cell lists it in
# `caracteristiques` — confirm which of the two is intended.
df_merge_recom22 =df_merge_recom22.drop(columns=['startYear','homepage','overview','poster_path'])

In [None]:
# Replace every missing value, in every column, with the string 'None'.
df_merge_recom22 = df_merge_recom22.fillna('None')

In [22]:
# Split the table by title type. Both subsets are taken before the
# de-duplication and the extra columns created in the cells below, so they do
# not reflect those later changes.
merge_filmR = df_merge_recom22[df_merge_recom22['titleType'] == 'movie']
merge_serieR = df_merge_recom22[df_merge_recom22['titleType'] == 'tvSeries']

In [None]:
# One row per original_title, with all of its primaryName values gathered
# into a list.
agg_df = df_merge_recom22.groupby(['original_title'])['primaryName'].agg(list).reset_index()
agg_df

In [None]:
# Aggregation of primaryName per original_title (already done in the previous cell)
#agg_df = df_merge_recom22.groupby(['original_title'])['primaryName'].agg(list).reset_index()

# Keep the first row of each original_title, then left-merge the aggregated
# lists back so that all original columns are preserved. Both sides carry a
# 'primaryName' column, so the result holds 'primaryName_x' (value from the
# kept row) and 'primaryName_y' (the aggregated list).
df_merge_recom22 = df_merge_recom22.drop_duplicates(subset=['original_title']).merge(agg_df, on='original_title', how='left')

# Turn the comma-separated genres_y string into a list of genre names
df_merge_recom22['genres_liste'] = df_merge_recom22['genres_y'].apply(lambda x: x.split(","))

In [None]:
# Set that will collect every distinct genre (the matching actor set is
# left commented out below)
tous_les_genres = set()
#tous_les_acteurs = set()

# Add the genres of every row to the set
for genres in df_merge_recom22['genres_liste']:
    tous_les_genres.update(genres)

# Actor set update (disabled)
#for acteurs in df_merge_recom22['primaryName_y']:
    #tous_les_acteurs.update(acteurs)

In [None]:
# Create one boolean column per genre, named 'genre_<genre>': True when the
# row's genre list contains that genre
for genre in tous_les_genres:
    df_merge_recom22[f'genre_{genre}'] = df_merge_recom22['genres_liste'].apply(lambda x: genre in x)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# NOTE(review): csr_matrix is imported but not referenced in this cell.
from scipy.sparse import csr_matrix

# Create the MultiLabelBinarizer, configured to return a sparse matrix
mlb = MultiLabelBinarizer(sparse_output=True)

# Fit on the per-title name lists and build the indicator matrix
# (one column per distinct label found in 'primaryName_y')
sparse_binarized_data = mlb.fit_transform(df_merge_recom22['primaryName_y'])

# Wrap the sparse matrix in a sparse-backed DataFrame, with one
# 'acteur_<label>' column per class learned by the binarizer
binarized_df = pd.DataFrame.sparse.from_spmatrix(
    sparse_binarized_data, columns=[f'acteur_{actor}' for actor in mlb.classes_]
)

# Append the indicator columns to the working frame; both indexes are reset
# first so that rows are matched by position
df_merge_recom22 = pd.concat([df_merge_recom22.reset_index(drop=True), binarized_df.reset_index(drop=True)], axis=1)



In [None]:
df_merge_recom22.columns

In [None]:
def encodage_X(X, type='standard'):
  """Scale the numeric columns of ``X`` and one-hot encode its categorical ones.

  Args:
    X (DataFrame): Feature table to encode.
    type (str): ``'standard'`` selects ``StandardScaler``; any other value
      (the notebook passes ``'normalisation'``) selects ``MinMaxScaler``.

  Returns:
    tuple: ``(X_encoded, SN)`` where ``X_encoded`` holds the scaled numeric
    columns followed by the dummy columns, indexed like ``X``, and ``SN`` is
    the scaler fitted on the numeric columns (needed later to transform new
    rows the same way).
  """
  index = X.index
  X_num = X.select_dtypes('number')
  X_cat = X.select_dtypes(['object', 'category', 'string'])
  # Columns of any other dtype (e.g. datetimes) are selected by neither call
  # above and are therefore absent from the result.

  if type == 'standard':
    from sklearn.preprocessing import StandardScaler
    SN = StandardScaler()
  else:
    from sklearn.preprocessing import MinMaxScaler
    SN = MinMaxScaler()

  X_num_SN = pd.DataFrame(SN.fit_transform(X_num), columns=X_num.columns, index=index)

  X_cat_dummies = pd.get_dummies(X_cat)
  X_encoded = pd.concat([X_num_SN, X_cat_dummies], axis=1)

  # The original body ended without a return, so callers unpacking
  # `X_encoded, SN = encodage_X(...)` received None and failed.
  return X_encoded, SN


In [None]:
# Encode the feature table; a `type` other than 'standard' makes encodage_X
# use MinMaxScaler.
# NOTE(review): `X` is not assigned in any cell above this one in the notebook
# as shown — it first appears as `X = merge_filmR[caracteristiques]` further
# down. Confirm the intended execution order.
X_encoded, SN = encodage_X(X, type='normalisation')
X_encoded

In [None]:
agg_df

In [None]:
# Column-wise concatenation; pandas aligns the two frames on index labels.
# NOTE(review): X_encoded keeps X's index while agg_df was given a fresh range
# index by reset_index() — verify that the rows actually line up.
X_encoded_m = pd.concat([X_encoded,agg_df], axis=1)

In [None]:
def evaluate_k(X_encoded, k_range):
    """
    Evaluate several values of k with two metrics: the mean distance to the
    k nearest neighbours, and the silhouette score of a KMeans clustering
    with k clusters.

    Args:
        X_encoded (DataFrame): Encoded / scaled feature matrix.
        k_range (iterable of int): Values of k to test.

    Returns:
        tuple: (mean distances, silhouette scores), two lists with one entry
        per tested k. The silhouette entry is 0 for k <= 1, since the score
        needs at least 2 clusters.
    """
    from sklearn.metrics import silhouette_score
    from sklearn.cluster import KMeans

    avg_distances = []
    silhouette_scores = []

    for k in k_range:
        # Mean of all neighbour distances for this k. The query points are the
        # fitted points themselves, so each point's own zero distance is
        # normally part of the mean.
        model = NearestNeighbors(n_neighbors=k)
        model.fit(X_encoded)
        distances, _ = model.kneighbors(X_encoded)
        avg_distances.append(np.mean(distances))

        # Silhouette score of a KMeans clustering with k clusters.
        # The clustering is only fitted when its score is actually used;
        # previously a KMeans was also fitted for k=1 and then discarded.
        if k > 1:
            kmeans = KMeans(n_clusters=k, random_state=42)
            clusters = kmeans.fit_predict(X_encoded)
            silhouette_scores.append(silhouette_score(X_encoded, clusters))
        else:
            silhouette_scores.append(0)

    return avg_distances, silhouette_scores

In [None]:
# Range of k values to test
k_range = range(1, 21)  # k from 1 to 20

# Evaluate every k in the range
avg_distances, silhouette_scores = evaluate_k(X_encoded, k_range)

# Plot both metrics side by side to help choose k
plt.figure(figsize=(12, 5))

# Left panel: mean distance to the neighbours
plt.subplot(1, 2, 1)
plt.plot(k_range, avg_distances, 'bo-')
plt.xlabel('Nombre de voisins (k)')
plt.ylabel('Distance moyenne aux voisins')
plt.title('Distance moyenne en fonction de k')
plt.grid(True)

# Right panel: silhouette score
# NOTE(review): in evaluate_k the silhouette is computed for a KMeans with k
# clusters, so this x-axis is a cluster count although the label says
# neighbours — confirm the intended wording.
plt.subplot(1, 2, 2)
plt.plot(k_range[1:], silhouette_scores[1:], 'ro-')  # start at k=2; the k=1 entry is a placeholder 0
plt.xlabel('Nombre de voisins (k)')
plt.ylabel('Score de silhouette')
plt.title('Score de silhouette en fonction de k')
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Chosen value of k
k=6

# Build the euclidean nearest-neighbour model and fit it on the encoded features
model = NearestNeighbors(n_neighbors=k, metric='euclidean')
model.fit(X_encoded)

In [None]:
df_merge_recom22.sample(3)

In [None]:
# Display a random sample of 3 rows
df_merge_recom22.sample(3)

In [None]:
#Encodage
def encodage_X(X, type='standard'):
  """Encode a feature table: scaled numeric columns plus one-hot dummies.

  Args:
    X (DataFrame): Table to encode.
    type (str): 'standard' -> StandardScaler, anything else -> MinMaxScaler.

  Returns:
    tuple: (encoded DataFrame indexed like ``X``, fitted scaler).
  """
  row_labels = X.index
  numeric_part = X.select_dtypes('number')
  categorical_part = X.select_dtypes(['object', 'category', 'string'])
  # Other dtypes (dates, for instance) are picked up by neither selection.

  # Choose the scaler class, then instantiate it once.
  if type == 'standard':
    from sklearn.preprocessing import StandardScaler as scaler_cls
  else:
    from sklearn.preprocessing import MinMaxScaler as scaler_cls
  SN = scaler_cls()

  scaled_numeric = pd.DataFrame(
    SN.fit_transform(numeric_part),
    columns=numeric_part.columns,
    index=row_labels,
  )
  dummies = pd.get_dummies(categorical_part)

  return pd.concat([scaled_numeric, dummies], axis=1), SN

In [None]:
# Number of distinct genres. Only the last expression of a cell is displayed,
# so the count is printed explicitly.
# The actor set `tous_les_acteurs` is never built in this notebook (its
# construction is commented out), so the former `len(tous_les_acteurs)` line
# raised NameError and has been removed.
print(len(tous_les_genres))
df_merge_recom22.info()

In [None]:
# Exploratory cells. In each cell only the last expression is displayed;
# earlier bare expressions are evaluated and their value discarded.
df_merge_recom22.columns

In [None]:
# The first two lines produce no output; only info() prints.
type(df_merge_recom22['genres_y'].unique())
list(df_merge_recom22['genres_y'].unique())
df_merge_recom22.info()

In [None]:
# Displays the number of distinct genres_y strings (the list itself is not shown).
list(df_merge_recom22['genres_y'].unique())
len(list(df_merge_recom22['genres_y'].unique()))

In [None]:
# Missing-value count per column
df_merge_recom22.isna().sum()

In [None]:
# Distinct genres_y values, as an array
df_merge_recom22['genres_y'].unique()

In [None]:
# Same distinct values, as a Python list
list(df_merge_recom22['genres_y'].unique())

In [None]:
# Replace every missing value, in every column, with the string 'None'.
# A single frame-wide fillna already covers 'genres_y' (and 'startYear' when
# it is present). Unlike the former per-column fills, it does not raise
# KeyError when 'startYear' has been dropped earlier in the notebook.
df_merge_recom22 = df_merge_recom22.fillna('None')

In [None]:
# Disabled check for rows whose genres_y is the literal '\N' marker
#df_merge_recom22[df_merge_recom22['genres_y'] == '\\N']

In [None]:
# NOTE(review): `ast` is only referenced by the commented-out helper below.
import ast

In [None]:
# Split the comma-separated genres_y string into a list of genre names
df_merge_recom22['genres_liste'] = df_merge_recom22['genres_y'].apply(lambda x: x.split(","))

In [None]:
df_merge_recom22['genres_liste']

In [None]:
# Disabled helper: parse a string representation of a list, pass other values through
# def transfo_liste(x):
#   if isinstance(x, str):
#     return ast.literal_eval(x)
#   else:
#     return x

# -> Jo's function

In [None]:
# turn the genre column into a list (disabled)
#df_merge_recom22['genres_liste'] = df_merge_recom22['genres_liste'].apply(transfo_liste)
#df_merge_recom22.sample(5)
# force the genres_liste column to contain only lists

In [None]:
# Start from an empty genre set
tous_les_genres = set()

In [None]:
# Rebuild the set of every distinct genre found in genres_liste
tous_les_genres = set()
for genres in df_merge_recom22['genres_liste']:
  tous_les_genres.update(genres)

tous_les_genres
# set containing all the genres

In [None]:
len(tous_les_genres)

In [None]:
# One boolean 'genre_<genre>' column per genre: True when the row's list contains it
for genre in tous_les_genres:
  df_merge_recom22[f'genre_{genre}'] = df_merge_recom22['genres_liste'].apply(lambda x: genre in x)

df_merge_recom22.sample(3)

In [None]:
# frame with every genre as a boolean column

df_merge_recom22.head(5)

In [None]:
df_merge_recom22.info()

In [None]:
# Columns used as features by the similarity functions further down.
# NOTE(review): an earlier cell drops 'startYear' from df_merge_recom22 —
# confirm this column is still available where the list is used.
caracteristiques = ['titleType', 'startYear']

In [None]:
# Every boolean column of the working frame — presumably the 'genre_*' flags
# created above; any other boolean column would be included too.
X_genre = df_merge_recom22.select_dtypes(['bool'])
X_genre

In [None]:
# Independent copy of the boolean feature frame
X_genrecopy = X_genre.copy()

In [None]:
def encodage_X(X, type='standard'):
  """Return ``(X_encoded, SN)``: the encoded features and the fitted scaler.

  Numeric columns of ``X`` are rescaled (StandardScaler when ``type`` is
  'standard', MinMaxScaler for any other value); object / category / string
  columns are expanded with ``pd.get_dummies``. The encoded frame keeps the
  index of ``X`` and lists the numeric columns first.
  """
  original_index = X.index
  num_cols = X.select_dtypes('number')
  cat_cols = X.select_dtypes(['object', 'category', 'string'])
  # Columns of other dtypes (e.g. dates) are part of neither selection.

  wants_minmax = type != 'standard'
  if wants_minmax:
    from sklearn.preprocessing import MinMaxScaler
    SN = MinMaxScaler()
  else:
    from sklearn.preprocessing import StandardScaler
    SN = StandardScaler()

  scaled_values = SN.fit_transform(num_cols)
  pieces = [
    pd.DataFrame(scaled_values, columns=num_cols.columns, index=original_index),
    pd.get_dummies(cat_cols),
  ]
  X_encoded = pd.concat(pieces, axis=1)

  return X_encoded, SN

In [None]:
# Encode X; a `type` other than 'standard' makes encodage_X use MinMaxScaler.
# The fitted scaler is kept in SN so that new rows can be transformed later.
X_encoded, SN = encodage_X(X, type='normalisation')

In [None]:
def evaluate_k(X_encoded, k_range):
    """
    Score each candidate k with two metrics.

    For every k in ``k_range`` the function records the mean of the distances
    returned by a k-nearest-neighbour query on ``X_encoded``, and the
    silhouette score of a KMeans clustering with k clusters (0 when k <= 1).

    Args:
        X_encoded (DataFrame): Encoded feature matrix.
        k_range (iterable of int): Values of k to test.

    Returns:
        tuple: (mean distances, silhouette scores), two parallel lists.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    avg_distances, silhouette_scores = [], []

    for k in k_range:
        # Neighbour metric: mean over every returned distance
        knn = NearestNeighbors(n_neighbors=k).fit(X_encoded)
        neighbour_dist = knn.kneighbors(X_encoded)[0]
        avg_distances.append(np.mean(neighbour_dist))

        # Clustering metric: KMeans labels scored with the silhouette;
        # a single cluster cannot be scored, so 0 is recorded instead
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_encoded)
        score = silhouette_score(X_encoded, labels) if k > 1 else 0
        silhouette_scores.append(score)

    return avg_distances, silhouette_scores

In [None]:
# Display the movie subset.
merge_filmR

In [None]:
# Create a fresh, unfitted nearest-neighbour estimator. This rebinds `model`,
# so an estimator previously fitted under that name is replaced until fit()
# is called again.
model = NearestNeighbors(n_neighbors=k, metric='euclidean')

In [None]:
filmR = merge_filmR['imdb_id']
nom_film = 'tt0113101'


# Check whether the title exists in the movie subset: an empty result means
# it does not. The lookup uses `nom_film`; the previous hard-coded id carried
# a stray trailing "o" ('tt0113101o') and so could never match.
merge_filmR[merge_filmR['imdb_id'] == nom_film]

In [None]:
# Rows of the movie subset for the example id, reduced to the feature
# columns listed in `caracteristiques`.
caract_film = merge_filmR[merge_filmR['imdb_id'] == 'tt0113101']
caract_film = caract_film[caracteristiques]


caract_film


In [None]:
def film_similaire(nom_film):
  """Return the nearest neighbours of a movie identified by its IMDb id.

  Relies on notebook globals: ``merge_filmR`` (movie table),
  ``caracteristiques`` (feature columns), ``encodage_predict`` (encoder for
  new rows) and ``model`` (fitted nearest-neighbour estimator).

  Args:
    nom_film (str): Value looked up in ``merge_filmR['imdb_id']``.

  Returns:
    DataFrame | str: The rows of ``merge_filmR`` at the positions returned by
    ``model.kneighbors`` for the first matching row, with a fresh index; or a
    message string when the id is not in the dataset.
  """
  # Unknown id: report it with a message instead of raising.
  if nom_film not in merge_filmR['imdb_id'].values:
    return f"Le film numero {nom_film} n'est pas dans le dataset."

  # A former second guard tested the first row of the whole table rather than
  # the requested film (leftover from another project); it has been removed
  # together with the unused `filmC` lookup.
  caract_film = merge_filmR[merge_filmR['imdb_id'] == nom_film][caracteristiques]

  caract_film_encoded = encodage_predict(caract_film)

  # `indices` are row positions in the matrix `model` was fitted on.
  # NOTE(review): assumes that matrix was built from merge_filmR, row for row
  # — confirm against the cell that fits `model`.
  _, indices = model.kneighbors(caract_film_encoded)

  return merge_filmR.iloc[indices[0]].reset_index(drop=True)

In [None]:
# Manual, step-by-step version of the start of encodage_X: keep the index and
# split X into numeric and categorical columns.
# NOTE(review): `X` is assigned two cells below; read top to bottom, these
# cells use it before it is defined — confirm the intended execution order.
index = X.index
X_num = X.select_dtypes('number')
X_cat = X.select_dtypes(['object', 'category', 'string'])

In [None]:
# Fit a min-max scaler on the numeric columns and keep the result as a frame
# indexed like X. This rebinds the global scaler SN.
from sklearn.preprocessing import MinMaxScaler
SN = MinMaxScaler()
X_num_SN = pd.DataFrame(SN.fit_transform(X_num), columns=X_num.columns, index=index)

In [None]:
X = merge_filmR[caracteristiques]
# keep only the columns listed in caracteristiques

# Encode X and keep the fitted scaler (MinMaxScaler, since type is not 'standard')
X_encoded, SN = encodage_X(X, type='normalisation')

In [None]:
X_encoded

In [None]:
X

In [None]:
# Distinct title types present in the feature table
X['titleType'].unique()

In [None]:
df_merge_recom22.info()

In [None]:
def evaluate_k(X_encoded, k_range):
    """
    Compute, for each k, the mean k-nearest-neighbour distance and the
    silhouette score of a k-cluster KMeans.

    Args:
        X_encoded (DataFrame): Encoded feature matrix.
        k_range (iterable of int): Values of k to test.

    Returns:
        tuple: (mean distances, silhouette scores); the silhouette entry is 0
        whenever k <= 1.
    """
    from sklearn.metrics import silhouette_score
    from sklearn.cluster import KMeans

    def _mean_neighbour_distance(k):
        # Mean over all distances of a k-neighbour query on the fitted points
        finder = NearestNeighbors(n_neighbors=k)
        finder.fit(X_encoded)
        dist_matrix, _ = finder.kneighbors(X_encoded)
        return np.mean(dist_matrix)

    def _silhouette(k):
        # KMeans is fitted for every k; its labels are only scored when k > 1
        assignment = KMeans(n_clusters=k, random_state=42).fit_predict(X_encoded)
        if k <= 1:
            return 0
        return silhouette_score(X_encoded, assignment)

    # Tuple elements are evaluated left to right: neighbours first, then clustering
    per_k = [(_mean_neighbour_distance(k), _silhouette(k)) for k in k_range]

    avg_distances = [pair[0] for pair in per_k]
    silhouette_scores = [pair[1] for pair in per_k]
    return avg_distances, silhouette_scores

In [None]:
# The evaluation call is disabled; only the range of candidate k is defined.
#evaluate_k(X_encoded, k_range)
k_range = range(1, 21)  # k values from 1 to 20

In [None]:
X_encoded

In [None]:
# Evaluation call (disabled)
#evaluate_k(X_encoded, k_range)

In [None]:
# Chosen number of neighbours
k=6

# Build the euclidean nearest-neighbour model and fit it on the encoded
# features. The fit was previously commented out, which left `model` unfitted
# so that any later `model.kneighbors(...)` call raised NotFittedError.
model = NearestNeighbors(n_neighbors=k, metric='euclidean')
model.fit(X_encoded)

In [None]:
filmR = merge_filmR['imdb_id']
nom_film = 'tt0113101'


# Check whether the title exists in the movie subset: an empty result means
# it does not. The lookup uses `nom_film`; the previous hard-coded id carried
# a stray trailing "o" ('tt0113101o') and so could never match.
merge_filmR[merge_filmR['imdb_id'] == nom_film]

In [None]:
# Rows of the movie subset for the example id, reduced to the feature
# columns listed in `caracteristiques`.
caract_film = merge_filmR[merge_filmR['imdb_id'] == 'tt0113101']
caract_film = caract_film[caracteristiques]


caract_film

In [None]:
def encodage_predict(df_a_predire):
  """Encode new rows with the same column layout as the global ``X_encoded``.

  Relies on two notebook globals: ``SN``, the scaler already fitted by
  ``encodage_X``, and ``X_encoded``, whose columns define the target layout.

  Args:
    df_a_predire (DataFrame): Rows to encode; numeric columns are passed to
      ``SN.transform`` and object / category / string columns to
      ``pd.get_dummies``.

  Returns:
    DataFrame: One row per input row (fresh range index), with exactly the
    columns of ``X_encoded`` in the same order. Columns that the input rows
    do not produce are filled with False; columns unknown to ``X_encoded``
    are dropped.
  """
  X_num = df_a_predire.select_dtypes('number')
  X_cat = df_a_predire.select_dtypes(['object', 'category', 'string'])

  # Reuse the fitted scaler (transform, not fit_transform) so new rows are
  # scaled exactly like the training data. The frame built from the returned
  # array already has a fresh range index.
  X_num_SN = pd.DataFrame(SN.transform(X_num), columns=X_num.columns)

  X_cat_dummies = pd.get_dummies(X_cat).reset_index(drop=True)
  df_predict = pd.concat([X_num_SN, X_cat_dummies], axis=1)

  # Align on the training columns in one step. This replaces the manual
  # "empty frame + fillna(False) + copy loop", and avoids building an
  # all-object frame first.
  return df_predict.reindex(columns=X_encoded.columns, fill_value=False)

In [None]:
def film_similaires(imdb_id):
  """Return the rows of ``df_merge_recom22`` nearest to the given title.

  Relies on notebook globals: ``df_merge_recom22``, ``caracteristiques``,
  ``encodage_predict`` and the fitted ``model``.

  Args:
    imdb_id: Value looked up in ``df_merge_recom22['imdb_id']``.

  Returns:
    DataFrame | str: The rows at the positions returned by
    ``model.kneighbors`` for the first matching row (fresh index), or a
    message string when the id is unknown or falsy.
  """
  catalogue_ids = df_merge_recom22['imdb_id']

  # Guard 1: the id must be present in the dataset
  if imdb_id not in catalogue_ids.values:
    return f"Le Film {imdb_id } n'est pas dans le dataset."

  # All rows carrying this id
  film = df_merge_recom22[catalogue_ids == imdb_id]

  # Guard 2: reject a falsy id value on the first matching row
  first_id = film['imdb_id'].iloc[0]
  if not first_id:
    return f"{imdb_id} n'est pas un Film."

  # Encode the feature columns of the matching rows like the training matrix
  caract_film_encoded = encodage_predict(film[caracteristiques])

  _, indices = model.kneighbors(caract_film_encoded)

  # NOTE(review): `indices` are positions in the matrix `model` was fitted on;
  # elsewhere the notebook builds that matrix from merge_filmR, so positional
  # lookup in df_merge_recom22 may return unrelated rows — confirm.
  voisins = df_merge_recom22.iloc[indices[0]]
  return voisins.reset_index(drop=True)

In [None]:
# Look up the neighbours of one example title.
film_similaires('tt0113101')

In [None]:
# Encode the whole working frame with the prediction-time encoder.
# NOTE(review): this passes every column, not only `caracteristiques`;
# confirm the scaler SN was fitted on the same numeric columns.
encodage_predict(df_merge_recom22)