In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [178]:
data=pd.read_csv(r'dataset-final - 2000.csv')

In [179]:
# Extraire les lignes avec des NaN dans RATING
data_missing_rating = data[data['RATING_SUR_5'].isnull()]

# Supprimer les lignes avec des NaN dans RATING du DataFrame principal
data = data.dropna(subset=['RATING_SUR_5'])

# Afficher le résultat
print(f"Nombre de lignes avec des NaN extraites : {len(data_missing_rating)}")
print(f"Nouvelle taille du dataset nettoyé : {len(data)}")


Nombre de lignes avec des NaN extraites : 153
Nouvelle taille du dataset nettoyé : 2314


In [180]:
data['SYNOPSIS'] = data['SYNOPSIS'].fillna("")
data['REVIEWS'] = data['REVIEWS'].fillna("")
data['CAST'] = data['CAST'].fillna("")
data['PRODUCERS'] = data['PRODUCERS'].fillna("")
data['GENRES'] = data['GENRES'].fillna("")


POUR 3000

In [181]:
vectorizer_synopsis = TfidfVectorizer(max_features=800)  # Limiter le nombre de caractéristiques à 500
vectorizer_reviews = TfidfVectorizer(max_features=1000)
vectorizer_genres = TfidfVectorizer(max_features=500)  # Limiter le nombre de caractéristiques à 500
vectorizer_cast = TfidfVectorizer(max_features=500)
vectorizer_producers = TfidfVectorizer(max_features=500)
genre_features = vectorizer_genres.fit_transform(data['GENRES']).toarray()
cast_features = vectorizer_cast.fit_transform(data['CAST']).toarray()
producers_features = vectorizer_producers.fit_transform(data['PRODUCERS']).toarray()
synopsis_features = vectorizer_synopsis.fit_transform(data['SYNOPSIS']).toarray()
reviews_features = vectorizer_reviews.fit_transform(data['REVIEWS']).toarray()

In [183]:
scaler = StandardScaler()
duration_features = scaler.fit_transform(data[['DURATION_MIN']])


In [184]:
X = np.hstack([synopsis_features, reviews_features, genre_features, 
               cast_features, producers_features, duration_features])
y = data['RATING_SUR_5']
# Diviser les données en train et test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [121]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.experimental import enable_iterative_imputer  # Activer les outils expérimentaux
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [86]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'min_samples_leaf': [10, 20, 50],
}

grid_search = GridSearchCV(
    estimator=HistGradientBoostingRegressor(),
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Meilleurs hyperparamètres :", grid_search.best_params_)


Meilleurs hyperparamètres : {'learning_rate': 0.05, 'max_depth': 10, 'max_iter': 300, 'min_samples_leaf': 20}


In [87]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")


Mean Squared Error (MSE): 0.19491713941878638
Mean Absolute Error (MAE): 0.33273580354537247


In [168]:

# Liste des modèles à tester
models = [
    ('Hist Gradient Boosting', HistGradientBoostingRegressor()),
]

# Entraîner et évaluer chaque modèle
best_model = None
best_score = float('inf')  # Initialiser avec une valeur très élevée pour comparaison
for name, model in models:
    model.fit(X_train, y_train)  # Entraîner le modèle
    y_pred = model.predict(X_test)  # Prédictions
    
    # Évaluer la performance
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'{name} Results:')
    print(f'Mean Squared Error: {mse:.4f}')
    print(f'Mean Absolute Error: {mae:.4f}')
    print(f'R² Score: {r2:.4f}')
    print('-' * 40)
    
    # Comparer pour trouver le meilleur modèle
    if mse < best_score:
        best_score = mse
        best_model = model

# Afficher le meilleur modèle trouvé
print(f'Best Model: {best_model}')


Hist Gradient Boosting Results:
Mean Squared Error: 0.1989
Mean Absolute Error: 0.3384
R² Score: 0.6717
----------------------------------------
Best Model: HistGradientBoostingRegressor()


In [169]:
import joblib

# Chemin pour sauvegarder le modèle
model_path = r"C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_3000.pkl"

# Sauvegarder le modèle avec joblib
joblib.dump(best_model, model_path)
print(f"Modèle sauvegardé sous {model_path}")


Modèle sauvegardé sous C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_3000.pkl


In [63]:
# Remplacer les NaN par une chaîne vide
data_missing_rating['SYNOPSIS'] = data_missing_rating['SYNOPSIS'].fillna("")
data_missing_rating['REVIEWS'] = data_missing_rating['REVIEWS'].fillna("")
data_missing_rating['GENRES'] = data_missing_rating['GENRES'].fillna("")
data_missing_rating['CAST'] = data_missing_rating['CAST'].fillna("")
data_missing_rating['PRODUCERS'] = data_missing_rating['PRODUCERS'].fillna("")


In [64]:
# Générer les caractéristiques pour les données manquantes
synopsis_features_missing = vectorizer_synopsis.transform(data_missing_rating['SYNOPSIS']).toarray()
reviews_features_missing = vectorizer_reviews.transform(data_missing_rating['REVIEWS']).toarray()
genre_features_missing = vectorizer_genres.transform(data_missing_rating['GENRES']).toarray()
cast_features_missing = vectorizer_cast.transform(data_missing_rating['CAST']).toarray()
producers_features_missing = vectorizer_producers.transform(data_missing_rating['PRODUCERS']).toarray()
duration_features_missing = scaler.transform(data_missing_rating[['DURATION_MIN']])

# Concatenation des caractéristiques
X_missing = np.hstack([
    synopsis_features_missing, 
    reviews_features_missing, 
    genre_features_missing, 
    cast_features_missing, 
    producers_features_missing, 
    duration_features_missing
])

# Vérifier et aligner les dimensions si nécessaire
if X_missing.shape[1] != X_train.shape[1]:
    diff = X_train.shape[1] - X_missing.shape[1]
    X_missing = np.hstack([X_missing, np.zeros((X_missing.shape[0], diff))])

# Faire des prédictions
y_pred_missing = best_model.predict(X_missing)
data_missing_rating['RATING_SUR_5'] = y_pred_missing

# Fusionner les données
data = pd.concat([data, data_missing_rating])


In [None]:
data

In [66]:
data['RATING_SUR_5'].isnull().count()

np.int64(2467)

In [67]:
data.to_csv(r'dataset-final.csv')

Ensemble learning 

In [170]:
import joblib

model1 = joblib.load(r"C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_2000_V1.pkl")
model2 = joblib.load(r"C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_2000_V2.pkl")
model3 = joblib.load(r"C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_3000.pkl")


In [None]:
mse1 = mean_squared_error(y_test, model1.predict(X_test))
mse2 = mean_squared_error(y_test, model2.predict(X_test))


print("MSE du modèle 1 :", mse1)
print("MSE du modèle 2 :", mse2)


MSE du modèle 1 : 0.1628395510257533
MSE du modèle 2 : 0.49721034762499494


In [171]:
mse3 = mean_squared_error(y_test, model3.predict(X_test))
print("MSE du modèle 3 :", mse3)

MSE du modèle 3 : 0.19886851672789724


In [185]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

# Créer un empilement des modèles
stacked_model = StackingRegressor(
    estimators=[
        ('model1', model1),
        ('model2', model2),
        ('model3', model3)
    ],
    final_estimator=LinearRegression()  # Modèle final
)

# Entraîner le modèle empilé
stacked_model.fit(X_train, y_train)

# Évaluer le modèle empilé
stacked_mse = mean_squared_error(y_test, stacked_model.predict(X_test))
print("MSE du modèle empilé :", stacked_mse)


MSE du modèle empilé : 0.1607921624864349


In [190]:
mae = mean_absolute_error(y_test, stacked_model.predict(X_test))
r2 = r2_score(y_test, stacked_model.predict(X_test))

print(f'Mean Absolute Error: {mae:.4f}')
print(f'R² Score: {r2:.4f}')
 

Mean Absolute Error: 0.3035
R² Score: 0.7445


In [None]:
model12 = joblib.load(r"C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_ensemble_12.pkl")
model16 = joblib.load(r"C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_ensemble_16.pkl")


In [192]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# Créer un empilement des modèles
stacked_model = StackingRegressor(
    estimators=[
        ('model1', model12),
        ('model2', model16),
    ],
    final_estimator=LinearRegression()  # Modèle final
)

# Entraîner le modèle empilé
stacked_model.fit(X_train, y_train)

# Évaluer le modèle empilé
stacked_mse = mean_squared_error(y_test, stacked_model.predict(X_test))
print("MSE du modèle empilé :", stacked_mse)


MSE du modèle empilé : 0.16061861732244032


In [186]:
import joblib

# Chemin pour sauvegarder le modèle
model_path = r"C:\Users\Aycha\Desktop\M2_BDIA\NLP\Projet_movie\final\model\modele_ensemble_12.pkl"

# Sauvegarder le modèle avec joblib
joblib.dump(stacked_model, model_path)
print(f"Modèle sauvegardé sous {stacked_model}")

Modèle sauvegardé sous StackingRegressor(estimators=[('model1', HistGradientBoostingRegressor()),
                              ('model2', HistGradientBoostingRegressor()),
                              ('model3', HistGradientBoostingRegressor())],
                  final_estimator=LinearRegression())
