In [17]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aycha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aycha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
df = pd.read_csv('dataset-final.csv')

In [19]:
df

Unnamed: 0,MOVIE_NAME,SYNOPSIS,YEAR,CAST,GENRES,PRODUCERS,REVIEWS,DURATION_MIN,RATING_SUR_5,MAJORITY_SENTIMENT,MOST_FREQUENT_SENTIMENT,Rating_Category
0,wicked,A cul-de-sac in an oppressive suburb becomes a...,1998.0,"Julia Stiles, William R. Moses, Patrick Muldoo...",Thriller,Frank Beddor; Greg Steinberg,"['one unhinged viewing experience life', 'dont...",88.0,2.64,"['neutral', 'positive', 'neutral', 'neutral', ...",positive,Negative
1,gladiator-ii,Years after witnessing the death of the revere...,2024.0,"Paul Mescal, Denzel Washington, Pedro Pascal, ...",Drama; Action; Adventure,Michael A. Pruss; Winston Azzopardi; David Fra...,"['review may contain spoiler handle truth', 'c...",148.0,3.37,"['positive', 'neutral', 'neutral', 'negative',...",neutral,Neutral
2,moana-2,After receiving an unexpected call from her wa...,2024.0,"Auliʻi Cravalho, Dwayne Johnson, Hualālai Chun...",Comedy; Animation; Adventure; Family,Christina Chen; Yvett Merino Flores,"['water looked worse original', 'schaffrillas ...",100.0,2.87,"['negative', 'positive', 'positive', 'neutral'...",positive,Negative
3,the-substance,A fading celebrity decides to use a black mark...,2024.0,"Demi Moore, Margaret Qualley, Dennis Quaid, Ed...",Horror; Science Fiction,Coralie Fargeat; Eric Fellner; Tim Bevan,"['mama girl inside', 'review may contain spoil...",141.0,3.85,"['neutral', 'positive', 'positive', 'neutral',...",positive,Positive
4,our-little-secret,After discovering their significant others are...,2024.0,"Lindsay Lohan, Ian Harding, Kristin Chenoweth,...",Drama; Romance; Comedy,Mike Elliott; Lisa Gooding,['dating much younger woman close enough welco...,99.0,2.36,"['positive', 'negative', 'neutral', 'positive'...",positive,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...
2513,18 to Party,"Teens grapple with a spate of recent suicides,...",2020,"Jeff Roda (Director), Alivia Clark (Amy), Tann...",Comedy,,['tim cogshell loved angsty little period dram...,80.0,,['positive'],positive,Negative
2514,1945,A village is forced to face up to its ill-gott...,2017,"Ferenc Török (Director), Péter Rudolf (Szentes...",Drama,,['leslie felperin sombre accomplished somewhat...,91.0,,['positive'],positive,Negative
2515,1985,When an adventurous teen discovers a secret ma...,2016,"Kang Vang (Director), Chang Yang (Billy aka Be...",Comedy; Drama; Adventure,,[''],114.0,,['neutral'],neutral,Negative
2516,1992,Martin is 17 and spends all day recording ever...,30m,"Anthony Doncque (Director), Mathieu Dessertine...",Comedy; Drama,,[''],,,['neutral'],neutral,Negative


In [20]:
# Combiner les colonnes pertinentes en une seule colonne de texte
df['Combined_Text'] = df['REVIEWS'] + "/" + df['SYNOPSIS'] + "/" + df['GENRES'] + "/" + df['CAST'] + "/" + df['PRODUCERS'].fillna("")


In [21]:
df['Combined_Text'] = df['Combined_Text'].str.replace(r'/+', '/', regex=True).str.strip('/')

In [22]:
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    if pd.isnull(text):
        return ""
    tokens = word_tokenize(text.lower())  # Tokenisation et mise en minuscule
    tokens = [word for word in tokens if word.isalnum()]  # Supprimer la ponctuation
    tokens = [word for word in tokens if word not in stop_words]  # Supprimer les stopwords
    return " ".join(tokens)

df['Processed_Text'] = df['Combined_Text'].apply(preprocess_text)


In [23]:
df['Normalized_Duration'] = df['DURATION_MIN'] / df['DURATION_MIN'].max()

In [24]:
from scipy.sparse import hstack
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_text = vectorizer.fit_transform(df['Processed_Text'])

# Ajouter les colonnes numériques
numeric_features = df[['Normalized_Duration']].fillna(0)
X = hstack([X_text, numeric_features])


In [25]:
y = df['Rating_Category'].map({'Positive': 2, 'Neutral': 1, 'Negative': 0})

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


In [27]:
# Rééquilibrer les classes
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'class_weight': ['balanced', None]
}
grid_search = GridSearchCV(SVC(), param_grid_svm, cv=3, scoring='f1_macro')
grid_search.fit(X_train, y_train)
print("Best Parameters for SVM:", grid_search.best_params_)


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

def evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
        "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
        "Naive Bayes": MultinomialNB(),
        "SVM": SVC(kernel='linear', class_weight='balanced',C=1, probability=True),
        "XGBoost": XGBClassifier(scale_pos_weight=1, use_label_encoder=False),
        "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
    }

    results = {}
    
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'])
        
        results[name] = {
            "accuracy": accuracy,
            "classification_report": report
        }
        
        print(f"\n{name} Results:")
        print(f"Accuracy: {accuracy}")
        print(f"\nClassification Report:\n{report}")
    
    return results


In [30]:
results = evaluate_models(X_train, X_test, y_train, y_test)


Training Logistic Regression...

Logistic Regression Results:
Accuracy: 0.6957671957671958

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.71      0.69       102
     Neutral       0.58      0.38      0.46        92
    Positive       0.74      0.85      0.79       184

    accuracy                           0.70       378
   macro avg       0.67      0.64      0.65       378
weighted avg       0.68      0.70      0.68       378


Training Random Forest...

Random Forest Results:
Accuracy: 0.6587301587301587

Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.60      0.69       102
     Neutral       0.86      0.07      0.12        92
    Positive       0.61      0.99      0.76       184

    accuracy                           0.66       378
   macro avg       0.76      0.55      0.52       378
weighted avg       0.73      0.66      0.58       378


Training Naive Ba

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVM Results:
Accuracy: 0.6851851851851852

Classification Report:
              precision    recall  f1-score   support

    Negative       0.64      0.73      0.68       102
     Neutral       0.54      0.35      0.42        92
    Positive       0.75      0.83      0.79       184

    accuracy                           0.69       378
   macro avg       0.65      0.63      0.63       378
weighted avg       0.67      0.69      0.67       378


Training XGBoost...


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.




XGBoost Results:
Accuracy: 0.6931216931216931

Classification Report:
              precision    recall  f1-score   support

    Negative       0.86      0.72      0.78       102
     Neutral       0.55      0.20      0.29        92
    Positive       0.66      0.93      0.77       184

    accuracy                           0.69       378
   macro avg       0.69      0.61      0.61       378
weighted avg       0.68      0.69      0.66       378


Training Neural Network...

Neural Network Results:
Accuracy: 0.6640211640211641

Classification Report:
              precision    recall  f1-score   support

    Negative       0.70      0.58      0.63       102
     Neutral       0.53      0.23      0.32        92
    Positive       0.67      0.93      0.78       184

    accuracy                           0.66       378
   macro avg       0.63      0.58      0.58       378
weighted avg       0.65      0.66      0.63       378



In [31]:
results2 = evaluate_models(X_train_resampled, X_test,  y_train_resampled, y_test)


Training Logistic Regression...

Logistic Regression Results:
Accuracy: 0.6772486772486772

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.65      0.66       102
     Neutral       0.54      0.29      0.38        92
    Positive       0.71      0.89      0.79       184

    accuracy                           0.68       378
   macro avg       0.64      0.61      0.61       378
weighted avg       0.66      0.68      0.65       378


Training Random Forest...

Random Forest Results:
Accuracy: 0.6402116402116402

Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.59      0.69       102
     Neutral       0.40      0.11      0.17        92
    Positive       0.61      0.93      0.74       184

    accuracy                           0.64       378
   macro avg       0.61      0.54      0.53       378
weighted avg       0.62      0.64      0.59       378


Training Naive Ba

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.




XGBoost Results:
Accuracy: 0.6693121693121693

Classification Report:
              precision    recall  f1-score   support

    Negative       0.83      0.73      0.77       102
     Neutral       0.40      0.22      0.28        92
    Positive       0.67      0.86      0.75       184

    accuracy                           0.67       378
   macro avg       0.63      0.60      0.60       378
weighted avg       0.65      0.67      0.64       378


Training Neural Network...

Neural Network Results:
Accuracy: 0.6587301587301587

Classification Report:
              precision    recall  f1-score   support

    Negative       0.73      0.55      0.63       102
     Neutral       0.44      0.53      0.48        92
    Positive       0.76      0.78      0.77       184

    accuracy                           0.66       378
   macro avg       0.64      0.62      0.63       378
weighted avg       0.67      0.66      0.66       378



Autre methodes

In [34]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aycha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aycha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aycha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenisation
    tokens = word_tokenize(text.lower())
    # Suppression des stopwords et lemmatisation
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(tokens)

# Remplacer les NaN par une chaîne vide
df['SYNOPSIS'] = df['SYNOPSIS'].fillna("")
df['CAST'] = df['CAST'].fillna("")
df['PRODUCERS'] = df['PRODUCERS'].fillna("")

# Appliquer la fonction preprocess_text sur les colonnes concernées
df['SYNOPSIS'] = df['SYNOPSIS'].apply(preprocess_text)
df['CAST'] = df['CAST'].apply(preprocess_text)
df['PRODUCERS'] = df['PRODUCERS'].apply(preprocess_text)


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
synopsis_tfidf = vectorizer.fit_transform(df['SYNOPSIS'])
cast_tfidf = vectorizer.fit_transform(df['CAST'])
producers_tfidf = vectorizer.fit_transform(df['PRODUCERS'])


In [38]:
# Colonnes numériques
numerical_features = df[['YEAR', 'DURATION_MIN', 'RATING_SUR_5']].values


In [43]:
def convert_duration_to_minutes(duration):
    if isinstance(duration, str):
        # Si la durée est exprimée en heures ('h')
        if 'h' in duration:
            hours = int(duration.replace('h', ''))  # Extraire le nombre d'heures
            return hours * 60  # Convertir en minutes
        # Si la durée est exprimée en minutes ('m')
        elif 'm' in duration:
            return int(duration.replace('m', ''))  # Extraire et retourner les minutes
    return 0  # Si la valeur est mal formatée, on retourne 0

# Appliquer la conversion de la colonne DURATION_MIN
df['DURATION_MIN'] = df['DURATION_MIN'].apply(convert_duration_to_minutes)

# Convertir YEAR en valeurs numériques
df['YEAR'] = pd.to_numeric(df['YEAR'], errors='coerce')


In [52]:
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Préparation des données
df[['YEAR', 'DURATION_MIN', 'RATING_SUR_5']] = df[['YEAR', 'DURATION_MIN', 'RATING_SUR_5']].fillna(0)
df[['YEAR', 'DURATION_MIN', 'RATING_SUR_5']] = df[['YEAR', 'DURATION_MIN', 'RATING_SUR_5']].astype(float)

# TF-IDF pour les colonnes textuelles
tfidf_vectorizer = TfidfVectorizer()
synopsis_tfidf = tfidf_vectorizer.fit_transform(df['SYNOPSIS'])
cast_tfidf = tfidf_vectorizer.fit_transform(df['CAST'])
producers_tfidf = tfidf_vectorizer.fit_transform(df['PRODUCERS'])

# Encodage des labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Rating_Category'])

# Caractéristiques numériques converties en matrice sparse
numerical_features = df[['YEAR', 'DURATION_MIN']].values
numerical_features_sparse = csr_matrix(numerical_features)

# Combinaison des matrices
X = scipy.sparse.hstack([synopsis_tfidf, cast_tfidf, producers_tfidf, numerical_features_sparse])

# Diviser en train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


In [54]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Dictionnaire pour stocker les résultats
model_results = {}

# Fonction pour entraîner un modèle et évaluer ses performances
def train_and_evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    # Appliquer SMOTE pour équilibrer les classes dans le jeu d'entraînement
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    
    # Entraîner le modèle sur les données rééchantillonnées
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted_f1 = report['weighted avg']['f1-score']
    
    # Stocker les résultats
    model_results[model_name] = {'accuracy': accuracy, 'weighted_f1': weighted_f1}
    
    # Afficher les résultats pour chaque modèle
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(model.get_params())
    
# Définir les modèles à évaluer
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'Neural Network': MLPClassifier(max_iter=500)
}

# Entraîner et évaluer chaque modèle
for model_name, model in models.items():
    train_and_evaluate_model(model, model_name, X_train, X_test, y_train, y_test)

# Sélectionner le meilleur modèle selon l'accuracy
best_model_name = max(model_results, key=lambda name: model_results[name]['accuracy'])
best_model = model_results[best_model_name]

# Résumé du meilleur modèle
print(f"\nBest Model: {best_model_name}")
print(f"Accuracy: {best_model['accuracy']:.4f}")
print(f"Classification Report for Best Model:\n{best_model['report']}")
print("Best Model Parameters:")
for param, value in best_model['parameters'].items():
    print(f"  {param}: {value}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Results:
Accuracy: 0.6772
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.74      0.75       102
           1       0.49      0.26      0.34        92
           2       0.68      0.85      0.76       184

    accuracy                           0.68       378
   macro avg       0.64      0.62      0.62       378
weighted avg       0.66      0.68      0.65       378

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Random Forest Results:
Accuracy: 0.6455
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.75      0.70       102
           1       0.71      0.16      0.27        92
           2       0.63      0.83     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVM Results:
Accuracy: 0.2540
Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.09      0.14       102
           1       0.25      0.95      0.39        92
           2       0.00      0.00      0.00       184

    accuracy                           0.25       378
   macro avg       0.20      0.34      0.18       378
weighted avg       0.15      0.25      0.13       378

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Parameters: { "use_label_encoder" } are not used.




XGBoost Results:
Accuracy: 0.6376
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.68      0.71       102
           1       0.42      0.27      0.33        92
           2       0.65      0.80      0.72       184

    accuracy                           0.64       378
   macro avg       0.61      0.58      0.59       378
weighted avg       0.62      0.64      0.62       378

{'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight'

KeyError: 'report'