In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin

# Download the NLTK resources used by the text-cleaning step.
# 'punkt_tab' is what word_tokenize looks up on NLTK >= 3.9 (it replaced the
# pickled 'punkt' models); 'punkt' is still fetched for older NLTK releases.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))

# Load the data from the CSV files
train_data = pd.read_csv("HAI817_Projet_train.csv")
test_data = pd.read_csv("HAI817_Projet_test.csv")

# Concatenate both files into one frame with a fresh 0..n-1 index
df = pd.concat([train_data, test_data], ignore_index=True)


In [None]:
# Drop the identifier columns, which are not used as features.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
df = df.drop(columns=['ID', 'public_id'], errors='ignore')

# Drop rows where BOTH 'title' and 'text' are missing (how='all'); rows that
# miss only one of the two are kept.
df = df.dropna(subset=['title', 'text'], how='all')



In [None]:
# Number of rows per 'our rating' class (displayed as the cell output)
df['our rating'].value_counts()

In [None]:
# Bar chart of the class distribution, most frequent class first
class_counts = df['our rating'].value_counts()
plt.figure(figsize=(8, 6))
sns.countplot(x='our rating', data=df, order=class_counts.index)
plt.xlabel('Classe')
plt.ylabel('Nombre')
plt.title('Répartition des classes')
plt.show()

In [None]:
# Text-cleaning function
def MyCleanText(X, lowercase=True, removestopwords=False, removedigit=False, getstemmer=False, getlemmatisation=False):
    """Normalize one document into a space-separated string of tokens.

    Args:
        X: The raw document. It is converted with ``str()``; ``None`` and float
            NaN are treated as an empty document.
        lowercase: Lower-case every token.
        removestopwords: Drop tokens found in the module-level ``stop_words`` set.
        removedigit: Drop tokens made only of digits.
        getstemmer: Apply Porter stemming to each token.
        getlemmatisation: Apply WordNet lemmatization to each token.

    Returns:
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    # str(None) / str(nan) would otherwise inject the literal tokens
    # "none" / "nan" into the vocabulary.
    if X is None or (isinstance(X, float) and X != X):
        return ''

    sentence = str(X)
    # Replace every character that is neither a word character nor whitespace.
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    # Remove isolated single letters. Word boundaries are used rather than
    # surrounding whitespace so that letters at the very start/end of the text
    # and runs of consecutive single letters ("a b c") are removed as well.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)
    # Collapse runs of whitespace.
    sentence = re.sub(r'\s+', ' ', sentence)
    tokens = word_tokenize(sentence)

    if lowercase:
        tokens = [token.lower() for token in tokens]

    # After the first substitution the only string.punctuation character that
    # can remain is '_' (it is matched by \w); strip it from the tokens.
    table = str.maketrans('', '', string.punctuation)
    words = [token.translate(table) for token in tokens]
    # Also discards tokens left empty by the translation above.
    words = [word for word in words if word.isalnum()]

    if removedigit:
        words = [word for word in words if not word.isdigit()]

    if removestopwords:
        words = [word for word in words if word not in stop_words]

    if getlemmatisation:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    if getstemmer:
        ps = PorterStemmer()
        words = [ps.stem(word) for word in words]

    return ' '.join(words)


In [None]:
# Text-normalization transformer
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that applies ``MyCleanText`` to every document.

    Each constructor argument is stored unchanged under the same attribute
    name, so the ``get_params`` / ``set_params`` inherited from
    ``BaseEstimator`` expose exactly these five options and reject unknown
    parameter names (e.g. a mistyped grid-search key).

    Args:
        removestopwords: Forwarded to ``MyCleanText``.
        lowercase: Forwarded to ``MyCleanText`` (note: defaults to False here).
        removedigit: Forwarded to ``MyCleanText``.
        getstemmer: Forwarded to ``MyCleanText``.
        getlemmatisation: Forwarded to ``MyCleanText``.
    """

    def __init__(self, removestopwords=False, lowercase=False, removedigit=False, getstemmer=False, getlemmatisation=False):
        self.lowercase = lowercase
        self.getstemmer = getstemmer
        self.removestopwords = removestopwords
        self.getlemmatisation = getlemmatisation
        self.removedigit = removedigit

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each document in ``X``.

        ``X`` is only iterated, never modified, so any iterable of documents
        (Series, list, tuple, ...) is accepted.
        """
        return [
            MyCleanText(
                text,
                lowercase=self.lowercase,
                getstemmer=self.getstemmer,
                removestopwords=self.removestopwords,
                getlemmatisation=self.getlemmatisation,
                removedigit=self.removedigit,
            )
            for text in X
        ]

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y, **fit_params).transform(X)


In [None]:

# Classifiers and their hyperparameters. The values were found with the
# GridSearch pipelines kept further down in this notebook.
# random_state is fixed on every estimator that relies on randomness so the
# reported scores are reproducible from one run to the next (the SMOTE step
# and the train/test splits already use 42).
classifiers = {
    'SVC': SVC(C=300, gamma='scale', kernel='linear'),
    'LogisticRegression': LogisticRegression(C=100, penalty='l2', solver='liblinear', random_state=42),
    'GradientBoosting': GradientBoostingClassifier(learning_rate=0.2, n_estimators=300, random_state=42),
    'KNN': KNeighborsClassifier(metric='euclidean', n_neighbors=7, weights='distance'),
    'AdaBoost': AdaBoostClassifier(learning_rate=0.1, n_estimators=50, random_state=42),
    'RandomForest': RandomForestClassifier(max_features='sqrt', n_estimators=100, random_state=42)
}


In [None]:

def execute_classifications(classifiers, train_texts, train_labels, test_texts, test_labels, labels):
    """Cross-validate, fit and evaluate every classifier on one task.

    Each classifier is wrapped in the same pipeline: text normalization
    (lower-casing + stop-word removal), TF-IDF, SMOTE, then the classifier.

    Args:
        classifiers: Mapping of display name -> unfitted estimator.
        train_texts: Training documents.
        train_labels: Training class values.
        test_texts: Held-out documents.
        test_labels: Held-out class values.
        labels: Human-readable class names, expected in ascending order of the
            class values. They are used as row names of the classification
            report when their count matches the number of classes.

    Returns:
        A list with one dict per classifier holding the keys 'Classifier',
        'CV Score Mean', 'Confusion Matrix', 'Classification Report' and
        'CV_Score' (the per-fold accuracies).
    """
    from sklearn.base import clone

    # Every class value seen in either split, sorted. Passing it explicitly
    # guarantees one confusion-matrix row/column per class even when a class
    # is absent from the test split or never predicted.
    class_values = sorted(set(train_labels) | set(test_labels))
    target_names = None
    if labels is not None and len(labels) == len(class_values):
        target_names = list(labels)

    results = []
    for name, classifier in classifiers.items():
        pipeline = ImbPipeline([
            ('text_normalizer', TextNormalizer(lowercase=True, removestopwords=True)),
            ('tfidf', TfidfVectorizer()),
            ('smote', SMOTE(random_state=42)),
            # Fit a fresh copy so the estimator stored in `classifiers` is
            # never mutated and can be reused for the next task.
            ('classifier', clone(classifier))
        ])

        cv_scores = cross_val_score(pipeline, train_texts, train_labels, cv=10, scoring='accuracy')
        pipeline.fit(train_texts, train_labels)
        test_predictions = pipeline.predict(test_texts)
        conf_matrix = confusion_matrix(test_labels, test_predictions, labels=class_values)
        class_report = classification_report(
            test_labels, test_predictions, labels=class_values, target_names=target_names
        )

        results.append({
            'Classifier': name,
            'CV Score Mean': cv_scores.mean(),
            'Confusion Matrix': conf_matrix,
            'Classification Report': class_report,
            'CV_Score': cv_scores
        })
    return results


In [None]:

def display_results(results, title, labels):
    """Print and plot the evaluation results of every classifier.

    Args:
        results: List of dicts with the keys 'Classifier', 'CV Score Mean',
            'Confusion Matrix', 'Classification Report' and 'CV_Score'.
        title: Suffix for the title of the comparison box plot.
        labels: Tick labels used on both axes of each confusion-matrix heatmap.
    """
    # Classification report and labelled confusion matrix, one per classifier
    for result in results:
        print(f"Classification Report pour {result['Classifier']}:\n{result['Classification Report']}")
        plt.figure(figsize=(10, 7))
        sns.heatmap(result['Confusion Matrix'], annot=True, fmt='d', cmap='Blues',
                    xticklabels=labels, yticklabels=labels)
        plt.title(f'Matrice de confusion pour {result["Classifier"]}')
        plt.xlabel('Prédictions')
        plt.ylabel('Étiquettes réelles')
        plt.show()

    # Mean cross-validation score of each classifier
    for result in results:
        print(f"Validation croisée (k=10) pour {result['Classifier']} - Accuracy: {result['CV Score Mean']}")

    # Long-format table with one row per (classifier, fold score). It is built
    # in a single call instead of concatenating onto an empty frame inside a
    # loop, which is quadratic and triggers pandas' empty-concat warning.
    cv_results = pd.DataFrame(
        [(result['Classifier'], score) for result in results for score in result['CV_Score']],
        columns=['Classifier', 'CV Score']
    )

    # Box plot comparing the fold scores of the classifiers
    plt.figure(figsize=(16, 8))
    sns.boxplot(x='Classifier', y='CV Score', data=cv_results)
    plt.title(f'Comparaison des Classifieurs - {title}')
    plt.xlabel('Classifieur')
    plt.ylabel('Score de Validation Croisée')
    plt.xticks(rotation=45)
    plt.show()


CLASSIFICATION 1 : TRUE VS FALSE

In [None]:

# Classification 1: TRUE vs FALSE
df1 = df[df['our rating'].isin(['true', 'false'])].copy()
df1['our rating'] = df1['our rating'].map({'true': 1, 'false': 0})
# Only rows missing BOTH title and text were dropped earlier. A row missing
# just one of them would make `title + " " + text` NaN, which the cleaner then
# turns into the literal token "nan" - so fill the gaps with empty strings.
df1[['title', 'text']] = df1[['title', 'text']].fillna('')

# Balance the classes by down-sampling class 0 to the size of class 1.
# NOTE(review): assumes 'false' (0) is the larger class; resample with
# replace=False raises if it is not.
majority = df1[df1['our rating'] == 0]
minority = df1[df1['our rating'] == 1]
majority_downsampled = resample(majority, replace=False, n_samples=len(minority), random_state=42)
balanced_df1 = pd.concat([majority_downsampled, minority])

train_df1, test_df1 = train_test_split(balanced_df1, test_size=0.2, random_state=42)
train_titles_texts1 = train_df1['title'] + " " + train_df1['text']
test_titles_texts1 = test_df1['title'] + " " + test_df1['text']
results1 = execute_classifications(classifiers, train_titles_texts1, train_df1['our rating'], test_titles_texts1, test_df1['our rating'], ['False', 'True'])
display_results(results1, 'True vs False', ['False', 'True'])

PICKLE DU MODELE SVC

In [None]:
import pickle

# SVC pipeline built with the tuned hyperparameters
svc_steps = [
    ('text_normalizer', TextNormalizer(lowercase=True, removestopwords=True)),
    ('tfidf', TfidfVectorizer()),
    ('classifier', SVC(C=300, gamma='scale', kernel='linear')),
]
pipeline = Pipeline(svc_steps)

# Train the model
pipeline.fit(train_titles_texts1, train_df1['our rating'])

# Serialize the fitted pipeline to disk with pickle
with open('svc_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)


Explain Like I'm 5 (ELI5)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
df1 = df[df['our rating'].isin(['true', 'false'])].copy()
df1['our rating'] = df1['our rating'].map({'true': 1, 'false': 0})
df1['title'] = df1['title'].fillna('')
df1['text'] = df1['text'].fillna('')

# Work on a 10% sample to keep the permutation importance tractable
df1_subset = df1.sample(frac=0.10, random_state=42)


train_titles_texts1 = df1_subset['title'] + " " + df1_subset['text']
train_labels1 = df1_subset['our rating']

pipeline = ImbPipeline([
    ('text_normalizer', TextNormalizer(lowercase=True, removestopwords=True)),
    ('tfidf', TfidfVectorizer()),
    ('classifier', SVC(C=300, gamma='scale', kernel='linear', probability=True))
])

pipeline.fit(train_titles_texts1, train_labels1)

# Fitted steps, reused individually below
fitted_normalizer = pipeline.named_steps['text_normalizer']
fitted_tfidf = pipeline.named_steps['tfidf']
fitted_svc = pipeline.named_steps['classifier']
feature_names = fitted_tfidf.get_feature_names_out()

# Feature importance with ELI5.
# The TF-IDF step was fitted on NORMALIZED text, so the normalizer has to be
# applied before vectorizing; feeding raw text would give the classifier
# features that differ from the ones it was trained on.
tfidf_transformed = fitted_tfidf.transform(fitted_normalizer.transform(train_titles_texts1)).toarray()
perm = PermutationImportance(fitted_svc, random_state=42).fit(tfidf_transformed, train_labels1)
# Only the last expression of a cell is rendered, so display explicitly
display(eli5.show_weights(perm, feature_names=feature_names))

# Choose a single instance to explain (taken from the training subset; this
# cell builds no test set)
sample_text = train_titles_texts1.iloc[0]
sample_vectorized = fitted_tfidf.transform(fitted_normalizer.transform([sample_text])).toarray()

# Explain the prediction for the single instance
explanation = eli5.explain_prediction(fitted_svc, sample_vectorized[0], feature_names=feature_names)

# Display the prediction and its explanation
print(f"Prediction for the sample text: {'True' if fitted_svc.predict(sample_vectorized)[0] == 1 else 'False'}")
display(eli5.show_prediction(fitted_svc, sample_vectorized[0], feature_names=feature_names))


CLASSIFICATION 2 : TRUE AND FALSE VS OTHER

In [None]:
# Classification 2: TRUE and FALSE vs OTHER
df2 = df[df['our rating'].isin(['true', 'false', 'other'])].copy()
# 'true' and 'false' are merged into class 0, 'other' becomes class 1
df2['our rating'] = df2['our rating'].apply(lambda x: 0 if x in ['true', 'false'] else 1)
# Only rows missing BOTH title and text were dropped earlier. A row missing
# just one of them would make `title + " " + text` NaN, which the cleaner then
# turns into the literal token "nan" - so fill the gaps with empty strings.
df2[['title', 'text']] = df2[['title', 'text']].fillna('')

# Balance the classes by down-sampling class 0 to the size of class 1.
# NOTE(review): assumes class 0 is the larger one; resample with
# replace=False raises if it is not.
majority = df2[df2['our rating'] == 0]
minority = df2[df2['our rating'] == 1]
majority_downsampled = resample(majority, replace=False, n_samples=len(minority), random_state=42)
balanced_df2 = pd.concat([majority_downsampled, minority])

train_df2, test_df2 = train_test_split(balanced_df2, test_size=0.2, random_state=42)
train_titles_texts2 = train_df2['title'] + " " + train_df2['text']
test_titles_texts2 = test_df2['title'] + " " + test_df2['text']
results2 = execute_classifications(classifiers, train_titles_texts2, train_df2['our rating'], test_titles_texts2, test_df2['our rating'], ['True/False', 'Other'])
display_results(results2, 'True/False vs Other', ['True/False', 'Other'])


CLASSIFICATION 3 : TRUE VS FALSE VS OTHER VS MIXTURE

In [None]:
# Classification 3: TRUE vs FALSE vs OTHER vs MIXTURE
df3 = df[df['our rating'].isin(['true', 'false', 'mixture', 'other'])].copy()
rating_map = {'true': 0, 'false': 1, 'mixture': 2, 'other': 3}
df3['our rating'] = df3['our rating'].map(rating_map)
# Only rows missing BOTH title and text were dropped earlier. A row missing
# just one of them would make `title + " " + text` NaN, which the cleaner then
# turns into the literal token "nan" - so fill the gaps with empty strings.
df3[['title', 'text']] = df3[['title', 'text']].fillna('')

# Down-sample every class to the size of the smallest one.
classes = df3['our rating'].unique()
dfs = [df3[df3['our rating'] == cls] for cls in classes]
# Despite its name, this is the size of the SMALLEST class (min).
majority_size = min(len(cls_df) for cls_df in dfs)
dfs_downsampled = [resample(cls_df, replace=False, n_samples=majority_size, random_state=42) for cls_df in dfs]
balanced_df3 = pd.concat(dfs_downsampled)

train_df3, test_df3 = train_test_split(balanced_df3, test_size=0.2, random_state=42)
train_titles_texts3 = train_df3['title'] + " " + train_df3['text']
test_titles_texts3 = test_df3['title'] + " " + test_df3['text']
results3 = execute_classifications(classifiers, train_titles_texts3, train_df3['our rating'], test_titles_texts3, test_df3['our rating'], ['True', 'False', 'Mixture', 'Other'])
display_results(results3, 'True vs False vs Mixture vs Other', ['True', 'False', 'Mixture', 'Other'])

PIPELINES AVEC GRIDSEARCH POUR TROUVER LES MEILLEURS HYPERPARAMÈTRES ET LES MEILLEURS PRÉTRAITEMENTS. LANCEZ CHAQUE BLOC EN ENTIER ; L'EXÉCUTION PREND DES HEURES, PARFOIS DES JOURS.

PIPELINE CLASSIFICATION 1 

In [None]:
!pip install xgboost imbalanced-learn

from xgboost import XGBClassifier
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# `resample` is not among this cell's imports; import it here so the cell
# also runs on its own in a fresh kernel.
from sklearn.utils import resample

# Download the NLTK resources used by the text-cleaning step.
# 'punkt_tab' is what word_tokenize looks up on NLTK >= 3.9 (it replaced the
# pickled 'punkt' models); 'punkt' is still fetched for older NLTK releases.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))

# Load the data from the CSV files
train_data = pd.read_csv("HAI817_Projet_train.csv")
test_data = pd.read_csv("HAI817_Projet_test.csv")

# Concatenate both files
df = pd.concat([train_data, test_data], ignore_index=True)

# Drop the columns that are not needed
df.drop(columns=['ID', 'public_id'], inplace=True)

# Drop rows where both 'title' and 'text' are missing
df.dropna(subset=['title', 'text'], how='all', inplace=True)

# Keep only the rows rated 'true' or 'false'
filtered_df = df[df['our rating'].isin(['true', 'false'])].copy()

# Encode 'true' as 1 and 'false' as 0
filtered_df['our rating'] = filtered_df['our rating'].map({'true': 1, 'false': 0})

# A row missing only one of title/text would make `title + " " + text` NaN,
# which the cleaner turns into the literal token "nan"; fill with ''.
filtered_df[['title', 'text']] = filtered_df[['title', 'text']].fillna('')

# Split by class for down-sampling.
# NOTE(review): assumes 'false' (0) is the larger class; resample with
# replace=False raises if it is not.
majority = filtered_df[filtered_df['our rating'] == 0]
minority = filtered_df[filtered_df['our rating'] == 1]

# Down-sample the majority class
majority_downsampled = resample(majority,
                              replace=False,    # sample without replacement
                              n_samples=len(minority),  # match the minority class size
                              random_state=42)  # reproducibility

# Concatenate the minority class with the down-sampled majority class
balanced_df = pd.concat([majority_downsampled, minority])

# Split into training and test sets
train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

# Build the model inputs: title and text joined by a space
train_titles_texts = train_df['title'] + " " + train_df['text']
test_titles_texts = test_df['title'] + " " + test_df['text']

# Text-cleaning function
def MyCleanText(X, lowercase=True, removestopwords=False, removedigit=False, getstemmer=False, getlemmatisation=False):
    """Normalize one document into a space-separated string of tokens.

    Args:
        X: The raw document. It is converted with ``str()``; ``None`` and float
            NaN are treated as an empty document.
        lowercase: Lower-case every token.
        removestopwords: Drop tokens found in the module-level ``stop_words`` set.
        removedigit: Drop tokens made only of digits.
        getstemmer: Apply Porter stemming to each token.
        getlemmatisation: Apply WordNet lemmatization to each token.

    Returns:
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    # str(None) / str(nan) would otherwise inject the literal tokens
    # "none" / "nan" into the vocabulary.
    if X is None or (isinstance(X, float) and X != X):
        return ''

    sentence = str(X)
    # Replace every character that is neither a word character nor whitespace.
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    # Remove isolated single letters. Word boundaries are used rather than
    # surrounding whitespace so that letters at the very start/end of the text
    # and runs of consecutive single letters ("a b c") are removed as well.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)
    # Collapse runs of whitespace.
    sentence = re.sub(r'\s+', ' ', sentence)
    tokens = word_tokenize(sentence)

    if lowercase:
        tokens = [token.lower() for token in tokens]

    # After the first substitution the only string.punctuation character that
    # can remain is '_' (it is matched by \w); strip it from the tokens.
    table = str.maketrans('', '', string.punctuation)
    words = [token.translate(table) for token in tokens]
    # Also discards tokens left empty by the translation above.
    words = [word for word in words if word.isalnum()]

    if removedigit:
        words = [word for word in words if not word.isdigit()]

    if removestopwords:
        words = [word for word in words if word not in stop_words]

    if getlemmatisation:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    if getstemmer:
        ps = PorterStemmer()
        words = [ps.stem(word) for word in words]

    return ' '.join(words)

# Text-normalization transformer
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that applies ``MyCleanText`` to every document.

    Each constructor argument is stored unchanged under the same attribute
    name, so the ``get_params`` / ``set_params`` inherited from
    ``BaseEstimator`` expose exactly these five options and reject unknown
    parameter names (e.g. a mistyped grid-search key).

    Args:
        removestopwords: Forwarded to ``MyCleanText``.
        lowercase: Forwarded to ``MyCleanText`` (note: defaults to False here).
        removedigit: Forwarded to ``MyCleanText``.
        getstemmer: Forwarded to ``MyCleanText``.
        getlemmatisation: Forwarded to ``MyCleanText``.
    """

    def __init__(self, removestopwords=False, lowercase=False, removedigit=False, getstemmer=False, getlemmatisation=False):
        self.lowercase = lowercase
        self.getstemmer = getstemmer
        self.removestopwords = removestopwords
        self.getlemmatisation = getlemmatisation
        self.removedigit = removedigit

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each document in ``X``.

        ``X`` is only iterated, never modified, so any iterable of documents
        (Series, list, tuple, ...) is accepted.
        """
        return [
            MyCleanText(
                text,
                lowercase=self.lowercase,
                getstemmer=self.getstemmer,
                removestopwords=self.removestopwords,
                getlemmatisation=self.getlemmatisation,
                removedigit=self.removedigit,
            )
            for text in X
        ]

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y, **fit_params).transform(X)

# Preprocessing configurations to compare.
# Each entry must be unique: a repeated entry re-runs the complete grid search
# for every classifier. (The list used to end with a second copy of the
# all-options-enabled configuration; it has been removed.)
preprocessing_options = [
    {'lowercase': True, 'removestopwords': True, 'removedigit': False, 'getlemmatisation': False},
    {'lowercase': True, 'removestopwords': True, 'removedigit': True, 'getlemmatisation': True},
    {'lowercase': True, 'removestopwords': True, 'removedigit': False, 'getlemmatisation': True},
    {'lowercase': False, 'removestopwords': True, 'removedigit': True, 'getlemmatisation': True},
    {'lowercase': True, 'removestopwords': False, 'removedigit': True, 'getlemmatisation': True}
]
# Classifiers to tune; keys must match the keys of param_grids below
classifiers = {
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'LogisticRegression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Hyperparameter grids. The 'classifier__' prefix targets the pipeline step
# named 'classifier'.
param_grids = {
    'SVC': {
        'classifier__C': [0.1, 1, 10, 100, 300],
        'classifier__gamma': ['scale', 'auto'],
        'classifier__kernel': ['linear', 'rbf']
    },
    'RandomForest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_features': ['sqrt', 'log2']
    },
    'GradientBoosting': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    },
    'LogisticRegression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear']
    },
    'KNN': {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan']
    },
    'AdaBoost': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 1.0]
    },
    'XGBoost': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

results = []

for preprocess_params in preprocessing_options:
    # Name the configuration after the options that are switched on
    enabled_options = [option for option, enabled in preprocess_params.items() if enabled]
    preprocess_name = '_'.join(enabled_options)

    for classifier_name, classifier in classifiers.items():
        param_grid = param_grids[classifier_name]
        steps = [
            ('text_normalizer', TextNormalizer(**preprocess_params)),
            ('tfidf', TfidfVectorizer()),
            ('smote', SMOTE(random_state=42)),
            ('classifier', classifier),
        ]
        pipeline = ImbPipeline(steps)

        # 10-fold cross-validated search over this classifier's grid
        grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')
        grid_search.fit(train_titles_texts, train_df['our rating'])

        mean_score = grid_search.best_score_
        record = {}
        record['Preprocessing'] = preprocess_name
        record['Classifier'] = classifier_name
        record['Accuracy'] = mean_score
        record['Best Params'] = grid_search.best_params_
        results.append(record)

# Show the results
results_df = pd.DataFrame(results)
print(results_df)

# Save the results to a CSV file
results_df.to_csv("results_with_smote.csv", index=False)


PIPELINE CLASSIFICATION 2

In [None]:
!pip install xgboost imbalanced-learn

from xgboost import XGBClassifier
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# `resample` is not among this cell's imports; import it here so the cell
# also runs on its own in a fresh kernel.
from sklearn.utils import resample

# Download the NLTK resources used by the text-cleaning step.
# 'punkt_tab' is what word_tokenize looks up on NLTK >= 3.9 (it replaced the
# pickled 'punkt' models); 'punkt' is still fetched for older NLTK releases.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))

# Load the data from the CSV files
train_data = pd.read_csv("HAI817_Projet_train.csv")
test_data = pd.read_csv("HAI817_Projet_test.csv")

# Concatenate both files
df = pd.concat([train_data, test_data], ignore_index=True)

# Drop the columns that are not needed
df.drop(columns=['ID', 'public_id'], inplace=True)

# Drop rows where both 'title' and 'text' are missing
df.dropna(subset=['title', 'text'], how='all', inplace=True)

# Keep only the rows rated 'true', 'false' or 'other';
# 'true'/'false' are merged into class 0, 'other' becomes class 1
df = df[df['our rating'].isin(['true', 'false', 'other'])].copy()
df['our rating'] = df['our rating'].apply(lambda x: 0 if x in ['true', 'false'] else 1)

# A row missing only one of title/text would make `title + " " + text` NaN,
# which the cleaner turns into the literal token "nan"; fill with ''.
df[['title', 'text']] = df[['title', 'text']].fillna('')

# Split by class for down-sampling.
# NOTE(review): assumes class 0 is the larger one; resample with
# replace=False raises if it is not.
majority = df[df['our rating'] == 0]
minority = df[df['our rating'] == 1]

# Down-sample the majority class
majority_downsampled = resample(majority,
                              replace=False,    # sample without replacement
                              n_samples=len(minority),  # match the minority class size
                              random_state=42)  # reproducibility

# Concatenate the minority class with the down-sampled majority class
balanced_df = pd.concat([majority_downsampled, minority])

# Split into training and test sets
train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

# Build the model inputs: title and text joined by a space
train_titles_texts = train_df['title'] + " " + train_df['text']
test_titles_texts = test_df['title'] + " " + test_df['text']

# Text-cleaning function
def MyCleanText(X, lowercase=True, removestopwords=False, removedigit=False, getstemmer=False, getlemmatisation=False):
    """Normalize one document into a space-separated string of tokens.

    Args:
        X: The raw document. It is converted with ``str()``; ``None`` and float
            NaN are treated as an empty document.
        lowercase: Lower-case every token.
        removestopwords: Drop tokens found in the module-level ``stop_words`` set.
        removedigit: Drop tokens made only of digits.
        getstemmer: Apply Porter stemming to each token.
        getlemmatisation: Apply WordNet lemmatization to each token.

    Returns:
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    # str(None) / str(nan) would otherwise inject the literal tokens
    # "none" / "nan" into the vocabulary.
    if X is None or (isinstance(X, float) and X != X):
        return ''

    sentence = str(X)
    # Replace every character that is neither a word character nor whitespace.
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    # Remove isolated single letters. Word boundaries are used rather than
    # surrounding whitespace so that letters at the very start/end of the text
    # and runs of consecutive single letters ("a b c") are removed as well.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)
    # Collapse runs of whitespace.
    sentence = re.sub(r'\s+', ' ', sentence)
    tokens = word_tokenize(sentence)

    if lowercase:
        tokens = [token.lower() for token in tokens]

    # After the first substitution the only string.punctuation character that
    # can remain is '_' (it is matched by \w); strip it from the tokens.
    table = str.maketrans('', '', string.punctuation)
    words = [token.translate(table) for token in tokens]
    # Also discards tokens left empty by the translation above.
    words = [word for word in words if word.isalnum()]

    if removedigit:
        words = [word for word in words if not word.isdigit()]

    if removestopwords:
        words = [word for word in words if word not in stop_words]

    if getlemmatisation:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    if getstemmer:
        ps = PorterStemmer()
        words = [ps.stem(word) for word in words]

    return ' '.join(words)

# Text-normalization transformer
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that applies ``MyCleanText`` to every document.

    Each constructor argument is stored unchanged under the same attribute
    name, so the ``get_params`` / ``set_params`` inherited from
    ``BaseEstimator`` expose exactly these five options and reject unknown
    parameter names (e.g. a mistyped grid-search key).

    Args:
        removestopwords: Forwarded to ``MyCleanText``.
        lowercase: Forwarded to ``MyCleanText`` (note: defaults to False here).
        removedigit: Forwarded to ``MyCleanText``.
        getstemmer: Forwarded to ``MyCleanText``.
        getlemmatisation: Forwarded to ``MyCleanText``.
    """

    def __init__(self, removestopwords=False, lowercase=False, removedigit=False, getstemmer=False, getlemmatisation=False):
        self.lowercase = lowercase
        self.getstemmer = getstemmer
        self.removestopwords = removestopwords
        self.getlemmatisation = getlemmatisation
        self.removedigit = removedigit

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each document in ``X``.

        ``X`` is only iterated, never modified, so any iterable of documents
        (Series, list, tuple, ...) is accepted.
        """
        return [
            MyCleanText(
                text,
                lowercase=self.lowercase,
                getstemmer=self.getstemmer,
                removestopwords=self.removestopwords,
                getlemmatisation=self.getlemmatisation,
                removedigit=self.removedigit,
            )
            for text in X
        ]

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y, **fit_params).transform(X)

# Preprocessing configurations to compare.
# Each entry must be unique: a repeated entry re-runs the complete grid search
# for every classifier. (The list used to end with a second copy of the
# all-options-enabled configuration; it has been removed.)
preprocessing_options = [
    {'lowercase': True, 'removestopwords': True, 'removedigit': False, 'getlemmatisation': False},
    {'lowercase': True, 'removestopwords': True, 'removedigit': True, 'getlemmatisation': True},
    {'lowercase': True, 'removestopwords': True, 'removedigit': False, 'getlemmatisation': True},
    {'lowercase': False, 'removestopwords': True, 'removedigit': True, 'getlemmatisation': True},
    {'lowercase': True, 'removestopwords': False, 'removedigit': True, 'getlemmatisation': True}
]
# Classifiers to tune; keys must match the keys of param_grids below
classifiers = {
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'LogisticRegression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Hyperparameter grids. The 'classifier__' prefix targets the pipeline step
# named 'classifier'.
param_grids = {
    'SVC': {
        'classifier__C': [0.1, 1, 10, 100, 300],
        'classifier__gamma': ['scale', 'auto'],
        'classifier__kernel': ['linear', 'rbf']
    },
    'RandomForest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_features': ['sqrt', 'log2']
    },
    'GradientBoosting': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    },
    'LogisticRegression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear']
    },
    'KNN': {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan']
    },
    'AdaBoost': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 1.0]
    },
    'XGBoost': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

import os

# Create the output directory BEFORE the search starts: the search runs for
# hours, and to_csv does not create missing directories, so a missing
# 'final/' folder would otherwise only surface once all the work is done.
results_path = "final/2emeclassification_results_with_smote.csv"
os.makedirs(os.path.dirname(results_path), exist_ok=True)

results = []

for preprocess_params in preprocessing_options:
    # Name the configuration after the options that are switched on
    preprocess_name = '_'.join([k for k, v in preprocess_params.items() if v])
    for classifier_name, classifier in classifiers.items():
        pipeline = ImbPipeline([
            ('text_normalizer', TextNormalizer(**preprocess_params)),
            ('tfidf', TfidfVectorizer()),
            ('smote', SMOTE(random_state=42)),
            ('classifier', classifier)
        ])

        param_grid = param_grids[classifier_name]

        # 10-fold cross-validated search over this classifier's grid
        grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')
        grid_search.fit(train_titles_texts, train_df['our rating'])

        mean_score = grid_search.best_score_

        results.append({
            'Preprocessing': preprocess_name,
            'Classifier': classifier_name,
            'Accuracy': mean_score,
            'Best Params': grid_search.best_params_
        })

# Show the results
results_df = pd.DataFrame(results)
print(results_df)

# Save the results to a CSV file
results_df.to_csv(results_path, index=False)

PIPELINE CLASSIFICATION 3

In [None]:
!pip install xgboost imbalanced-learn

from xgboost import XGBClassifier
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# `resample` is not among this cell's imports; import it here so the cell
# also runs on its own in a fresh kernel.
from sklearn.utils import resample

# Download the NLTK resources used by the text-cleaning step.
# 'punkt_tab' is what word_tokenize looks up on NLTK >= 3.9 (it replaced the
# pickled 'punkt' models); 'punkt' is still fetched for older NLTK releases.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))

# Load the data from the CSV files
train_data = pd.read_csv("HAI817_Projet_train.csv")
test_data = pd.read_csv("HAI817_Projet_test.csv")

# Concatenate both files
df = pd.concat([train_data, test_data], ignore_index=True)

# Drop the columns that are not needed
df.drop(columns=['ID', 'public_id'], inplace=True)

# Drop rows where both 'title' and 'text' are missing
df.dropna(subset=['title', 'text'], how='all', inplace=True)

# Keep only the rows rated 'true', 'false', 'mixture' or 'other'
df = df[df['our rating'].isin(['true', 'false', 'mixture', 'other'])].copy()

# Map the 'our rating' values to integers
rating_map = {'true': 0, 'false': 1, 'mixture': 2, 'other': 3}
df['our rating'] = df['our rating'].map(rating_map)

# Up-sample every class (with replacement) to the size of the largest one.
# NOTE(review): if the train/test split is done on balanced_df afterwards,
# duplicated rows can land in both splits and inflate the test score -
# confirm against the rest of this cell.
classes = df['our rating'].unique()
dfs = [df[df['our rating'] == cls] for cls in classes]
majority_size = max(len(cls_df) for cls_df in dfs)
dfs_upsampled = [resample(cls_df, replace=True, n_samples=majority_size, random_state=42) for cls_df in dfs]
balanced_df = pd.concat(dfs_upsampled)

# Text-cleaning function
def MyCleanText(X, lowercase=True, removestopwords=False, removedigit=False, getstemmer=False, getlemmatisation=False):
    """Normalize one document into a space-separated string of tokens.

    Args:
        X: The raw document. It is converted with ``str()``; ``None`` and float
            NaN are treated as an empty document.
        lowercase: Lower-case every token.
        removestopwords: Drop tokens found in the module-level ``stop_words`` set.
        removedigit: Drop tokens made only of digits.
        getstemmer: Apply Porter stemming to each token.
        getlemmatisation: Apply WordNet lemmatization to each token.

    Returns:
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    # str(None) / str(nan) would otherwise inject the literal tokens
    # "none" / "nan" into the vocabulary.
    if X is None or (isinstance(X, float) and X != X):
        return ''

    sentence = str(X)
    # Replace every character that is neither a word character nor whitespace.
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    # Remove isolated single letters. Word boundaries are used rather than
    # surrounding whitespace so that letters at the very start/end of the text
    # and runs of consecutive single letters ("a b c") are removed as well.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)
    # Collapse runs of whitespace.
    sentence = re.sub(r'\s+', ' ', sentence)
    tokens = word_tokenize(sentence)

    if lowercase:
        tokens = [token.lower() for token in tokens]

    # After the first substitution the only string.punctuation character that
    # can remain is '_' (it is matched by \w); strip it from the tokens.
    table = str.maketrans('', '', string.punctuation)
    words = [token.translate(table) for token in tokens]
    # Also discards tokens left empty by the translation above.
    words = [word for word in words if word.isalnum()]

    if removedigit:
        words = [word for word in words if not word.isdigit()]

    if removestopwords:
        words = [word for word in words if word not in stop_words]

    if getlemmatisation:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    if getstemmer:
        # This cell's import list does not include PorterStemmer; import it
        # locally so the stemming option also works when the cell runs alone.
        from nltk.stem import PorterStemmer
        ps = PorterStemmer()
        words = [ps.stem(word) for word in words]

    return ' '.join(words)

# Define the text-normalisation transformer
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that applies ``MyCleanText`` to each document.

    The constructor arguments are stored unchanged under attributes of the
    same name, which is what lets the inherited ``BaseEstimator.get_params`` /
    ``set_params`` (and therefore ``clone`` and ``GridSearchCV``) work without
    custom overrides. The inherited ``set_params`` also rejects unknown
    parameter names instead of silently creating new attributes.

    Parameters
    ----------
    removestopwords, lowercase, removedigit, getstemmer, getlemmatisation : bool
        Forwarded as-is to ``MyCleanText``. Note that ``lowercase`` defaults
        to False here, whereas ``MyCleanText`` itself defaults it to True.
    """

    def __init__(self, removestopwords=False, lowercase=False, removedigit=False, getstemmer=False, getlemmatisation=False):
        self.lowercase = lowercase
        self.getstemmer = getstemmer
        self.removestopwords = removestopwords
        self.getlemmatisation = getlemmatisation
        self.removedigit = removedigit

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with the cleaned version of every document in ``X``.

        ``X`` is only iterated over (never modified), so any iterable of
        documents is accepted.
        """
        return [
            MyCleanText(
                text,
                lowercase=self.lowercase,
                getstemmer=self.getstemmer,
                removestopwords=self.removestopwords,
                getlemmatisation=self.getlemmatisation,
                removedigit=self.removedigit,
            )
            for text in X
        ]

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X).transform(X)``."""
        return self.fit(X, y, **fit_params).transform(X)

# Build the model input by joining title and body text.
# Rows are only dropped upstream when BOTH fields are missing, so one of the
# two can still be NaN here. NaN + str is NaN, which would later be cleaned as
# the literal token "nan" and throw away the field that is present; missing
# parts are therefore replaced by an empty string before concatenating.
titles_texts = (
    balanced_df['title'].fillna('').astype(str)
    + " "
    + balanced_df['text'].fillna('').astype(str)
)

# Input variants to evaluate (name -> documents)
datasets = {
    'Title_Text': titles_texts
}

# Pre-processing configurations to compare; each dict is passed as keyword
# arguments to TextNormalizer. Every configuration is listed once: a repeated
# entry would only re-run the same grid searches.
preprocessing_options = [
    {'lowercase': True, 'removestopwords': True, 'removedigit': False, 'getlemmatisation': False},
    {'lowercase': True, 'removestopwords': True, 'removedigit': True, 'getlemmatisation': True},
    {'lowercase': True, 'removestopwords': True, 'removedigit': False, 'getlemmatisation': True},
    {'lowercase': False, 'removestopwords': True, 'removedigit': True, 'getlemmatisation': True},
    {'lowercase': True, 'removestopwords': False, 'removedigit': True, 'getlemmatisation': True}
]

# Define the classifiers to compare (their parameter grids are keyed by the
# same names). Estimators that use randomness are seeded with the same value
# as the resampling steps (42) so that repeated runs give the same scores.
# NOTE(review): XGBClassifier has to be imported (from xgboost) by an import
# line that is not visible in this part of the cell — confirm.
classifiers = {
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Hyper-parameter grids, keyed by classifier name. The 'classifier__' prefix
# targets the pipeline step named 'classifier'.
param_grids = {
    # Two sub-grids: 'gamma' is only used by the RBF kernel, so crossing it
    # with the linear kernel would fit every linear configuration twice.
    # GridSearchCV accepts a list of dicts and explores each one in turn.
    'SVC': [
        {
            'classifier__C': [0.1, 1, 10, 100, 300],
            'classifier__kernel': ['linear']
        },
        {
            'classifier__C': [0.1, 1, 10, 100, 300],
            'classifier__gamma': ['scale', 'auto'],
            'classifier__kernel': ['rbf']
        }
    ],
    'RandomForest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_features': ['sqrt', 'log2']
    },
    'GradientBoosting': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    },
    'LogisticRegression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        # liblinear supports both the l1 and the l2 penalty
        'classifier__solver': ['liblinear']
    },
    'KNN': {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan']
    },
    'AdaBoost': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 1.0]
    },
    'XGBoost': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

results = []

# The target vector is the same for every dataset / configuration
labels = balanced_df['our rating']

for dataset_name, dataset in datasets.items():
    for preprocess_params in preprocessing_options:
        # Label the configuration by the names of its enabled options
        preprocess_name = '_'.join(k for k, v in preprocess_params.items() if v)

        # TextNormalizer learns nothing in fit() and cleans each document
        # independently, so its output does not depend on the CV split.
        # Normalising once here, instead of inside the pipeline, avoids
        # repeating the tokenizing/lemmatizing work for every fold, every grid
        # point and every classifier, while leaving the scores unchanged.
        # (Consequence: the fitted pipelines expect already-normalised text.)
        normalized_texts = TextNormalizer(**preprocess_params).fit_transform(dataset)

        for classifier_name, classifier in classifiers.items():
            # imblearn pipeline: SMOTE is applied only when fitting, i.e. on
            # the training part of each fold. GridSearchCV clones the pipeline,
            # so sharing the classifier instance across iterations is safe.
            pipeline = ImbPipeline([
                ('tfidf', TfidfVectorizer()),
                ('smote', SMOTE(random_state=42)),
                ('classifier', classifier)
            ])

            grid_search = GridSearchCV(
                pipeline,
                param_grids[classifier_name],
                cv=10,
                scoring='accuracy'
            )
            grid_search.fit(normalized_texts, labels)

            # best_score_ is the mean cross-validated accuracy of the best grid point
            results.append({
                'Preprocessing': preprocess_name,
                'Classifier': classifier_name,
                'Accuracy': grid_search.best_score_,
                'Best Params': grid_search.best_params_
            })

# Show the results
results_df = pd.DataFrame(results)
print(results_df)

# Save the results to a CSV file
results_df.to_csv("final/3meclassification_results_with_smote.csv", index=False)