In [None]:
import pandas as pd
from itertools import product
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" filter also hides warnings raised by the
# solvers used further down (e.g. non-convergence notices) - confirm that this
# is intended before trusting the reported scores.
warnings.filterwarnings("ignore")

In [None]:
# Load the iris dataset and pull out the feature matrix and the class labels.
# Note: the CSV-loading cell that follows rebinds X, y and the train/test split.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Read the dummy-encoded dataset from disk. The conversion cell at the end of
# this notebook writes 'dataset_with_dummies.csv', so that cell must have been
# run at least once before this one.
data = pd.read_csv("./dataset_with_dummies.csv")

# The label lives in the 'Cible' column; every other column is a feature.
# (A commented-out variant of this cell used a column named 'Target' instead.)
y = data['Cible']
X = data.drop(columns=['Cible'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline logistic regression: C=10, lbfgs solver, iteration cap raised to 1000.
logistic_regression = LogisticRegression(
    C=10,
    solver='lbfgs',
    max_iter=1000,
)

# Fit on the training split, then predict labels for the held-out test split.
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

In [None]:
# Hyper-parameter search for the logistic regression.
# Note: this rebinds `logistic_regression` to a fresh, unfitted estimator;
# `y_pred` computed by the previous cell is left untouched.
logistic_regression = LogisticRegression()

# Candidate values; the grid is the full cross product (7 x 5 x 3 = 105 settings).
parameters = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    max_iter=[100, 1000, 10000],
)

# Evaluate every setting with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=logistic_regression, param_grid=parameters, cv=5)
grid_search.fit(X_train, y_train)

# Per-setting results as a table.
results = pd.DataFrame(grid_search.cv_results_)

# Show only the searched parameters next to the mean and spread of the fold scores.
summary_columns = [
    'param_C',
    'param_solver',
    'param_max_iter',
    'mean_test_score',
    'std_test_score',
]
print("Comparison of different parameter combinations:")
print(results[summary_columns])

# Best setting and its mean cross-validated score.
best_params, best_accuracy = grid_search.best_params_, grid_search.best_score_
print(f"\nBest parameters: {best_params}")
print(f"Accuracy with best parameters: {best_accuracy}")

In [None]:
# Score the predictions currently held in `y_pred` (set by whichever cell
# assigned it last) against the test labels.
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Précision du modèle : {accuracy}")

# Print the per-class report and then the confusion matrix, each under its heading.
for heading, build_report in (
    ("Rapport de classification :", metrics.classification_report),
    ("Matrice de confusion :", metrics.confusion_matrix),
):
    print(heading)
    print(build_report(y_test, y_pred))

In [None]:
# Find the class-weight combination that gives the highest accuracy.
#
# Candidates are compared on a validation split carved out of the training data,
# not on the test set: picking the winner by its test score would make the
# reported test accuracy optimistically biased. Once the winner is known, the
# model is refit on the whole training split and scored on the test set once.
unique_classes = data['Cible'].unique()

# Inner split used only for comparing candidates (same ratio and seed as the outer split).
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

best_accuracy = 0
best_class_weights = None

# Every assignment of a weight from {1, 5, 10} to each class: 3 ** n_classes candidates.
weight_combinations = product([1, 5, 10], repeat=len(unique_classes))

for weights in weight_combinations:
    class_weights = dict(zip(unique_classes, weights))

    # Train a candidate on the inner training part only.
    candidate = LogisticRegression(C=10, max_iter=1000, solver='lbfgs', class_weight=class_weights)
    candidate.fit(X_fit, y_fit)

    # Accuracy on the validation part decides which candidate wins.
    validation_accuracy = metrics.accuracy_score(y_val, candidate.predict(X_val))
    print(f"Accuracy with class weights {class_weights}: {validation_accuracy}")

    # Strict '>' keeps the first combination when several tie.
    if validation_accuracy > best_accuracy:
        best_accuracy = validation_accuracy
        best_class_weights = class_weights

print(f"Best accuracy achieved: {best_accuracy} with class weights: {best_class_weights}")

# Refit with the winning weights on the full training split so that
# `logistic_regression`, `y_pred` and `accuracy` describe the selected model
# (previously they were left holding the last candidate tried).
logistic_regression = LogisticRegression(C=10, max_iter=1000, solver='lbfgs', class_weight=best_class_weights)
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Test accuracy with best class weights: {accuracy}")

In [None]:
# Convert the raw dataset: replace category codes with readable labels, then
# add dummy (one-hot) columns for every categorical column.
#
# Inputs : 'donnee_info.json' (label mapping) and 'fr_dataset.csv' (raw data).
# Outputs: 'modified_dataset.csv' (labels substituted) and
#          'dataset_with_dummies.csv' (dummy-encoded; read by the CSV-loading
#          cell near the top of the notebook, so run this cell first on a fresh
#          checkout).
import pandas as pd
import json

# Load the JSON mapping; it is indexed by column name and each entry is used to
# translate that column's stringified codes (presumably {code: label} objects).
with open('donnee_info.json', 'r', encoding='utf-8') as file:
    mapping = json.load(file)

# Read the CSV file
df = pd.read_csv('fr_dataset.csv')

# Categorical columns to translate and dummy-encode.
columns_to_replace = [
    "État civil",
    "Mode d'application",
    "Cours",
    "Présence jour/soir",
    "Qualification antérieure",
    "Nationalité",
    "Qualification mère",
    "Qualification père",
    "Occupation mère",
    "Occupation père",
    "Déplacé",
    "Besoins éducatifs spéciaux",
    "Dettes",
    "Frais de scolarité à jour",
    "Sexe",
    "Bourse",
    "International",
    # "Cible" is deliberately left out, so it is neither translated nor encoded.
]

# Replace the codes with their string equivalents.
for column in columns_to_replace:
    codes = df[column].astype(str)

    # Series.map turns codes that are absent from the mapping into NaN, and such
    # rows later get no dummy column at all. Report them instead of losing them
    # silently; the resulting values are unchanged.
    missing_codes = codes[~codes.isin(list(mapping[column].keys()))].unique().tolist()
    if missing_codes:
        print(f"Warning: column {column!r} has codes missing from the mapping: {sorted(missing_codes)}")

    df[column] = codes.map(mapping[column])

# Save the label-substituted data to a new CSV file
df.to_csv('modified_dataset.csv', index=False, encoding='utf-8')


# One-hot encode all categorical columns in a single call. Each listed column is
# replaced by '<column>_<label>' indicator columns appended after the remaining
# columns, in list order - the same layout the per-column concat/drop loop built.
df = pd.get_dummies(df, columns=columns_to_replace)

# Save the dataset with dummy columns to a new CSV file
df.to_csv('dataset_with_dummies.csv', index=False, encoding='utf-8')
