#Pour tester les 2 algorithmes, il faut prendre ces codes dans le notebook de base, et de l'exécuter, je ne suis pas arrivé à télécharger le notebook de base car j'ai dépassé les 10 MB.

#1. SVM

### Étape1. Définir le modèle d'apprentissage

In [0]:
from pyspark.ml.classification import LinearSVC

svm = LinearSVC(featuresCol="features", labelCol="label", maxIter=10, regParam=1.0)


### Étape 2. Construisez le pipeline

In [0]:
# check class imbalance before creating the pipeline
class_distribution = dataset.groupBy('income').count().orderBy('income')
class_distribution.show()


+------+-----+
|income|count|
+------+-----+
| <=50K|24720|
|  >50K| 7841|
+------+-----+



In [0]:
pipeline1 = Pipeline(stages=[stringIndexer, encoder, labelToIndex, vecAssembler, svm])

# Define the pipeline model.
pipelineModel1 = pipeline1.fit(trainDF)

# Apply the pipeline model to the test dataset.
predDF1 = pipelineModel1.transform(testDF)

display(predDF1)

display the basic rows

In [0]:
display(predDF1.select("features","rawPrediction", "prediction", "label"))

In [0]:
# see if transformations, such as encoding and assembling affect the balance
class_distribution_after_pipeline = predDF1.groupBy('prediction').count().orderBy('prediction')
class_distribution_after_pipeline.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 6394|
|       1.0|   91|
+----------+-----+



On peut voir que notre data est imbalance

### Etape 3 : Evaluation du model

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

bcEvaluator1 = BinaryClassificationEvaluator(metricName="areaUnderROC")
print(f"Area under ROC curve: {bcEvaluator1.evaluate(predDF1)}")

mcEvaluator1 = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator1.evaluate(predDF1)}")

# Configure the evaluator
recallEvaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="recallByLabel")
print(f"recall : {recallEvaluator.evaluate(predDF1)}")

# Evaluate the model using MulticlassClassificationEvaluator for F1-Score
multi_evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
print(f"F1 Score: {multi_evaluator_f1.evaluate(predDF1)}")

Area under ROC curve: 0.884278072268483
Accuracy: 0.7702390131071704
recall : 0.9995925020374898
F1 Score: 0.6829980614324778


### Étape 4. Réglage des hyperparamètres

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel

paramGrid1 = ParamGridBuilder() \
    .addGrid(svm.regParam, [0.1, 0.01]) \
    .addGrid(svm.maxIter, [10, 100, 1000]) \
    .build()


In [0]:
crossval1 = CrossValidator(estimator=pipeline1,
                          estimatorParamMaps=paramGrid1,
                          evaluator=bcEvaluator1,
                          numFolds=3)  # Use 3+ folds in practice

# Fit the model
cvModel = crossval1.fit(trainDF)
bestModel = cvModel.bestModel
predictions = bestModel.transform(testDF)


In [0]:
print(f"Area under ROC curve: {bcEvaluator.evaluate(predictions)}")
print(f"Accuracy: {mcEvaluator.evaluate(predictions)}")
print(f"Recall: {recallEvaluator.evaluate(predictions)}")
print(f"F1 Score: {multi_evaluator_f1.evaluate(predictions)}")

Area under ROC curve: 0.9001913586659108
Accuracy: 0.8473400154202004
Recall: 0.9366340668296659
F1 Score: 0.8400158547740251


### En peut remarqer que l'accuracy et F1 Scors ont augmenter qui dit que notre model a bien été Amélioré. 

#2. Radom Forest

%md
### Étape1. Définir le modèle d'apprentissage

In [0]:
from pyspark.ml.classification import RandomForestClassifier
# Replace the Logistic Regression with Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=10)


### Étape 2. Construisez le pipeline

In [0]:
from pyspark.ml import Pipeline

# Define the pipeline based on the stages created in previous steps, replacing lr with rf.
pipeline2 = Pipeline(stages=[stringIndexer, encoder, labelToIndex, vecAssembler, rf])
# Fit the model
pipelineModel2 = pipeline2.fit(trainDF)

# Make predictions on the test data
predDF2 = pipelineModel2.transform(testDF)
display(predDF2)

In [0]:
nombre_de_colonnes = len(predDF2.columns)
print("Nombre de colonnes dans le DataFrame :", nombre_de_colonnes)


Nombre de colonnes dans le DataFrame : 34


In [0]:
display(predDF2.select("features","rawPrediction", "prediction", "label"))

### Etape 3 : Evaluation du model

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Binary classification evaluator for area under ROC
bcEvaluator2 = BinaryClassificationEvaluator(metricName="areaUnderROC")
print(f"Area under ROC curve: {bcEvaluator2.evaluate(predDF2)}")

# Multi-class classification evaluator for accuracy
mcEvaluator2 = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator2.evaluate(predDF2)}")

# Configure the evaluator for recall
recallEvaluator2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
print(f"Recall: {recallEvaluator2.evaluate(predDF2)}")

multi_evaluator_f2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
print(f"F1 Score: {multi_evaluator_f2.evaluate(predDF2)}")

Area under ROC curve: 0.8860698875801749
Accuracy: 0.825905936777178
Recall: 0.825905936777178
F1 Score: 0.801125231731012


### Etape 4 : Réglage des paramétre (Random forest)

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [0]:
paramGrid2 = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .build()


In [0]:
bcEvaluator2 = BinaryClassificationEvaluator(metricName="areaUnderROC")
crossval2 = CrossValidator(estimator=pipeline2,
                          estimatorParamMaps=paramGrid2,
                          evaluator=bcEvaluator,
                          numFolds=5)  # 5 plis est un bon point de départ


In [0]:
# Fit the model
cvModel2 = crossval2.fit(trainDF)

# Obtenir le meilleur modèle
bestModel = cvModel2.bestModel


In [0]:
# Faire des prédictions
predictions2 = bestModel.transform(testDF)

# Évaluateur pour l'aire sous la courbe ROC
bcEvaluator2 = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName="areaUnderROC")
auc2 = bcEvaluator2.evaluate(predictions2)
accuracyEvaluator2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy2 = accuracyEvaluator2.evaluate(predictions2)
recallEvaluator2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
weightedRecall2 = recallEvaluator2.evaluate(predictions2)
multi_evaluator_f2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")


print(f"Area under ROC curve (AUC): {auc2}")
print(f"Accuracy: {accuracy2}")
print(f"Weighted Recall: {weightedRecall2}")
print(f"F1 Score: {multi_evaluator_f2.evaluate(predictions2)}")