# Classification : a24_clas_app.txt

## 1. Chargement du jeu de données

In [2]:
# Move the session into the project folder, then load the space-separated
# training table found there (first line of the file is the header).
# NOTE(review): this absolute, user-specific path has to be adapted before the
# script can run on another machine.
setwd(dir = "C:/Users/antoi/Desktop/UTC/GI05/SY19/Projet")
data <- read.csv(file = "a24_clas_app.txt", header = TRUE, sep = " ")

In [10]:
# Dataset dimensions: n rows, and p predictors (all columns but one, the
# remaining column being the response `y` used further down).
n <- dim(data)[1]
p <- dim(data)[2] - 1

print(paste("Nombre de données :", n))
print(paste("Nombre de prédicteurs :", p))

[1] "Nombre de données : 500"
[1] "Nombre de prédicteurs : 50"


In [20]:
# Install corrplot only when it is not available yet, so that re-running the
# script does not reach for the network (and possibly fail) on every execution,
# then attach it.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library("corrplot")

"unable to access index for repository https://cran.r-project.org/bin/windows/contrib/3.6:
  impossible d'ouvrir l'URL 'https://cran.r-project.org/bin/windows/contrib/3.6/PACKAGES'"installing the source package 'corrplot'

corrplot 0.95 loaded


In [18]:
# Numerical rank of the predictor matrix (first p columns of the table).
library(Matrix)
X <- as.matrix(data[, 1:p])
rankMatrix(X)

# Ten strongest pairwise correlations, in absolute value, among all columns.
library("corrplot")
cor_data <- cor(data)
# Long format: one row per (Var1, Var2) cell, the correlation sits in `Freq`.
cor_long <- as.data.frame(as.table(cor_data))
# Keep every unordered pair exactly once; the strict inequality on the level
# codes discards the diagonal as well.
cor_long <- cor_long[with(cor_long, as.integer(Var1) < as.integer(Var2)), ]
cor_long$abs_value <- abs(cor_long$Freq)
# Strongest correlations first.
cor_long <- cor_long[order(-cor_long[["abs_value"]]), ]
top_10_correlations <- head(cor_long, n = 10)
print(top_10_correlations)

# Train/test split: 4/5 of the rows go to training, the remaining rows to test.
# The seed is fixed so that the split -- and therefore every accuracy printed
# further down -- can be reproduced from one run to the next.
set.seed(42)
train <- sample(seq_len(n), round(4 * n / 5))
data.train <- data[train, ]
data.test <- data[-train, ]

# Quick look at the distribution of a few variables.

# Side-by-side boxplots of the first three predictors.
boxplot(
  data$X1, data$X2, data$X3,
  names = c("X1", "X2", "X3"),
  col = c("lightblue", "lightgreen", "lightcoral"),
  main = "Boxplot of X1, X2, and X3",
  ylab = "Values"
)

# Scatter plot of X2 against X1.
plot(data$X2 ~ data$X1, main = "Valeurs de X2 en fonction de X1")

# One bar per observation, bar height being the value of X1.
barplot(height = data$X1, main = "Diagramme en barre des valeurs de X1")

# Number of observations per value of the response y.
barplot(
  height = table(data$y),
  main = "Distribution des valeurs de y",
  xlab = "y",
  ylab = "Nombre",
  col = "lightblue",
  border = "black"
)

# KNN

#install.packages("e1071") 
#install.packages("caTools") 
#install.packages("class") 

library(e1071) 
library(caTools) 
library(class) 

# Standardise the predictors (columns 1 to 50). Centering and scaling constants
# are estimated on the training split and re-applied as-is to the test split.
data.train.x_scale <- scale(data.train[, 1:50])
data.test.x_scale <- scale(
  data.test[, 1:50],
  center = attr(data.train.x_scale, "scaled:center"),
  scale = attr(data.train.x_scale, "scaled:scale")
)

# Randomly assign every training row to one of K cross-validation folds and
# show the resulting fold sizes.
K <- 10
folds <- sample(seq_len(K), size = nrow(data.train), replace = TRUE)
table(folds)

# Mean accuracies, one slot per candidate number of neighbours (1 to 100).
accuracy.train <- numeric(100)
accuracy.val <- numeric(100)

# Grid search over the number of neighbours k, scored by K-fold cross-validation.
for (k in 1:100) {
  accuracy.val.fold <- numeric(K)
  accuracy.train.fold <- numeric(K)

  for (fold in 1:K) {
    # Rows of the current fold are held out; all other rows are used for fitting.
    held_out <- folds == fold
    x_fit <- data.train.x_scale[!held_out, ]
    y_fit <- data.train$y[!held_out]
    x_held_out <- data.train.x_scale[held_out, ]
    y_held_out <- data.train$y[held_out]

    # Validation accuracy: classify the held-out rows.
    pred_held_out <- knn(train = x_fit, test = x_held_out, cl = y_fit, k = k)
    accuracy.val.fold[fold] <- sum(pred_held_out == y_held_out) / length(y_held_out)

    # Training accuracy: classify the very rows used as neighbours.
    pred_fit <- knn(train = x_fit, test = x_fit, cl = y_fit, k = k)
    accuracy.train.fold[fold] <- sum(pred_fit == y_fit) / length(y_fit)
  }

  # Average over the folds for this value of k.
  accuracy.train[k] <- mean(accuracy.train.fold, na.rm = TRUE)
  accuracy.val[k] <- mean(accuracy.val.fold, na.rm = TRUE)
}

# Training (blue) and validation (green) accuracy against k. The axes are set
# up by the first call, i.e. from the training curve.
plot(
  1:100, accuracy.train,
  type = "l",
  col = "blue",
  main = "Accuracy sur les données d'entrainement \net de validation avec KNN",
  xlab = "Nombre de voisins (k)",
  ylab = "Accuracy"
)
lines(1:100, accuracy.val, col = "green")

# Best validation accuracy and the k that reaches it, marked by a dashed line.
max_index <- which.max(accuracy.val)
max_accuracy_val <- max(accuracy.val)
abline(v = max_index, col = "red", lty = 2)
legend(
  "topright",
  legend = c("Entraînement", "Validation"),
  col = c("blue", "green"),
  lty = 1
)
sprintf("Le nombre de voisins optimal est : %d", max_index)


# Final k-NN evaluation on the held-out test split.
# The model scored here must be the one that was tuned: the cross-validation
# ran on standardised predictors, so the test split is passed in its
# standardised form (scaled with the training constants) and k is the value
# selected by the search instead of a number copied by hand from an earlier run.
classifier_knn <- knn(
  train = data.train.x_scale,
  test = data.test.x_scale,
  cl = data.train$y,
  k = max_index
)
knn.accuracy.test <- mean(classifier_knn == data.test$y)
print(knn.accuracy.test)


# Plain (multinomial) logistic regression

library(nnet)
# Fit on the training split with every other column as predictor.
reg_log <- multinom(y ~ ., data = data.train)
# Predicted class for each test row, from its first 50 columns.
reg_log.pred <- predict(reg_log, newdata = data.test[, 1:50], type = "class")
# Predicted classes in rows, observed classes in columns.
reg_log.confusion_matrix <- table(
  Predicted = reg_log.pred,
  Actual = data.test$y
)
reg_log.confusion_matrix
# Share of test rows classified correctly (the author reported 0.62).
reg_log.accuracy.test <- sum(reg_log.pred == data.test$y) / nrow(data.test)
print(reg_log.accuracy.test)

# Quadratic discriminant analysis

library(MASS)
# The fitted model is stored under the name `qda`, the same name as the
# fitting function.
qda <- qda(y ~ ., data = data.train)
qda.pred <- predict(qda, newdata = data.test)[["class"]]
# Predicted classes in rows, observed classes in columns.
qda.confusion_matrix <- table(
  Predicted = qda.pred,
  Actual = data.test$y
)
qda.confusion_matrix
# Share of test rows classified correctly (the author reported 0.64).
qda.accuracy.test <- sum(qda.pred == data.test$y) / nrow(data.test)
print(qda.accuracy.test)

# Linear discriminant analysis

# The fitted model is stored under the name `lda`, the same name as the
# fitting function.
lda <- lda(y ~ ., data = data.train)
lda.pred <- predict(lda, newdata = data.test)[["class"]]
# Predicted classes in rows, observed classes in columns.
lda.confusion_matrix <- table(
  Predicted = lda.pred,
  Actual = data.test$y
)
lda.confusion_matrix
# Share of test rows classified correctly (the author reported 0.62).
lda.accuracy.test <- sum(lda.pred == data.test$y) / nrow(data.test)
print(lda.accuracy.test)

# Naive Bayes

# Fit on the training split, then predict a class for every test row.
naive_bayes <- naiveBayes(y ~ ., data = data.train)
naive_bayes.pred <- predict(naive_bayes, newdata = data.test)
# Predicted classes in rows, observed classes in columns.
naive_bayes.confusion_matrix <- table(
  Predicted = naive_bayes.pred,
  Actual = data.test$y
)
naive_bayes.confusion_matrix
# Share of test rows classified correctly (the author reported 0.7).
naive_bayes.accuracy.test <- sum(naive_bayes.pred == data.test$y) / nrow(data.test)
print(naive_bayes.accuracy.test)

# McNemar test

library(stats)
# Paired comparison of LDA and QDA on the test split. Each argument is a
# logical vector telling, row by row, whether that classifier predicted the
# right class; the test is run on these two paired correct/incorrect outcomes.
# The expressions are passed inline on purpose: their text is what appears in
# the "data:" line of the printed result.
mcnemar.test(lda.pred == data.test$y, qda.pred == data.test$y)

# SVM with a radial kernel

# Standardised predictors with the response attached as first column `y`.
data.train_scaled <- data.frame(y = data.train$y, data.train.x_scale)
data.test_scaled <- data.frame(y = data.test$y, data.test.x_scale)

# C-classification SVM; `cross = 10` additionally requests a 10-fold
# cross-validation on the training data.
svm_model <- svm(
  y ~ .,
  data = data.train_scaled,
  type = "C-classification",
  kernel = "radial",
  cross = 10
)
svm.pred <- predict(svm_model, newdata = data.test_scaled)
# Predicted classes in rows, observed classes in columns.
svm.confusion_matrix <- table(
  Predicted = svm.pred,
  Actual = data.test_scaled$y
)
print(svm.confusion_matrix)
# Share of test rows classified correctly (the author reported 0.63).
svm.accuracy.test <- sum(svm.pred == data.test_scaled$y) / nrow(data.test_scaled)
print(svm.accuracy.test)

# SVM with a linear kernel

# Standardised predictors with the response attached as FIRST column `y`, for
# both splits. Train and test frames must share the same layout: code further
# down separates features from the response by position (`[, -1]`), which only
# drops `y` when `y` really is the first column of both frames.
data.train_scaled <- data.frame(y = data.train$y, data.train.x_scale)
data.test_scaled <- data.frame(y = data.test$y, data.test.x_scale)

# C-classification SVM; `cross = 10` additionally requests a 10-fold
# cross-validation on the training data.
svm_model <- svm(y ~ ., data = data.train_scaled, type = "C-classification",
                 kernel = "linear", cross = 10)
svm.pred <- predict(svm_model, newdata = data.test_scaled)
# Predicted classes in rows, observed classes in columns.
svm.confusion_matrix <- table(Predicted = svm.pred, Actual = data.test_scaled$y)
print(svm.confusion_matrix)
# Share of test rows classified correctly.
svm.accuracy.test <- mean(svm.pred == data.test_scaled$y)
print(svm.accuracy.test)


# Classification tree
library(rpart)
library(rpart.plot)
# Tree grown on the training split, splits chosen with the Gini criterion.
tree_model <- rpart(
  y ~ .,
  data = data.train,
  method = "class",
  parms = list(split = "gini")
)
# Drawing of the fitted tree.
rpart.plot(
  tree_model,
  box.palette = "RdBu",
  shadow.col = "gray",
  fallen.leaves = FALSE
)
# Complexity-parameter plot of the fitted tree.
plotcp(tree_model)


# Random forest
library(randomForest)

# Features are selected BY NAME, never by position: the position of `y` in the
# scaled frames depends on how they were last built, and dropping "column 1"
# from a frame whose `y` is last would remove a predictor and feed the response
# to the forest as a feature. Using the same name vector for both splits also
# guarantees identical column order in train and test.
feature_cols <- setdiff(names(data.train_scaled), "y")
x.train <- data.train_scaled[, feature_cols]
x.test <- data.test_scaled[, feature_cols]

# Responses as factors (classification). The test factor reuses the training
# levels so both factors stay comparable even if a class is absent from the
# test split.
y.train <- as.factor(data.train_scaled$y)
y.test <- factor(data.test_scaled$y, levels = levels(y.train))

# 500 trees, sqrt(number of features) candidates per split, fully grown trees.
# Passing xtest/ytest makes the forest score the test split while it is built.
rf_model <- randomForest(x = x.train, y = y.train,
                         xtest = x.test, ytest = y.test,
                         ntree = 500,
                         mtry = floor(sqrt(ncol(x.train))),
                         nodesize = 1,
                         importance = TRUE,
                         keep.forest = TRUE)

print(rf_model)
# Predicted classes in rows, observed classes in columns.
rf.confusion_matrix <- table(Predicted = rf_model$test$predicted, Actual = y.test)
print(rf.confusion_matrix)

# Share of test rows classified correctly.
rf.accuracy.test <- mean(rf_model$test$predicted == y.test)
print(rf.accuracy.test)

# Variable importance

varImpPlot(rf_model)


# PCA
# Every column except `y` is standardised, then handed to prcomp(), which is
# itself asked to centre and scale its input.
data_standardized <- scale(data[, names(data) != "y"])
acp_result <- prcomp(data_standardized, center = TRUE, scale. = TRUE)
summary(acp_result)

# Biplot of the principal components.
biplot(acp_result, scale = 1)

# Variance carried by each of the first 20 components.
screeplot(
  acp_result,
  npcs = 20,
  type = "lines",
  main = "Scree Plot des 20 premiers axes"
)

# Cumulative share of explained variance, in percent.
explained_variance <- 100 * cumsum(acp_result$sdev^2 / sum(acp_result$sdev^2))
plot(
  seq_along(explained_variance), explained_variance,
  type = "l",
  col = "blue",
  main = "Variance expliquée cumulée par les composantes principales",
  xlab = "Nombre de composantes principales",
  ylab = "Variance expliquée cumulée (%)"
)

# Learning on the PCA scores: choose the number of principal components fed to
# a multinomial logistic regression by K-fold cross-validation.
n <- nrow(data)
p <- ncol(data) - 1

# New 80/20 split of the row indices.
train <- sample(seq_len(n), round(4 * n / 5))
test <- setdiff(seq_len(n), train)

# Scores on the first p components, with the response attached as last column.
# NOTE(review): the PCA was fitted on all rows, so the component directions
# have seen the rows used below for validation and test.
acp_results <- acp_result$x[, 1:p]
data_pca <- data.frame(acp_results, y = data$y)

train_data_pca <- data_pca[train, ]
test_data_pca <- data_pca[test, ]

# Randomly assign every training row to one of K folds and show fold sizes.
K <- 5
fold <- sample(seq_len(K), length(train), replace = TRUE)
table(fold)

# Cross-validated accuracy, one slot per number of components kept.
mean_accuracy <- rep(0, p)
y_col <- ncol(train_data_pca)

for (nb_axes in seq_len(p)) {
  accuracy <- rep(0, K)
  # First nb_axes components plus the response.
  kept_cols <- c(seq_len(nb_axes), y_col)
  for (k in seq_len(K)) {
    fit_rows <- train_data_pca[fold != k, kept_cols]
    val_rows <- train_data_pca[fold == k, kept_cols]
    glm_pca <- multinom(y ~ ., data = fit_rows)
    pred <- predict(glm_pca, newdata = val_rows, type = "class")
    # Score the fold against ITS OWN labels: the predictions are made on the
    # held-out fold, so they must be compared with that fold's responses, not
    # with the labels of an unrelated test split of a different length.
    accuracy[k] <- mean(pred == val_rows$y)
  }
  mean_accuracy[nb_axes] <- mean(accuracy)
}

# Number of components with the best cross-validated accuracy.
best_nb_axes <- which.max(mean_accuracy)
cat(sprintf("Le nombre optimal de composantes principales est : %i\n", best_nb_axes))


ERROR: Error in library(Matrix): there is no package called 'Matrix'
