<a href="https://colab.research.google.com/github/anassbouchfar/Bank_marketing_ML_DL/blob/main/Classification%26Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploratory Data Analysis (EDA)

## Importer les librairies

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
%matplotlib inline
import warnings

## Importer l'ensemble de données

In [None]:
# Load the dataset from "bank.csv" in the working directory and preview the first rows.
data=pd.read_csv("bank.csv")
data.head()

## Vérifier les valeurs nulles

In [None]:
# Number of missing values in each column.
data.isnull().sum()

Il n'y a pas de valeurs nulles dans l'ensemble de données

## Vérifier les valeurs dupliquées

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

Il n'y a pas de valeurs dupliquées dans l'ensemble de données

## Résumé statistique de l'ensemble de données

In [None]:
# Summary statistics of the numeric columns.
data.describe()

## Les types des variables de l'ensemble de données

In [None]:
# Column names, non-null counts and dtypes.
data.info()

## Vérifier la balance de l'ensemble des données

In [None]:
# Target balance: bar chart of class counts (left) and pie chart of class shares (right).
deposit_counts = data['deposit'].value_counts(sort=True)
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))
sns.countplot(x='deposit', data=data, ax=ax_bar)
ax_bar.set_title("Yes et No")
ax_pie.pie(deposit_counts, labels=deposit_counts.index, autopct='%1.1f%%',
           shadow=True, startangle=270)
ax_pie.set_title('Total de Yes et No', size=12)
plt.show()

Notre ensemble de données est équilibré.

## Analyse du type de données d'objet

In [None]:
# Count plot of each object-dtype column, split by the target.
# zip() stops at the shorter input, so at most 9 columns (one per panel) are drawn.
colonnes_obj = data.select_dtypes('O').columns
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 20))
fig.tight_layout(pad=5.0)
for panel, column in zip(axes.ravel(), colonnes_obj):
    sns.countplot(data=data, x=column, hue='deposit', ax=panel)
    panel.tick_params(axis='x', rotation=45)

plt.show()

## Analyse du type de données numérique


In [None]:
# KDE plot of each numeric column, split by the target.
# 'number' is used instead of 'int': 'int' resolves to the platform integer type,
# which is not guaranteed to match the int64 columns produced by read_csv.
colonnes_int = data.select_dtypes('number').columns
fig, axes = plt.subplots(4, 2, figsize=(20, 20))
fig.tight_layout(pad=5.0)
for col, ax in zip(colonnes_int, axes.flatten()):
    sns.kdeplot(x=col, data=data, hue='deposit', ax=ax, fill=True)
    ax.tick_params(axis='x', rotation=45)

# Hide the panels left over when there are fewer columns than axes.
for ax in axes.flatten()[len(colonnes_int):]:
    ax.set_visible(False)

plt.show()

## Corrélation entre les variables

In [None]:
# Correlation heatmap restricted to numeric columns: DataFrame.corr() raises on
# string columns in pandas >= 2.0 (earlier versions dropped them silently).
plt.figure(figsize=(15,6))
sns.heatmap(data.select_dtypes('number').corr(), annot=True)
plt.show()

Il existe une corrélation modérée entre "pdays" et "previous" ; il est donc nécessaire de supprimer l'une de ces deux colonnes. Pour décider quelle colonne doit être supprimée, nous comparons leurs scores d'information mutuelle.



In [None]:
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y, discrete_features, random_state=None):
    """Compute the mutual information between each column of X and the target y.

    Parameters
    ----------
    X : DataFrame of features; its column names become the index of the result.
    y : target labels, passed unchanged to mutual_info_classif.
    discrete_features : forwarded to mutual_info_classif (e.g. 'auto').
    random_state : seed forwarded to mutual_info_classif so that scores are
        reproducible between runs. The default (None) keeps the unseeded behaviour.

    Returns
    -------
    Series named "MI Scores", sorted from the highest to the lowest score.
    """
    mi_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Score the numeric columns only. select_dtypes('number') replaces the
# `data.dtypes==int` mask, whose meaning depends on the platform integer type.
mi_scores = make_mi_scores(data.select_dtypes('number'), data['deposit'], 'auto', random_state=0)
mi_scores[::3]  # display every third score of the sorted series

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores on the current matplotlib axes.

    `scores` is a Series; bars are ordered by ascending score and labelled
    with the Series index.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Open a square figure and plot the mutual information scores computed above.
plt.figure(dpi=100, figsize=(8, 8))
plot_mi_scores(mi_scores)


Donc on supprime la colonne "previous".



In [None]:
# Drop the "previous" column. errors='ignore' keeps this cell re-runnable:
# an in-place drop raises KeyError the second time the cell is executed.
data = data.drop(columns='previous', errors='ignore')

## Vérification des valeurs aberrantes

In [None]:
# Box plots drawn from the whole frame, with the x tick labels rotated for readability.
sns.catplot(data=data,kind='box')
plt.xticks(rotation=90)
plt.show()

In [None]:
# The "balance" column contains outliers; plot its distribution.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
plt.figure(figsize=(7,5))
sns.histplot(data['balance'], kde=True, stat='density')
plt.show()

Les données supérieures à 20 000 peuvent être supprimées pour éviter le risque de valeurs aberrantes.

In [None]:
# Remove the rows whose balance exceeds 20,000, then re-plot the distribution.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
outlier_index = data.index[data['balance'] > 20000]
data = data.drop(index=outlier_index)
plt.figure(figsize=(5,3))
sns.histplot(data['balance'], kde=True, stat='density')
plt.show()

Maintenant, la colonne "balance" semble suivre une distribution à peu près normale.

# Encodage de l'ensemble des données & Data Preprocessing

## Encodage de la colonne "deposit"

In [None]:
# Encode the target as 0/1. Values absent from the mapping become NaN, so this
# cell must run exactly once on the raw 'no'/'yes' labels.
deposit_codes = {'no': 0, 'yes': 1}
data['deposit'] = data['deposit'].map(deposit_codes)

## Encodage de la colonne "month"

In [None]:
# Encode month abbreviations as their calendar number (jan=1 ... dec=12).
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
dict_mois = {name: number for number, name in enumerate(month_names, start=1)}
data['month'] = data['month'].map(dict_mois)

## Encodage des colonnes "default","housing" et "loan"

In [None]:
# Encode the yes/no columns "default", "housing" and "loan" as 1/0.
# The result is assigned back to the frame: Series.replace(..., inplace=True) on
# `data.<column>` is chained in-place mutation, which emits a FutureWarning in
# pandas 2.x and does not update `data` under copy-on-write.
# As in the "deposit" and "month" cells, run this once on the raw labels:
# values absent from the mapping become NaN.
yes_no_codes = {'no': 0, 'yes': 1}
for column in ['default', 'housing', 'loan']:
    data[column] = data[column].map(yes_no_codes)

## Encodage de la colonne "marital"

In [None]:
# One-hot encode "marital"; the "divorced" indicator is dropped as the reference level.
marital_dummies = (
    pd.get_dummies(data['marital'], prefix='marital')
    .drop(columns='marital_divorced')
)
data = pd.concat([data, marital_dummies], axis=1)

## Encodage de la colonne "job"

In [None]:
# One-hot encode "job"; the "unknown" indicator is dropped as the reference level.
job_dummies = (
    pd.get_dummies(data['job'], prefix='job')
    .drop(columns='job_unknown')
)
data = pd.concat([data, job_dummies], axis=1)

## Encodage de la colonne "education"

In [None]:
# One-hot encode "education"; the "unknown" indicator is dropped as the reference level.
education_dummies = (
    pd.get_dummies(data['education'], prefix='education')
    .drop(columns='education_unknown')
)
data = pd.concat([data, education_dummies], axis=1)

## Encodage de la colonne "contact"

In [None]:
# One-hot encode "contact"; the "unknown" indicator is dropped as the reference level.
contact_dummies = (
    pd.get_dummies(data['contact'], prefix='contact')
    .drop(columns='contact_unknown')
)
data = pd.concat([data, contact_dummies], axis=1)

## Encodage de la colonne "poutcome"

In [None]:
# One-hot encode "poutcome"; the "unknown" indicator is dropped as the reference level.
poutcome_dummies = (
    pd.get_dummies(data['poutcome'], prefix='poutcome')
    .drop(columns='poutcome_unknown')
)
data = pd.concat([data, poutcome_dummies], axis=1)

## Encodage de la colonne "pdays"

Si la valeur de 'pdays' est 999 alors le client n'a pas été contacté auparavant ; dans notre ensemble de données, ce cas est représenté par la valeur '-1'. Si c'est le cas, nous l'associerons à une valeur de 0 ; sinon, nous l'associerons à une valeur de 1.

In [None]:
# Turn "pdays" into a binary flag: 0 where the value is -1, 1 for any other value.
data['pdays'] = (data['pdays'] != -1).astype('int64')

## Supprimer les colonnes "marital","job","education","contact" et "poutcome"

In [None]:
# Remove the original categorical columns now that their dummy columns exist.
encoded_columns = ['job', 'education', 'marital', 'contact', 'poutcome']
data = data.drop(columns=encoded_columns)

## Vérifier le type des données aprés l'encodage

In [None]:
# dtype of every column after encoding.
data.dtypes

In [None]:
# Preview the encoded frame; head() avoids dumping the whole table into the output.
data.head()

On remarque que toutes les données sont bien encodées.

## Enregistrement de notre dataset après l'encodage

In [None]:
# Write the encoded frame to "datacode.csv" without the index column.
data.to_csv('datacode.csv', index=False)

# Classification  supervisée (Classification )

### Train and Test dataset

In [None]:
# Y is the encoded target; X holds every other column.
Y = data['deposit']
X = data.drop(columns=['deposit'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 32)

In [None]:
# (rows, features) of the training features.
X_train.shape

In [None]:
# (rows, features) of the test features.
X_test.shape

In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Number of test labels.
y_test.shape

### Features Scaling

Nous allons standardiser les variables indépendantes de l'ensemble de données (moyenne nulle et variance unitaire) afin qu'elles soient exprimées sur une échelle comparable.

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same transformation
# to both splits.
ss = StandardScaler().fit(X_train)
X_train_s = ss.transform(X_train)
X_test_s = ss.transform(X_test)

## Model Training 

### Importer les librairies

In [None]:
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.neural_network import MLPClassifier #Multilayer perceptron
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn import svm
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:  # plot_confusion_matrix was removed in scikit-learn 1.2
    pass

# Classification Supervisée

### KNN

In [None]:
# Train and evaluate KNN for k from 2 to 19 (range(2, 20)), printing the test
# and train accuracy for each k.
for i in range(2, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_s, y_train)
    # Predict once: the original cell predicted on the test set twice and never
    # used the second result.
    Y_pred = knn.predict(X_test_s)
    knn_test_accuracy = metrics.accuracy_score(y_test, Y_pred)
    knn_train_accuracy = knn.score(X_train_s, y_train)

    # The f-strings are already fully formatted; the trailing .format() was a no-op.
    print(f"Pour K={i} le score de précision des tests est {knn_test_accuracy*100:.2f}%")
    print(f"Pour K={i} le score de précision du train est {knn_train_accuracy*100:.2f}%")

    print()

Le K qui a plus d'accuracy est : 5

#### Evaluation de performance de KNN5

In [None]:
# Fit KNN with k=5 and print the classification report for both splits.
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train_s, y_train)
knn5_train_pred, knn5_test_pred = (knn5.predict(split) for split in (X_train_s, X_test_s))
for header, y_true, y_hat in (
    ("Le rapport de l'ensemble d'apprentissage: \n", y_train, knn5_train_pred),
    ("Le rapport de l'ensemble de test: \n", y_test, knn5_test_pred),
):
    print(header, classification_report(y_true, y_hat))

#### Overfitting and underfitting
Le score d'accuracy de l'ensemble d'apprentissage est de 77,44 %, tandis que l'accuracy de l'ensemble de test est de 75,82 %. Ces deux valeurs sont tout à fait comparables. Il n'est donc pas question de overfitting.



In [None]:
# Same reports as dictionaries, so that individual metrics can be looked up later.
report_knn5_train = classification_report(y_train, knn5_train_pred, output_dict=True)
report_knn5_test = classification_report(y_test, knn5_test_pred, output_dict=True)

### Arbre de décision 

In [None]:
# Fit an unconstrained decision tree and compare its first 10 test predictions
# with the true labels.
dtc = DecisionTreeClassifier(random_state=0).fit(X_train_s, y_train)
dtc_train_pred = dtc.predict(X_train_s)
dtc_test_pred = dtc.predict(X_test_s)
first_true_labels = y_test[:10].to_numpy()
print("Valeur prédictée: ", dtc_test_pred[:10])
print("Valeur actuelle:  ", first_true_labels)

#### Evaluation de performance de AD





In [None]:
# Classification reports of the unconstrained tree on both splits.
for header, y_true, y_hat in (
    ("Le rapport de l'ensemble d'apprentissage: \n", y_train, dtc_train_pred),
    ("Le rapport de lensemble de test : \n", y_test, dtc_test_pred),
):
    print(header, classification_report(y_true, y_hat))

Il semble bien qu'il y ait un sur-apprentissage (overfitting). Pour résoudre ce problème, il existe 3 méthodes : le Pre-Pruning, le Post-Pruning ou l'utilisation de l'algorithme Random Forest. Nous allons travailler seulement avec les 2 premières méthodes.

#### Pre-Pruning

La technique de pré-élagage fait référence à l'arrêt précoce de la croissance de l'arbre de décision. La technique de pré-élagage consiste à régler les hyperparamètres du modèle d'arbre décisionnel avant le pipeline de formation. Les hyperparamètres de l'arbre de décision, notamment **max_depth, min_samples_leaf, min_samples_split**, peuvent être réglés afin d'arrêter précocement la croissance de l'arbre et d'éviter que le modèle ne soit surajusté.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Pre-pruning: search the tree-growth limits with 3-fold cross-validation,
# selecting on ROC AUC.
param_grid = {
    "max_depth": [3,5,10,15,20,None],
    "min_samples_split": [2,5,7,10],
    "min_samples_leaf": [1,2,5]
}

clf = DecisionTreeClassifier(random_state=42)
grid_cv = GridSearchCV(clf, param_grid, scoring="roc_auc", n_jobs=-1, cv=3).fit(X_train_s, y_train)

print("Param for GS", grid_cv.best_params_)
print("CV score for GS", grid_cv.best_score_)
# ROC AUC is computed from the predicted probability of class 1, not from hard
# 0/1 predictions, so that it is comparable with the CV score above.
train_proba = grid_cv.predict_proba(X_train_s)[:, 1]
test_proba = grid_cv.predict_proba(X_test_s)[:, 1]
print("Train AUC ROC Score for GS: ", roc_auc_score(y_train, train_proba))
print("Test AUC ROC Score for GS: ", roc_auc_score(y_test, test_proba))

Le score AUC-ROC de l'ensemble d'apprentissage est de **87,37%** tandis que celui de l'ensemble de test est de **82,37%**. Ces deux valeurs sont tout à fait comparables. Il n'y a donc pas de problème d'overfitting.

#### Post-Pruning
La technique de post-élagage permet au modèle d'arbre de décision de croître jusqu'à sa pleine profondeur, puis supprime les branches de l'arbre pour empêcher le modèle de sur-ajuster. **L'élagage de la complexité des coûts (ccp)** est un type de technique post-élagage. En cas d'élagage de la complexité des coûts, le ccp_alpha peut être ajusté pour obtenir le meilleur modèle d'ajustement.

Le package Scikit-learn est fourni avec l'implémentation pour calculer les valeurs **ccp_alpha** de l'arbre de décision à l'aide de la fonction **cost_complexity_pruning_path**(). Avec l'augmentation des valeurs **ccp_alpha**, davantage de nœuds de l'arbre sont élagués.

In [None]:
# Allow a decision tree to grow to its full depth
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_s, y_train)

# Compute the candidate ccp_alpha values from the pruning path of the full tree
path = clf.cost_complexity_pruning_path(X_train_s, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Train one decision tree per ccp_alpha value
clfs = [
    DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha).fit(X_train_s, y_train)
    for ccp_alpha in ccp_alphas
]

# Leave out the tree trained with the largest alpha before plotting
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# ROC AUC is computed from the predicted probability of class 1 rather than
# from hard 0/1 predictions.
train_scores = [roc_auc_score(y_train, tree.predict_proba(X_train_s)[:, 1]) for tree in clfs]
test_scores = [roc_auc_score(y_test, tree.predict_proba(X_test_s)[:, 1]) for tree in clfs]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("AUC-ROC")  # the plotted metric is ROC AUC, not accuracy
ax.set_title("AUC-ROC score vs alpha")
ax.plot(ccp_alphas, train_scores, marker='o', label="train")
ax.plot(ccp_alphas, test_scores, marker='o', label="test")
ax.legend()
plt.show()

On remarque que la **meilleure** accuracy entre training et testing c'est **85%**.

Alors avec ces 2 méthodes on a résolu le problème d'overfitting.

In [None]:
# Fit the post-pruned tree (ccp_alpha=0.001) and print its reports on both splits.
clf2 = DecisionTreeClassifier(random_state=42, ccp_alpha=0.001).fit(X_train_s, y_train)
clf2_train_pred, clf2_test_pred = (clf2.predict(split) for split in (X_train_s, X_test_s))
for header, y_true, y_hat in (
    ("Le rapport de l'ensemble d'apprentissage: \n", y_train, clf2_train_pred),
    ("Le rapport de l'ensemble de test: \n", y_test, clf2_test_pred),
):
    print(header, classification_report(y_true, y_hat))

In [None]:
# Dictionary form of the pruned-tree reports, for later metric look-ups.
report_clf2_train = classification_report(y_train, clf2_train_pred, output_dict=True)
report_clf2_test = classification_report(y_test, clf2_test_pred, output_dict=True)

### Naive Bayes

In [None]:
# Fit Gaussian Naive Bayes and compare its first 10 test predictions with the true labels.
nb = GaussianNB().fit(X_train_s, y_train)
nb_train_pred = nb.predict(X_train_s)
nb_test_pred = nb.predict(X_test_s)
first_true_labels = y_test[:10].to_numpy()
print("La valeur prédictée: ", nb_test_pred[:10])
print("Valeur actuelle    : ", first_true_labels)

#### Evaluation de performance de Naive Bayes

In [None]:
# Classification reports of Naive Bayes on both splits.
for header, y_true, y_hat in (
    ("Le rapport de l'ensemble d'apprentissage: \n", y_train, nb_train_pred),
    ("Le rapport de l'ensemble de test: \n", y_test, nb_test_pred),
):
    print(header, classification_report(y_true, y_hat))

Pas de overfitting.

In [None]:
# Dictionary form of the Naive Bayes reports, for later metric look-ups.
report_nb_train = classification_report(y_train, nb_train_pred, output_dict=True)
report_nb_test = classification_report(y_test, nb_test_pred, output_dict=True)

### SVM

In [None]:
from sklearn import svm

In [None]:
# Fit an RBF-kernel SVM and compare its first 10 test predictions with the true labels.
svc = svm.SVC(kernel='rbf', C=70, gamma=0.001)
svc.fit(X_train_s, y_train)
predictionsvm_train, predictionsvm_test = (svc.predict(split) for split in (X_train_s, X_test_s))
first_true_labels = y_test[:10].to_numpy()
print("La valeur prédictée: ", predictionsvm_test[:10])
print("Valeur actuelle    : ", first_true_labels)

#### Evaluation de performance de SVM

In [None]:
# Classification reports of the SVM on both splits.
for header, y_true, y_hat in (
    ("Le rapport de l'ensemble d'apprentissage: \n", y_train, predictionsvm_train),
    ("Le rapport de l'ensemble de test: \n", y_test, predictionsvm_test),
):
    print(header, classification_report(y_true, y_hat))

In [None]:
# Dictionary form of the SVM reports, for later metric look-ups.
report_svm_train = classification_report(y_train, predictionsvm_train, output_dict=True)
report_svm_test = classification_report(y_test, predictionsvm_test, output_dict=True)

### Comparaison entre les modèles de classification

In [None]:
# Test-set confusion matrix of each fitted classifier, one panel per model.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# removed in scikit-learn 1.2.
models = [knn5, clf2, nb, svc]
models_name = ["KNN5", "DTC", "NB", "SVM"]
fig, axes = plt.subplots(1, 4, figsize=(17, 5))
fig.tight_layout(pad=5.0)
for model, ax, model_name in zip(models, axes.flatten(), models_name):
    plot = ConfusionMatrixDisplay.from_estimator(
        model, X_test_s, y_test, display_labels=["No", "Yes"], ax=ax
    )
    ax.tick_params(axis='x', rotation=45)
    plot.ax_.set_title("Matrice de confusion de " + model_name)
plt.show()

In [None]:
# Collect accuracy and macro-averaged precision / recall / F1 of every model on
# both splits into one table indexed by model name.
# Fix: 'recall_test' used to be filled from the 'precision' entry of the test
# reports; building every column from the same loop removes that copy-paste error.
model_reports = {
    'KNN5': {'train': report_knn5_train, 'test': report_knn5_test},
    'DTC': {'train': report_clf2_train, 'test': report_clf2_test},
    'NB': {'train': report_nb_train, 'test': report_nb_test},
    'SVM': {'train': report_svm_train, 'test': report_svm_test},
}

scores_data = {'model_name': list(model_reports)}
for split in ('train', 'test'):
    scores_data[f'accuracy_{split}'] = [
        reports[split]['accuracy'] for reports in model_reports.values()
    ]
# (column prefix, key inside the classification report)
for column, report_key in (('precision', 'precision'), ('recall', 'recall'), ('f1', 'f1-score')):
    for split in ('train', 'test'):
        scores_data[f'{column}_{split}'] = [
            reports[split]['macro avg'][report_key] for reports in model_reports.values()
        ]

scores = pd.DataFrame(scores_data)
scores.set_index('model_name', inplace=True)

In [None]:
# Render every metric column of the score table as a percentage with two decimals.
as_percent = '{:,.2%}'.format
metric_columns = [
    f'{metric}_{split}'
    for metric in ('accuracy', 'precision', 'recall', 'f1')
    for split in ('train', 'test')
]
output = scores.style.format({column: as_percent for column in metric_columns})
output

In [None]:
# Left: precision / recall / F1 per model as lines. Right: accuracy per model as bars.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

line_columns = ["precision_train", "precision_test", "recall_train",
                "recall_test", "f1_train", "f1_test"]
scores[line_columns].plot(kind="line", figsize=(14, 5), ax=ax1)

bar_columns = ["accuracy_train", "accuracy_test"]
scores[bar_columns].plot(kind="bar", figsize=(14, 5), ax=ax2)

plt.show()

Le modèle le plus performant est SVM.

# Classification non supervisée (Clustering )

### K-means

#### without PCA

In [None]:
from sklearn.cluster import KMeans

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature of X with a MinMaxScaler fitted on X itself.
ms = MinMaxScaler().fit(X)
X_scaled = ms.transform(X)

In [None]:
# Elbow method: k-means inertia for k = 1..10.
# The models are fitted on X_scaled, the matrix prepared for this section and
# used by the following cells, instead of the classification split X_train_s.
cs = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(X_scaled)
    cs.append(kmeans.inertia_)
plt.plot(range(1, 11), cs,marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()

In [None]:
# Inertia values collected by the elbow loop, one per k.
cs

In [None]:
# Share of samples whose k-means cluster id equals the target label, for k = 1..10.
# NOTE(review): cluster ids are arbitrary, so this raw agreement is only loosely
# interpretable, and only for k = 2.
accuracies = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(X_scaled)
    # Vectorised count instead of Python's built-in sum over a Series.
    correct_labels = (Y == kmeans.labels_).sum()
    accuracies.append(correct_labels/float(Y.size))
plt.plot(range(1, 11), accuracies,marker='o')
# This plot shows label agreement, not inertia, so it is not an elbow plot.
plt.title('Accuracy vs number of clusters')
plt.xlabel('Number of clusters')
plt.ylabel('accuracy')
plt.savefig("p.png")
plt.show()

In [None]:
# Agreement ratios collected by the loop above, one per k.
accuracies

#####**k=2**

In [None]:
# Final 2-cluster model. It is fitted on X_scaled like the models of the previous
# cells; on the unscaled X the distances are dominated by the columns with the
# largest numeric range.
kmeans = KMeans(n_clusters = 2, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
kmeans.fit(X_scaled)

In [None]:
# Agreement between the two clusters and the binary target.
# Cluster ids 0/1 are arbitrary, so both possible cluster-to-class mappings are
# scored and the better one is kept.
matches = int((Y == kmeans.labels_).sum())
correct_labels = max(matches, Y.size - matches)
print("Result: %d out of %d samples were correctly labeled." % (correct_labels, Y.size))
print('Accuracy score: {0:0.2f}'. format(correct_labels/float(Y.size)))

##### with PCA


In [None]:
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA

In [None]:
# Standardize X, scale each row with normalize(), then project onto the first
# two principal components (columns P1 and P2).
scaler = StandardScaler()
pca = PCA(n_components=2)

scaled_df = scaler.fit_transform(X)
normalized_df = pd.DataFrame(normalize(scaled_df))

X_principal = pd.DataFrame(pca.fit_transform(normalized_df), columns=['P1', 'P2'])

X_principal.head(2)


#### Elbow method

In [None]:
# Elbow method on the PCA projection: inertia (sum of squared distances of the
# samples to their closest cluster centre) for k = 1..9.
# n_init and random_state are fixed, as in the earlier k-means cells, so the
# curve is reproducible between runs.
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000, n_init=10, random_state=0).fit(X_principal)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()),marker='o')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.savefig("p2.png")
plt.show()

#### Silhouette Coefficient Method

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score of k-means on the PCA projection for k = 2..9.
# The bar positions come from the same range as the scores (no second hard-coded
# list), and the models are seeded so the chart is reproducible.
cluster_counts = range(2, 10)
silhouette_scores = [
    silhouette_score(
        X_principal,
        KMeans(n_clusters=n_cluster, n_init=10, random_state=0).fit_predict(X_principal),
    )
    for n_cluster in cluster_counts
]

# Bar chart comparing the scores
k = list(cluster_counts)
plt.bar(k, silhouette_scores)
plt.xlabel('Number of clusters', fontsize = 10)
plt.ylabel('Silhouette Score', fontsize = 10)
plt.savefig("p3.png")
plt.show()

#### k=3

In [None]:
# 3-cluster model on the PCA projection, seeded for reproducibility.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(X_principal)

In [None]:
# Visualize the clustering: colour each point by the label assigned by the
# 3-cluster model fitted in the previous cell, instead of fitting a second,
# unseeded KMeans whose labels may differ.
plt.scatter(X_principal['P1'], X_principal['P2'],
            c=kmeans.labels_, cmap=plt.cm.winter)
plt.savefig("p4.png")
plt.show()

### Hierarchical Agglomerative Clustering (HAC)


In [None]:
from sklearn.preprocessing import StandardScaler
# Recompute the standardized train/test matrices used by the clustering models below.
ss = StandardScaler().fit(X_train)
X_train_s = ss.transform(X_train)
X_test_s = ss.transform(X_test)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering into 2 clusters.
ac = AgglomerativeClustering(n_clusters=2, linkage='ward', compute_full_tree=False)
model_ac = ac
# fit_predict() already fits the model, so the separate fit() call was redundant.
train_predicted = model_ac.fit_predict(X_train_s)
# fit_predict() re-fits on the test rows: the test set is clustered
# independently of the training set.
test_predicted = model_ac.fit_predict(X_test_s)
print(train_predicted.shape, y_train.shape)
print(test_predicted.shape, y_test.shape)

#### Accuracy

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Compare the cluster ids with the true labels on both splits.
train_accuracy, test_accuracy = (
    accuracy_score(y_true, y_hat)
    for y_true, y_hat in ((y_train, train_predicted), (y_test, test_predicted))
)

for header, y_true, y_hat in (
    ("Le rapport de l'ensemble d'apprentissage: \n", y_train, train_predicted),
    ("Le rapport de l'ensemble de test: \n", y_test, test_predicted),
):
    print(header, classification_report(y_true, y_hat))


### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

dbs = DBSCAN(eps=3, min_samples=500)
model_dbs = dbs

# fit_predict() already fits the model, so the separate fit() call was redundant.
# Labels are shifted by +1 with array arithmetic: noise (-1) becomes 0 and
# cluster 0 becomes 1.
predicted_train = model_dbs.fit_predict(X_train_s) + 1
# fit_predict() re-fits on the test rows: the test set is clustered
# independently of the training set.
predicted_test = model_dbs.fit_predict(X_test_s) + 1

#### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Compare the shifted DBSCAN labels with the true labels on both splits.
train_accuracy, test_accuracy = (
    accuracy_score(y_true, y_hat)
    for y_true, y_hat in ((y_train, predicted_train), (y_test, predicted_test))
)

for header, y_true, y_hat in (
    ("Le rapport de l'ensemble d'apprentissage: \n", y_train, predicted_train),
    ("Le rapport de l'ensemble de test: \n", y_test, predicted_test),
):
    print(header, classification_report(y_true, y_hat))


# Deep Learning

### MLP

In [None]:
# Multilayer perceptron with default architecture. random_state fixes the
# weight initialisation and shuffling so the scores are reproducible.
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train_s, y_train)
mlpprediction = mlp.predict(X_test_s)
print("Valeur prédictée :", mlpprediction[:10])
# Print a plain array, as the other model cells do, rather than a Series with its index.
print("Valeur actuelle :", y_test[:10].to_numpy())

#### Evaluation de performance de MLP

In [None]:
# Accuracy of the MLP on the test and training splits.
mlp_train_accuracy = mlp.score(X_train_s, y_train)
mlp_test_accuracy = metrics.accuracy_score(y_test, mlpprediction)
print(f"Le score d'accuracy des tests est {mlp_test_accuracy*100:.2f}%")
print(f"Le score d'accuracy du train est {mlp_train_accuracy*100:.2f}%")

### CNN

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Conv1D, MaxPooling1D, Flatten
from keras.optimizers import Adam

In [None]:
# Build the CNN inputs from the standardized features.
# - The division by 255 was removed: it is a rescaling for 8-bit pixel values
#   and only shrinks features that are already standardized.
# - A trailing channel axis is added because Conv1D expects inputs shaped
#   (samples, steps, channels).
# NOTE: this rebinds X_train / X_test to arrays; earlier cells that expect the
# original DataFrames must not be re-run after this one.
X_train = np.expand_dims(X_train_s, axis=-1)
X_test = np.expand_dims(X_test_s, axis=-1)

In [None]:
# Create the model
model = Sequential()
# 1D convolution with 64 filters and a kernel size of 3. The input length is
# read from the data (one step per feature, one channel) instead of being
# hard-coded.
n_features = X_train.shape[1]
model.add(Conv1D(filters=64, kernel_size=3, input_shape=(n_features, 1)))
# ReLU activation
model.add(Activation('relu'))
# Max pooling with a pool size of 2
model.add(MaxPooling1D(pool_size=2))
# Flatten the pooled feature maps
model.add(Flatten())
# Dense layer with 64 units and ReLU activation
model.add(Dense(64, activation='relu'))
# Dropout with a rate of 0.5
model.add(Dropout(0.5))
# Output layer: 2 units with softmax, one per class
model.add(Dense(2, activation='softmax'))

In [None]:
# Sparse categorical cross-entropy loss, Adam optimizer with its default
# settings, accuracy tracked as a metric.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
# Shape of the array fed to the network.
X_train.shape

In [None]:
# Train for 10 epochs with batches of 32.
# NOTE(review): the test split is passed as validation data, so the validation
# metrics reported during training are computed on the same rows as the final test score.
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

#### Evaluation de performance de CNN

In [None]:
# Evaluate the model on the test data.
# The value at index 1 of the result is printed as the test accuracy; accuracy
# is the only metric passed to compile().
score = model.evaluate(X_test, y_test, verbose=0)
print("Test accuracy:", score[1])
