# Projet 4 : Anticipez les besoins en consommation électrique de bâtiments

# Analyse exploratoire

## Import des modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Récupération des données

In [None]:
# Load the 2015 and 2016 benchmarking exports (paths are relative to the
# notebook's working directory).
data_2015 = pd.read_csv("data/2015-building-energy-benchmarking.csv")
data_2016 = pd.read_csv("data/2016-building-energy-benchmarking.csv")

## **Exploration initiale des données**

In [None]:
def shape(dataframe):
    """Print the dimensions, the global filling rate and the missing-value count.

    Args:
        dataframe: the pandas DataFrame to summarise.

    Returns:
        None. The summary lines are written to standard output.
    """
    import math

    # Number of rows and columns in the dataframe
    nb_rows, nb_columns = dataframe.shape
    print("DataFrame has {} rows and {} columns.".format(nb_rows, nb_columns))

    nb_cells = nb_rows * nb_columns

    # An empty dataframe has no cell to measure: avoid a division by zero
    if nb_cells == 0:
        print("The DataFrame is empty: the filling rate is undefined.")
        print("There are 0 missing values out of 0.")
        return

    # Number of non-NaN cells (count() gives the non-null count per column)
    nb_data = int(dataframe.count().sum())

    # Filling rate, rounded down to 2 decimal places once shown as a percentage
    filling_rate = math.floor(nb_data * 10000 / nb_cells) / 10000
    print("The global filling rate of the DataFrame is : {:.2%}".format(filling_rate))

    # Missing values
    nb_missing = nb_cells - nb_data
    print("There are {} missing values out of {}.".format(nb_missing, nb_cells))

In [None]:
# Summarise both raw datasets (dimensions, filling rate, missing values)
shape(data_2015)
shape(data_2016)

In [None]:
# Preview the first rows of the 2015 dataset
data_2015.head()

In [None]:
# Preview the first rows of the 2016 dataset
data_2016.head()

Nous constatons que la plupart des lignes (chacune correspondant à un bâtiment particulier) se retrouvent à la fois dans les données de 2015 et dans celles de 2016.

L'agrégation des données de 2015 et de 2016 nécessitera donc de gérer les données dupliquées.

In [None]:
# Compare the column names of the two yearly datasets: shared columns first,
# then the columns specific to each year.
cols_2015 = set(data_2015)
cols_2016 = set(data_2016)
for header, names in (
    ("\nNames of the {} common columns :\n", cols_2015 & cols_2016),
    ("\nNames of the {} columns only in 2015 dataset:\n", cols_2015 - cols_2016),
    ("\nNames of the {} columns only in 2016 dataset:\n", cols_2016 - cols_2015),
):
    print(header.format(len(names)), names)

Nous constatons que certaines colonnes spécifiques ont en fait la même sémantique. Par exemple :
* la colonne 'Comment' de 2015 correspond à la colonne 'Comments' de 2016
* la colonne 'Zip Codes' de 2015 correspond à la colonne 'ZipCode' de 2016
* la colonne 'GHGEmissionsIntensity(kgCO2e/ft2)' de 2015 correspond à la colonne 'GHGEmissionsIntensity' de 2016
* la colonne 'GHGEmissions(MetricTonsCO2e)' de 2015 correspond à la colonne 'TotalGHGEmissions' de 2016

Nous constatons aussi que certaines colonnes spécifiques ont été reformatées. Ainsi, la colonne 'Location' de 2015 a été séparée en plusieurs colonnes en 2016 :
* 'State'
* 'City'
* 'Address'
* 'Longitude'
* 'Latitude'

## **Nettoyage**

### Suppression des données non-explicatives
Nous supprimons les variables (colonnes) qui n'apportent aucun élément explicatif :

* Dans les deux tables :
    * 'DataYear' : année de collecte des données
    * 'PropertyName' : Official or common property name.
    * 'TaxParcelIdentificationNumber' : Property King County PIN
    * 'CouncilDistrictCode' : Property City of Seattle council district.
    * 'DefaultData'
    * 'ComplianceStatus' : ???


* 2015 seulement :
    * 'Comment'
    * '2010 Census Tracts' : ???
    * 'Seattle Police Department Micro Community Policing Plan Areas' :  ???
    * 'City Council Districts' : ???
    * 'SPD Beats' : ???
    * 'Zip Codes'
    * 'Location'


* 2016 seulement :
    * 'ZipCode'
    * 'Longitude'
    * 'State'
    * 'Latitude'
    * 'Comments'
    * 'Address'
    * 'City'

In [None]:
# Columns that carry no explanatory value (identifiers, administrative
# codes, comments, location details).
features_to_drop = [
    'DataYear',
    'PropertyName',
    'TaxParcelIdentificationNumber',
    'CouncilDistrictCode',
    'DefaultData',
    'Comment',
    'ComplianceStatus',
    '2010 Census Tracts',
    'Seattle Police Department Micro Community Policing Plan Areas',
    'City Council Districts',
    'SPD Beats',
    'Zip Codes',
    'Location',
    'ZipCode',
    'Longitude',
    'State',
    'Latitude',
    'Comments',
    'Address',
    'City',
]

# Drop the listed columns from both dataframes. Some names only exist in one
# of the two datasets, hence errors='ignore' (absent columns are skipped).
data_2015 = data_2015.drop(columns=features_to_drop, errors='ignore')
data_2016 = data_2016.drop(columns=features_to_drop, errors='ignore')

### Suppression des données issues des relevés de consommation annuels
Nous supprimons aussi les variables (colonnes) issues des relevés de consommation annuels (hormis ceux qui seront utilisés comme étiquette pour la régression) :

* Dans les deux tables :
    * 'SiteEUI(kBtu/sf)' : Site Energy Use Intensity (EUI) is a property's Site Energy Use divided by its gross floor area.
    * 'SiteEUIWN(kBtu/sf)' : Weather Normalized (WN) Site Energy Use Intensity (EUI) is a property's WN Site Energy divided by its gross floor area (in square feet).
    * 'SourceEUI(kBtu/sf)' : Source Energy Use Intensity (EUI) is a property's Source Energy Use divided by its gross floor area.
    * 'SourceEUIWN(kBtu/sf)' : Weather Normalized (WN) Source Energy Use Intensity (EUI) is a property's WN Source Energy divided by its gross floor area.
    * 'SiteEnergyUseWN(kBtu)'
    * 'SteamUse(kBtu)' : The annual amount of district steam consumed by the property on-site
    * 'Electricity(kWh)' : The annual amount of electricity consumed by the property on-site, […] measured in kWh.
    * 'Electricity(kBtu)' : The annual amount of electricity consumed by the property on-site, […] measured in thousands of British thermal units (kBtu).
    * 'NaturalGas(therms)' : The annual amount of utility-supplied natural gas consumed by the property, measured in therms.
    * 'NaturalGas(kBtu)' :
    * 'OtherFuelUse(kBtu)'
    * 'Outlier'


* 2015 uniquement :
    * 'GHGEmissionsIntensity(kgCO2e/ft2)' : Total Greenhouse Gas Emissions divided by property's gross floor area, measured in kilograms of carbon dioxide equivalent per square foot.


* 2016 uniquement : 
    * 'GHGEmissionsIntensity'

In [None]:
# Columns derived from the yearly consumption readings (everything except the
# two columns kept as regression targets).
features_to_drop2 = [
    'SiteEUI(kBtu/sf)',
    'SiteEUIWN(kBtu/sf)',
    'SourceEUI(kBtu/sf)',
    'SourceEUIWN(kBtu/sf)',
    'SiteEnergyUseWN(kBtu)',
    'SteamUse(kBtu)',
    'Electricity(kWh)',
    'Electricity(kBtu)',
    'NaturalGas(therms)',
    'NaturalGas(kBtu)',
    'OtherFuelUse(kBtu)',
    'Outlier',
    'GHGEmissionsIntensity(kgCO2e/ft2)',
    'GHGEmissionsIntensity',
]

# Drop the listed columns from both dataframes. Some names only exist in one
# of the two datasets, hence errors='ignore' (absent columns are skipped).
data_2015 = data_2015.drop(columns=features_to_drop2, errors='ignore')
data_2016 = data_2016.drop(columns=features_to_drop2, errors='ignore')

In [None]:
# Rename the 2015 emissions column so that it carries the same name as its
# 2016 counterpart ('TotalGHGEmissions')
data_2015 = data_2015.rename(columns={'GHGEmissions(MetricTonsCO2e)':"TotalGHGEmissions"})

In [None]:
# Check the column names again after the clean-up: shared columns first,
# then the columns specific to each year.
cols_2015 = set(data_2015)
cols_2016 = set(data_2016)
for header, names in (
    ("\nNames of the {} common columns :\n", cols_2015 & cols_2016),
    ("\nNames of the {} columns only in 2015 dataset:\n", cols_2015 - cols_2016),
    ("\nNames of the {} columns only in 2016 dataset:\n", cols_2016 - cols_2015),
):
    print(header.format(len(names)), names)

## Concaténation

In [None]:
# Stack the 2015 and 2016 rows into a single dataframe (each input keeps its
# own row labels, so labels may repeat in the result), then summarise it
data = pd.concat([data_2015, data_2016])
shape(data)

## Fusion des lignes correspondant à des bâtiments dupliqués
Nous identifions les bâtiments dupliqués sur la base de l'identifiant 'OSEBuildingID'.

Nous fusionnons ensuite les lignes et supprimons les doublons.

In [None]:
def duplicates_merging(dataframe, keys):
    """Merge the rows of a dataframe that share the same key values.

    Rows whose `keys` values appear more than once are replaced by a single
    aggregated row: the mean for numeric columns, the most frequent value
    (mode) for the other columns.

    Args:
        dataframe: the pandas DataFrame to de-duplicate.
        keys: list of column names identifying a duplicated row.

    Returns:
        The input dataframe itself when it has no duplicates; otherwise a new
        dataframe (with a fresh 0..n-1 index) holding the non-duplicated rows
        followed by one aggregated row per duplicated key.
    """

    # Flag every row whose key appears more than once
    duplicates_mask = dataframe.duplicated(subset=keys, keep=False)
    if not duplicates_mask.any():
        print("No duplicates.")
        return dataframe

    print("Number of duplicates rows :", int(duplicates_mask.sum()))

    # Group the duplicated rows by key (sort=False to speed up the group-by)
    gb = dataframe[duplicates_mask].groupby(keys, sort=False)

    # Identification of the numerical columns
    numeric_columns = set(dataframe.select_dtypes(include=[np.number]).columns)

    # Aggregation function for non-numerical columns: most frequent value,
    # or NaN when the group only holds missing values
    def agg_mode(x):
        m = x.mode()
        return m.iloc[0] if not m.empty else np.nan

    # Mean for numeric columns, mode for the others; the key columns are
    # aggregated too so that they remain regular columns of the result
    aggregated_columns = [
        gb[column].agg('mean' if column in numeric_columns else agg_mode)
        for column in dataframe.columns
    ]

    # Single concatenation of all aggregated columns, rows ordered by key
    agg_df = pd.concat(aggregated_columns, axis=1).sort_index()

    # Dropping every occurrence of the duplicated rows
    dataframe_cleaned = dataframe.drop_duplicates(subset=keys, keep=False)

    # DataFrame.append was removed in pandas 2.0: pd.concat is the equivalent
    result = pd.concat([dataframe_cleaned, agg_df], ignore_index=True, sort=False)

    print("Shape after handling duplicates :", result.shape)
    return result

In [None]:
# Merge the rows sharing the same building identifier, then summarise
dataframe = data
keys = ['OSEBuildingID']

data = duplicates_merging(dataframe, keys)
shape(data)

Enfin, nous éliminons l'identifiant des bâtiments qui ne nous sera plus utile.

In [None]:
# Drop the building identifier column: it was only needed to merge duplicates
data = data.drop('OSEBuildingID', axis=1)

# *Features engineering*

## Étude du taux de remplissage des variables

In [None]:
# Dimensions of the merged dataset
nb_rows, nb_columns = data.shape

# Non-null count per column, best-filled columns first
columns_count = data.count().sort_values(ascending=False)

# Share of filled rows for every column
filling_rates = columns_count.div(nb_rows)

# Display the filling rates
filling_rates

### Élimination des colonnes trop peu remplies
Nous décidons de ne conserver que les colonnes remplies à 75% minimum pour la modélisation.

Cela nous permet de conserver la variable ENERGYSTARScore dont nous devrons étudier la pertinence (dans la partie modélisation).

In [None]:
def fillingrate_filter_columns(dataframe, limit_rate):
    """Drop the columns whose filling rate is less than a given limit rate.

    Args:
        dataframe: the pandas DataFrame to filter.
        limit_rate: minimal share (between 0 and 1) of non-null values a
            column must have to be kept. A column filled exactly at the limit
            is kept.

    Returns:
        A projection of the dataframe on the kept columns, in their original
        order.
    """

    # Number of rows in the dataframe
    nb_rows = dataframe.shape[0]

    # Filling rate of each column (non-null count / number of rows)
    filling_rates = dataframe.count() / nb_rows

    # Keep the columns filled at least at the limit rate ('>=', so that a
    # column exactly at the limit is not dropped)
    mask = filling_rates >= limit_rate

    # Names of the selected columns
    features_selection = list(filling_rates[mask].index)
    print("Number of columns with a filling rate of at least {:.2%} : {} columns.".format(limit_rate, len(features_selection)))

    # Return a projection on the selection of features
    return dataframe[features_selection]

In [None]:
# Filter the columns with a 75% filling-rate limit, then summarise
data = fillingrate_filter_columns(data, 0.75)
shape(data)

### Étude du remplissage des lignes

In [None]:
# Count the non-null values on each row
row_count = data.count(axis=1)

# Filling rate of each row (non-null count / number of columns)
nb_columns = data.shape[1]
filling_rates_row = row_count / nb_columns

# Plotting histogram
import matplotlib.ticker as ticker

fig, ax = plt.subplots(figsize=(20, 10))
# Each row describes a building: the labels say so (they used to mention
# "produits")
plt.title("Distribution du remplissage par bâtiment", fontsize=25)
plt.xlabel("taux de remplissage", fontsize=15)
plt.ylabel("nombre de bâtiments", fontsize=15)
# Show the x axis as percentages (a rate of 1 is displayed as 100%)
ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
# Ten bins of equal width between 0 and 1
bins = np.linspace(0, 1, num=11)
ax.hist(filling_rates_row, bins=bins)
ax.xaxis.set_major_locator(plt.MaxNLocator(11))
plt.show()

Nous constatons que la majeure partie des lignes est remplie à plus de 90%.
Nous allons procéder à une imputation pour les dernières valeurs manquantes.

## Imputation des valeurs manquantes


In [None]:
# Summary of the dataset before imputing missing values
shape(data)

In [None]:
# Looking at missing values: the "count" row gives the number of non-null
# values of each column
data.describe(include='all')

### Imputation multivariée de la variable 'ENERGYSTARScore'
La variable 'ENERGYSTARScore' est celle qui, parmi les variables conservées, possède le plus mauvais taux de remplissage.

Nous allons donc faire une imputation multivariée à l'aide de l'imputeur itératif de scikit-learn.

In [None]:
# The two regression targets are left out of the imputation to avoid data leaks
target_columns = ('SiteEnergyUse(kBtu)', 'TotalGHGEmissions')

# Selection of the numeric columns to impute
numeric_columns = [
    column
    for column in data.select_dtypes(include=['number']).columns
    if column not in target_columns
]
columns_to_impute = numeric_columns

try:
    # IterativeImputer is experimental in scikit-learn: it must be enabled
    # explicitly before it can be imported
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

# Fallback only when IterativeImputer cannot be imported
except ImportError:
    from sklearn.impute import SimpleImputer

    # Create simple imputer
    imp = SimpleImputer(strategy='median')

# IterativeImputer is available
else:
    # Create iterative imputer
    imp = IterativeImputer(max_iter=10, random_state=0)

# Train and apply the imputer. This runs after the try statement rather than
# in a `finally` clause, so that a failure above is reported as such instead
# of being masked by a NameError on `imp`.
data[columns_to_impute] = imp.fit_transform(data[columns_to_impute])

# Checking the effects of the imputer
shape(data)

### Imputation des autres valeurs manquantes
Nous allons regarder plus en détail à quoi correspondent les dernières valeurs manquantes.

In [None]:
# Display the first rows that still contain at least one missing value
mask = data.isnull().any(axis=1)
data[mask].head()

Pour les autres lignes, nous constatons que les valeurs manquantes correspondent aux variables 'LargestPropertyUseType' et 'LargestPropertyUseTypeGFA'.
Nous allons imputer :
* la variable 'LargestPropertyUseType' avec la variable 'ListOfAllPropertyUseTypes' (puis, à défaut, avec la variable 'PrimaryPropertyType')
* la variable 'LargestPropertyUseTypeGFA' avec la variable 'PropertyGFATotal'

In [None]:
# Imputation of missing values.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which does not update `data` when pandas
# Copy-on-Write is enabled.
# 'LargestPropertyUseType': first from 'ListOfAllPropertyUseTypes', then from
# 'PrimaryPropertyType' for the rows still missing.
data['LargestPropertyUseType'] = (
    data['LargestPropertyUseType']
    .fillna(data['ListOfAllPropertyUseTypes'])
    .fillna(data['PrimaryPropertyType'])
)
# 'LargestPropertyUseTypeGFA': from the total floor area of the property
data['LargestPropertyUseTypeGFA'] = data['LargestPropertyUseTypeGFA'].fillna(data['PropertyGFATotal'])

# Dropping 'ListOfAllPropertyUseTypes' now that it has been used for imputation
data = data.drop('ListOfAllPropertyUseTypes', axis=1)

In [None]:
# Display every row that still contains at least one missing value
mask = data.isnull().any(axis=1)
data[mask]

Nous constatons que les dernières valeurs manquantes correspondent aux étiquettes (valeurs cibles) : total d'énergie consommée et total des émissions de gaz à effet de serre.

Nous allons donc conserver ces lignes telles quelles, dans le but d'appliquer à ces bâtiments le modèle d'apprentissage automatique.

### Retypage des variables catégorielles

In [None]:
# Display the dtype of each column
data.dtypes

In [None]:
# List the names of the columns stored with the generic 'object' dtype
obj_columns = list(data.select_dtypes(include='object').columns)
print("Columns of object dtypes:\n", obj_columns)

In [None]:
# Convert the object-dtype columns listed in `obj_columns` to the 'category' dtype
data[obj_columns] = data[obj_columns].astype('category')

# Check dtypes after the conversion
data.dtypes

# **Études univariées des variables cibles**
Nous allons étudier la distribution des variables 'SiteEnergyUse(kBtu)' et 'TotalGHGEmissions' et tester leur normalité.

### Indicateurs statistiques principaux

In [None]:
# Main summary statistics (count, mean, std, min, quartiles, max) of the two targets
data[['SiteEnergyUse(kBtu)', 'TotalGHGEmissions']].describe()

### Distributions empiriques

In [None]:
def empirical_distribution(dataframe, feature):
    """Plot the distribution of a feature: histogram (top) and boxplot (bottom).

    Args:
        dataframe: the pandas DataFrame holding the data.
        feature: name of the column to plot; its null values are ignored.

    Returns:
        None. The figure is drawn with matplotlib.
    """

    # Loading libraries
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Keeping only the non-null values of the feature
    x = dataframe.loc[dataframe[feature].notnull(), feature]

    # Create a figure instance, and the two subplots
    fig = plt.figure(figsize=(20, 10))
    plt.suptitle("Statistical distribution: " + feature, fontsize=25)
    ax1 = fig.add_subplot(211) # histogram
    ax2 = fig.add_subplot(212) # boxplot

    # Histogram with a density estimate on ax1. distplot is deprecated in
    # recent seaborn releases: histplot is used when available, distplot
    # remains the fallback for older installations.
    if hasattr(sns, "histplot"):
        sns.histplot(x=x, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax1)
    else:
        sns.distplot(x, ax=ax1)
    ax1.set_ylabel("Frequency", fontsize=20)
    ax1.set_xlabel("")

    # Boxplot on ax2. The data is passed with the `x` keyword: recent seaborn
    # releases interpret a first positional argument as `data`.
    medianprops = {'color':"black"}
    meanprops = {'marker':'o', 'markeredgecolor':'black', 'markerfacecolor':'firebrick'}
    sns.boxplot(x=x,
                ax=ax2,
                showfliers=True,
                medianprops=medianprops,
                showmeans=True,
                meanprops=meanprops)
    ax2.set_xlabel("Value", fontsize=20)

In [None]:
# Plot the distribution of the first target: 'SiteEnergyUse(kBtu)'
dataframe = data
feature = 'SiteEnergyUse(kBtu)'

# Call to the function
empirical_distribution(dataframe, feature)

In [None]:
# Plot the distribution of the second target: 'TotalGHGEmissions'
dataframe = data
feature = 'TotalGHGEmissions'

# Call to the function
empirical_distribution(dataframe, feature)

### Tests de normalité (Shapiro-Wilk)

Il semble que les variables cibles ('TotalGHGEmissions', 'SiteEnergyUse(kBtu)') ne soient pas normalement distribuées. Nous allons le vérifier avec un test statistique :

    H0 : les valeurs de la variable aléatoire pour l'échantillon sont issues d'une population normalement distribuée.

    H1 : les valeurs de la variable aléatoire pour l'échantillon NE sont PAS issues d'une population normalement distribuée.

In [None]:
def shapiro_wilk(dataframe, feature):
    """Run the Shapiro-Wilk normality test on one column and print the outcome.

    Args:
        dataframe: the pandas DataFrame holding the data.
        feature: name of the column to test; its null values are ignored.

    Returns:
        None. The statistic, the p-value and a summary sentence are printed.
    """

    # Loading libraries
    from scipy.stats import shapiro

    # Only the non-null values of the feature take part in the test
    sample = dataframe.loc[dataframe[feature].notnull(), feature]

    # Shapiro-Wilk test: the result holds the statistic W, then the p-value
    statistic, p_value = shapiro(sample)

    # Print results
    report = [
        "Shapiro-Wilk test's statistic value is: W = {}".format(statistic),
        "Shapiro-Wilk test's p-value is: p = {}".format(p_value),
        "\nGaussian distribution hypothesis for '{}' can be rejected at a risk of {}%.".format(feature, p_value*100),
    ]
    for line in report:
        print(line)

In [None]:
# Normality test of the first target: 'SiteEnergyUse(kBtu)'
dataframe = data
feature = 'SiteEnergyUse(kBtu)'
shapiro_wilk(dataframe, feature)

In [None]:
# Normality test of the second target: 'TotalGHGEmissions'
dataframe = data
feature = 'TotalGHGEmissions'
shapiro_wilk(dataframe, feature)

### Transformée de Box-Cox
Nous essayons de transformer les données pour obtenir une distribution gaussienne à l'aide de la [transformée de Box-Cox](https://fr.wikipedia.org/wiki/Transform%C3%A9e_de_Box-Cox) implémentée dans la [librairie scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html).

In [None]:
dataframe = data
feature = 'TotalGHGEmissions'

# Loading library
from scipy import stats

# Box-Cox is only defined for strictly positive values, and the target column
# still holds missing values at this stage: keep the strictly positive values
# only (the comparison also discards NaN).
values = dataframe[feature]
positive_values = values[values > 0]

# Proceeding the Box-Cox transformation
feature_transformed, maxlog = stats.boxcox(positive_values.values)

# Printing the results
print("Value of lambda that maximise log-likelihood function is: {}".format(maxlog))

# Converting the transformed feature (numpy array) to a pd.DataFrame
feature_transformed = pd.DataFrame(feature_transformed, columns = [feature])

# Testing the normality of the Box-Cox transformed feature
print("\nResults of the Shapiro-Wilk test on transformed feature:")
shapiro_wilk(feature_transformed, feature)

In [None]:
# Plot the distribution of the Box-Cox transformed 'TotalGHGEmissions'
dataframe = feature_transformed
feature = 'TotalGHGEmissions'

# Call to the function
empirical_distribution(dataframe, feature)

## **Étude des corrélations entre variables**

La librairie dython (http://shakedzy.xyz/dython/) permet de calculer :
* pour les couples de variables quantitatives :
    * le coefficient de correlation de Pearson R 
    
    
* pour les couples de variables catégorielles :
    * le coefficient de Cramer V (symetrique)
    * le coefficient d'incertitude de Theil U (asymetrique)
    
    
* pour les couples de variables mixtes (quantitative + catégorielle) :
    * le ratio de correlation η (eta)

In [None]:
# Loading library
from dython.nominal import associations

# Get the categorical columns
categorical_columns = list(data.select_dtypes(include='category').columns)

# Rows with at least one NaN are dropped to avoid errors in the computation
df_for_correlations = data.dropna()

# Calculate associations and display graph
# NOTE(review): the `theil_u` keyword may not exist in every dython release —
# confirm against the installed version.
associations(
    df_for_correlations,
    figsize=(15,5),
    theil_u=True, # asymmetric measure of association for nominal features
    nominal_columns=categorical_columns
)

Nous constatons qu'un sous-ensemble des variables sont très fortement corrélées entre elles (**multicolinéarité**).
* 'LargestPropertyUseTypeGFA'
* 'PropertyGFABuilding(s)'
* 'PropertyGFATotal'

Pour nous permettre de pouvoir faire une analyse fiable de l'importance des variables (***features importance***) de notre modèle, nous décidons de supprimer ces variables, sauf une.

In [None]:
# Dropping two of the three strongly correlated surface columns
# ('LargestPropertyUseTypeGFA' is the one kept)
data = data.drop(columns=['PropertyGFABuilding(s)', 'PropertyGFATotal'])

Nous pouvons aussi constater que la variable **'ENERGYSTARScore'** présente un coefficient de corrélation linéaire proche de 0 avec les variables à expliquer ('TotalGHGEmissions', 'SiteEnergyUse(kBtu)').

Cela laisse entendre que cette variable ne sera pas très importante pour notre modèle, mais nous le confirmerons avec une analyse de l'importance des variables (*features importance*) a posteriori.

# **Visualisation des données** (par t-SNE)

Une valeur (nulle) pose problème lors du passage au logarithme, nous allons l'écarter pour la t-SNE.

In [None]:
# Identification of the values out of the domain of the log function
# (rows whose energy use is zero or negative)
mask = data['SiteEnergyUse(kBtu)'] <= 0
data[mask]

In [None]:
# Exclude the rows whose energy use is out of the domain of the log function.
# The rows are selected by condition rather than by a hard-coded row label,
# which would silently point to another building if the row order changed.
# Rows with a missing energy use are kept here.
data_tsne = data[~(data['SiteEnergyUse(kBtu)'] <= 0)]

### Selection des variables pour la t-SNE

In [None]:
# Selecting all the numeric columns (names are read from `data`)
subset = list(data.select_dtypes(include=['number']).columns)

# Keeping only the required columns, taken from the t-SNE dataframe
X = data_tsne[subset]

# Dropping rows with missing values (not handled by t-SNE implementation)
X = X.dropna(subset=subset)

### Pré-traitement pour la t-SNE

In [None]:
# Centering and scaling every column of X
from sklearn import preprocessing
std_scaler = preprocessing.StandardScaler().fit(X)
X_std = std_scaler.transform(X)

# Target used only to colour the t-SNE plot. The raw values are used; the
# log-transformed variant is left disabled:
# y = np.log(X['SiteEnergyUse(kBtu)'])
y = X['SiteEnergyUse(kBtu)']

### Exécution de la t-SNE

In [None]:
# Import libraries
from sklearn import manifold

# Instantiation of t-SNE (2-D embedding, fixed seed)
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = manifold.TSNE(n_components=2,
                     perplexity=30,
                     n_iter=300,
                     init='pca', # initialisation by a PCA
                     random_state=0
                    )

# Applying the t-SNE
X_projected = tsne.fit_transform(X_std) # t-SNE does not have a ".transform" method

### Visualisation de la t-SNE en 2D
Nous procédons à une visualisation des données par t-SNE, en colorant chaque bâtiment en fonction de sa consommation d'énergie ('SiteEnergyUse(kBtu)').

In [None]:
# Graphical representation of the population
plt.figure(figsize=(10,10))

# Definitions of axis boundaries
plt.xlim(X_projected[:,0].min(), X_projected[:,0].max())
plt.ylim(X_projected[:,1].min(), X_projected[:,1].max())

# Definition of the title and of the axis labels
plt.title("t-SNE\n")
plt.xlabel("t-SNE feature 1")
plt.ylabel("t-SNE feature 2")

# Rescaling the target to [0, 1] for the colour mapping
y = (y - y.min()) / (y.max() - y.min())

# Colours represent the rescaled target. All points are drawn with a single
# scatter call instead of one call per point; vmin/vmax pin the colour scale
# to [0, 1] so that each point gets the colour RdYlGn_r assigns to its value.
plt.scatter(X_projected[:,0], # x-coordinates
            X_projected[:,1], # y-coordinates
            c=y,
            cmap=plt.cm.RdYlGn_r,
            vmin=0,
            vmax=1
)

# **Sauvegarde** (méthode *feather*)

In [None]:
# feather does not support serializing a non-default index: replace the index
# by the default one and discard the old labels (drop=True avoids creating,
# then deleting, an intermediate column)
data = data.reset_index(drop=True)

# Save the file (binary)
data.to_feather("p4_data4.ft")

In [None]:
# Preview the first rows of the saved dataset
data.head()