<a href="https://colab.research.google.com/github/txusser/Master_IA_Sanidad/blob/main/Modulo_2/2_3_3_Preprocesado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

Data preprocessing is a crucial step in the development of machine learning models and encompasses several fundamental processes:

1. **Data Cleaning**:
   - **Handling Missing Values**: Filling missing values with a specific value, mean, median, etc.
   - **Removing Duplicates**: Deleting or handling duplicate observations.
   - **Outlier Removal**: Identifying and handling outliers that can skew the model.

2. **Data Transformation**:
   - **Normalization and Scaling**: Rescaling data so all values fall within a similar range.
   - **Encoding Categorical Variables**: Converting categories into a form the model can understand, such as One-Hot Encoding.
   - **Feature Transformation**: Creating new features from existing ones, such as variable combinations, polynomial functions, etc.

3. **Dimensionality Reduction**:
   - **Feature Selection**: Selecting the most important features that contribute to model performance.
   - **Reduction Techniques**: Using techniques like Principal Component Analysis (PCA) to reduce data complexity without losing relevant information.

4. **Data Partitioning**:
   - **Splitting into Training, Validation, and Test Sets**: Allows model performance to be evaluated at different stages and helps detect overfitting, because the model is assessed on data it did not see during training.

5. **Handling Imbalanced Data**:
   - **Undersampling and Oversampling**: Techniques to deal with datasets where one class is overrepresented compared to another.

6. **Exploratory Data Analysis (EDA)**:
   - **Visualization**: Graphs and tables to understand the nature and relationships between variables.
   - **Descriptive Statistics**: Provides an initial understanding of the data through metrics like mean, median, standard deviation, etc.


In [None]:
# Load rich for text enrichment
from rich.console import Console
console = Console()

# Load common working libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load relevant preprocessing methods
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler




In [None]:
# Load the Iris flower dataset that ships with scikit-learn
from sklearn.datasets import load_iris

iris = load_iris()

# Wrap the feature matrix in a DataFrame labelled with the feature names
feature_matrix = iris['data']
feature_names = iris['feature_names']
data = pd.DataFrame(feature_matrix, columns=feature_names)


In [None]:
# Banner, then the first rows of the working dataset
console.rule("[blue] Information About the Working Dataset [/blue]")
preview = data.head()
console.log(preview)

# Summary statistics (count, mean, std, quartiles, ...) for every column
summary = data.describe()
console.log(f"\n Data Description ============= \n\n {summary}")

### Generating NaNs for the Exercise

In [None]:
# Introduce some missing data randomly

def generate_nans(data: pd.DataFrame, fraction: float = 0.05, seed=42) -> None:
    """Replace a random subset of the entries of ``data`` with NaN, in place.

    Each entry is selected independently with probability ``fraction``.

    Args:
        data: DataFrame to modify. It is mutated in place; nothing is returned.
        fraction: Probability, in [0, 1], that any given entry becomes NaN.
            Defaults to 0.05 (about 5% of the values).
        seed: Integer passed to ``np.random.seed`` before drawing the mask so
            the result is reproducible. Pass ``None`` to leave NumPy's global
            random state untouched.

    Raises:
        ValueError: If ``fraction`` is outside the [0, 1] interval.

    Note:
        With a non-None ``seed`` this function reseeds NumPy's *global*
        random generator, which also affects later ``np.random`` calls.
    """
    if not 0.0 <= fraction <= 1.0:
        raise ValueError(f"fraction must be within [0, 1], got {fraction!r}")

    if seed is not None:
        np.random.seed(seed)  # Seed for reproducibility

    # Boolean mask with the same shape as the data; True marks the entries
    # that will be blanked out
    mask = np.random.rand(*data.shape) < fraction

    # Convert selected entries to NaN
    data[mask] = np.nan

# Inject missing values into the working dataset (it is modified in place)
generate_nans(data)

# Boolean frame flagging every missing entry
missing_values = data.isna()
console.log("\n - Missing data as NaNs:\n", missing_values)

# isnull() gives the same result; isna() is the recommended spelling
missing_values = data.isnull()
console.log("\n - Missing data as nulls:\n", missing_values)

# Per-column count of missing values (NaNs) in the dataset
console.rule("Missing Data")
nan_counts = data.isna().sum()
console.log("Number of missing values:\n", nan_counts)


### Removing Rows or Columns with Missing Values


In [None]:
# Drop every row that contains at least one missing value
data.dropna(axis=0, how='any', inplace=True)

# Drop every column that contains at least one missing value.
# The row-wise drop above already removed all NaNs, so this call is shown
# only to illustrate the column-wise variant and removes nothing here.
data.dropna(axis=1, how='any', inplace=True)

# Confirm that no missing values remain
console.rule("Missing Data")
remaining_nans = data.isna().sum()
console.log(remaining_nans)


### Filling Missing Values with Mean, Median, etc.


In [None]:
# Re-inject NaNs so there is something to impute (demonstration only)
generate_nans(data)
console.rule("Initial Data")
console.log(data.isna().sum())

# Mean imputation for the two sepal measurements
for column in ('sepal length (cm)', 'sepal width (cm)'):
    data[column] = data[column].fillna(data[column].mean())

# Median imputation for the petal length
petal_length = data['petal length (cm)']
data['petal length (cm)'] = petal_length.fillna(petal_length.median())

# NOTE(review): 'petal width (cm)' is not imputed in this cell, so any NaNs
# in that column still show up in the count below - confirm this is intended.
console.rule("Final Data")
console.log(data.isna().sum())


In [None]:
### 2. Removing Duplicates
# Flag every row that repeats an earlier row, then count the flags
duplicate_flags = data.duplicated()
number_of_duplicates = duplicate_flags.sum()
print(f"Number of duplicate entries: {number_of_duplicates}")

# Keep only the first occurrence of each row (in place)
data.drop_duplicates(inplace=True)


### Removing Outliers


In [None]:
# Start again from a clean copy of the Iris data
iris = load_iris()
iris_values = iris['data']
iris_columns = iris['feature_names']

# "data" is the working frame; "data_ini" is kept as an untouched reference
# for the before/after comparison (demonstration purposes only)
data = pd.DataFrame(data=iris_values, columns=iris_columns)
data_ini = pd.DataFrame(data=iris_values, columns=iris_columns)

def generate_outliers(data: pd.DataFrame, fraction_outliers: float = 0.05,
                      scale: float = 5.0, seed=None) -> np.ndarray:
    """Return a copy of ``data`` with randomly injected outliers.

    Each entry is selected independently with probability
    ``fraction_outliers`` and shifted upwards by ``scale`` times the standard
    deviation computed over *all* values of the array (a single scalar, not a
    per-column value).

    Args:
        data: Numeric DataFrame (or array-like). It is not modified.
        fraction_outliers: Probability that an entry becomes an outlier.
            Defaults to 0.05 (about 5% of the data).
        scale: Number of global standard deviations added to the selected
            entries. Defaults to 5.
        seed: Optional integer seed for a private random generator, giving a
            reproducible result. With ``None`` (default) NumPy's global
            random state is used.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``data``.
    """
    # Work on a float copy: an integer array cannot absorb the fractional
    # offset added below, and the caller's data must stay untouched.
    data_outliers = np.asarray(data).astype(float)

    rng = np.random if seed is None else np.random.RandomState(seed)
    mask_outliers = rng.rand(*data_outliers.shape) < fraction_outliers

    data_outliers[mask_outliers] += scale * data_outliers.std()
    return data_outliers

data_outliers = generate_outliers(data)

# Side-by-side box plots: clean reference data vs. data with outliers
fig = plt.figure(1, figsize=(15, 10))
panels = (
    (121, data_ini, "Original Data"),
    (122, data_outliers, "Data with Outliers"),
)
for position, frame, title in panels:
    axis = fig.add_subplot(position)
    sns.boxplot(data=frame, ax=axis)
    axis.set_title(title)
plt.show()

# generate_outliers returned a bare array; restore the column labels
data_outliers = pd.DataFrame(data_outliers, columns=data.columns)

# Validity range from the Interquartile Range (IQR) of the sepal length.
# Only this column is used for filtering; outliers in the other columns
# are left in place.
sepal_length = data_outliers['sepal length (cm)']
Q1, Q3 = sepal_length.quantile(0.25), sepal_length.quantile(0.75)
IQR = Q3 - Q1

# Anything further than 1.5 * IQR from the quartiles counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep the rows whose sepal length lies inside [lower_bound, upper_bound]
data_outliers = data_outliers[sepal_length.between(lower_bound, upper_bound)]

# Box plot of what is left after filtering
fig = plt.figure(2, figsize=(10, 10))
ax = fig.add_subplot(111)
sns.boxplot(data=data_outliers, ax=ax)
ax.set_title("Data after Removing Outliers")
plt.show()


## Normalization

### StandardScaler
Standardize features by removing the mean and scaling to unit variance.

* [Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)


In [None]:
"""
We will use scikit-learn to apply the StandardScaler preprocessing technique.
The goal is to transform the data so that it has a mean of zero and a unit standard deviation.
"""
iris = load_iris()
data = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])

# Raw feature values handed to the scaler
features = data.values

# "fit" learns the mean and standard deviation of each variable and returns
# the scaler itself, so "transform" can be chained to apply the scaling to
# the same data. The scaled values end up in "data_norm".
scaler = StandardScaler()
data_norm = scaler.fit(features).transform(features)

# Compare the first ten rows before and after scaling
console.log("Initial Data:\n", data.head(10))
console.log("Scaled Data:\n", data_norm[:10])


In [None]:
# A smaller, easier-to-read example: three features that differ only in magnitude
data = {
    'Feature1': [1.0, 2.0, 3.0, 4.0, 5.0],
    'Feature2': [10.0, 20.0, 30.0, 40.0, 50.0],
    'Feature3': [100.0, 200.0, 300.0, 400.0, 500.0]
}
df = pd.DataFrame(data)

# Show the data before scaling
print("Original DataFrame:\n", df)

# Learn each column's mean and standard deviation ("fit"), then apply the
# scaling to the same values ("transform"); the result is kept in "X_scaled"
raw_values = df.values
scaler = StandardScaler()
X_scaled = scaler.fit(raw_values).transform(raw_values)

# Put the scaled values back into a labelled DataFrame and show them
df_scaled = pd.DataFrame(X_scaled, columns=df.columns)
print("\nScaled DataFrame:\n", df_scaled)


### MinMaxScaler
Transforms features by scaling them to a specified range.  
This estimator scales and translates each feature individually so that it is within the given range in the training set, for example, between zero and one.

* [Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)

In [None]:
# Start from the untouched Iris data so earlier transformations do not
# interfere (for demonstration purposes only)
iris = load_iris()
data = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])

# Scaler that maps each feature onto the [0, 1] interval
scaler = MinMaxScaler(feature_range=(0, 1))

# Restrict the example to the two sepal measurements
columns_to_scale = ['sepal length (cm)', 'sepal width (cm)']
data_s = data[columns_to_scale]

# Data before scaling
print(" => Original Data: \n", data_s.head())

# Fit the scaler and transform in one step, then restore the column labels
scaled_values = scaler.fit_transform(data_s)
data_scaled = pd.DataFrame(scaled_values, columns=columns_to_scale)

# Data after scaling
print(" => Scaled Data: \n", data_scaled.head())


In [None]:
# Z-score standardization of the min-max scaled data: subtract each column's
# mean, then divide by its standard deviation
centered = data_scaled - data_scaled.mean()
data_norm = centered / data_scaled.std()

### Data Visualization

fig = plt.figure(1, figsize=(12, 5))

# One histogram per version of the sepal length: original, scaled, normalized
column = 'sepal length (cm)'
panels = (
    (131, data[column], 'blue', "Original"),
    (132, data_scaled[column], 'green', "Scaled"),
    (133, data_norm[column], 'orange', "Normalized"),
)
for position, series, color, title in panels:
    plt.subplot(position)
    sns.histplot(series.values, bins=20, color=color, alpha=0.7)
    plt.title(title)

plt.tight_layout()
plt.show()


# NOTE

The following examples complement the exercises covered in the previous week:
* Links to the respective notebooks:

  * Notebook 1: 2_3_3_Preprocessing_and_Structuring_Data
  * Notebook 2: 2_3_3_Feature_Extraction


## Encoding Categorical Variables
Most Machine Learning algorithms require all variables to be numeric. Therefore, categorical variables must be encoded into a numerical format before being input into the model during the training (and/or inference) process.


In [None]:
# Small example with two categorical columns
data = {
    'Type': ['Apple', 'Orange', 'Banana', 'Orange', 'Apple'],
    'Color': ['Red', 'Orange', 'Yellow', 'Orange', 'Green']
}
df = pd.DataFrame(data)

# Data before encoding
print("Original DataFrame:\n", df)

# One-Hot Encoding with pandas' get_dummies: one indicator column per
# category, prefixed with the name of the column it came from
categorical_columns = ['Type', 'Color']
df_encoded = pd.get_dummies(df, columns=categorical_columns, prefix=categorical_columns)

# Result of the one-hot encoding
print("\nDataFrame with Encoded Variables:\n", df_encoded)

# Numerical encoding with pandas' factorize: each distinct 'Type' value is
# replaced by an integer code (the list of unique labels is not needed here)
type_codes, _type_labels = pd.factorize(df['Type'])
df['Type_encoded'] = type_codes

# Original frame with the extra numerically encoded column
print("\nDataFrame with 'Type' Variable Numerically Encoded:\n", df)


# Dimensionality Reduction (Feature Extraction)


### Principal Component Analysis (PCA)
Principal Component Analysis, or PCA, is a statistical technique used to reduce the dimensionality of a dataset. PCA allows us to simplify the information present in a dataset with multiple variables and transform it into a reduced dataset that still retains much of the original information.

The goal of PCA is to find a representation of the data that is easier to understand while preserving as much variance in the data as possible.

To perform PCA, the covariance matrix of the original (mean-centered) data is first calculated. Then, the eigenvectors and eigenvalues of this matrix are computed: each eigenvector defines a direction in feature space, and its eigenvalue gives the amount of variance of the data along that direction. The original data is then projected onto the eigenvectors with the largest eigenvalues, resulting in a new dataset with fewer variables that still captures a significant portion of the original information.


In [None]:
console.rule("[blue] Principal Component Analysis [/blue]")

# PCA implementation and the example dataset both come from scikit-learn
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer

# Breast cancer dataset: feature matrix labelled with the feature names
cancer_data = load_breast_cancer()
cancer_features = cancer_data.data
cancer_columns = cancer_data.feature_names
df = pd.DataFrame(data=cancer_features, columns=cancer_columns)

# Summary statistics of the dataset
console.log(df.describe())

In [None]:
# Standardize the data before running PCA
raw_features = df.values
scaler = StandardScaler()
data_scaled = scaler.fit(raw_features).transform(raw_features)

console.log("Number of features:", data_scaled.shape[1])

# Ask for 30 principal components ("n_components" = 30) so the results can
# be evaluated against the full set of variables
pca = PCA(n_components=30, random_state=2020)

# Fit on the standardized data and project it onto the 30 components;
# the projected values are stored in X_pca
X_pca = pca.fit(data_scaled).transform(data_scaled)
print("X_pca:\n", X_pca)

# With the full set of components the explained variance should add up
# to 100% of the variance in the data
explained_percent = pca.explained_variance_ratio_ * 100
print("\n => Variance explained by the components:", sum(explained_percent))

In [None]:
# Plot the cumulative explained variance against the number of components to
# read off the minimum number of components needed for a given percentage
import matplotlib.pyplot as plt
import numpy as np

cumulative_variance = np.cumsum(pca.explained_variance_ratio_ * 100)

# Component counts start at 1. Plotting against the default 0-based index
# would place the value for k components at x = k - 1, contradicting the
# "Number of Components" axis label.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance)
plt.xlabel("Number of Components")
plt.ylabel("Percentage of Variance Explained")
plt.title("Cumulative Variance Explained by PCA Components")
plt.grid(True)
plt.show()

In [None]:
# Cumulative percentage of variance explained by the leading components.
# Index 9 holds the total for the first 10 components, i.e. a third of the
# 30 components fitted above.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_ * 100)
n_var = cumulative_variance[9]
print("Variance explained by the first 10 components:", n_var)

In [None]:
# Alternatively, let PCA choose the number of components itself: a float
# n_components asks for enough components to retain 95% of the variance
pca_95 = PCA(n_components=0.95, random_state=2020)
pca_95.fit(data_scaled)
X_pca_95 = pca_95.transform(data_scaled)

# First two principal components plus the class label of each observation,
# gathered in one DataFrame for plotting
pca_df = pd.DataFrame({
    'PC1': X_pca_95[:, 0],
    'PC2': X_pca_95[:, 1],
    'Target': cancer_data.target,
})

# Scatter plot of PC1 against PC2, coloured by class
plt.figure(figsize=(10, 8))
scatter_ax = sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Target', palette='viridis')
scatter_ax.set_title('Visualization of the First Two Principal Components')
scatter_ax.set_xlabel('First Principal Component (PC1)')
scatter_ax.set_ylabel('Second Principal Component (PC2)')
scatter_ax.legend(title='Target')
plt.show()


### Notes
The scatter plot generated from the first two principal components (PC1 and PC2) of the PCA aims to represent the underlying structure of the data in a lower-dimensional space. This can reveal information about relationships among observations and differences between classes (in this case, represented by `cancer_data.target`).

* **Distinct Clusters**: If observations from different classes form distinct and clearly separated clusters in the plot, this indicates significant differences between the classes. This suggests that the original features contain valuable information for class differentiation.

* **Overlapping Clusters**: If the clusters overlap but there is still some separation, this suggests that there is some information in the data to differentiate the classes, but it is not very clear. There may be features contributing noise, or the separation might require more than two dimensions to become fully visible.

* **Absence of Clusters**: If there is no clear separation between classes and the observations are mixed, this might indicate that the original features are not very useful for class differentiation, at least in the first two principal components.


## Independent Component Analysis (ICA)


In [None]:
# We will use fMRI data for our example with ICA
# To do this, we start by installing the nilearn library
# (the leading "!" is notebook syntax: the rest of the line runs as a shell command)
!python -m pip install nilearn

In [None]:
from nilearn import datasets

# NiftiMasker lives in nilearn.maskers since nilearn 0.9; the old
# nilearn.input_data path is deprecated. Prefer the new location and fall
# back to the old one so the cell also runs on older installations.
try:
    from nilearn.maskers import NiftiMasker
except ImportError:
    from nilearn.input_data import NiftiMasker

# Download a subject from the functional MRI study
dataset = datasets.fetch_development_fmri(n_subjects=1)
file_name = dataset.func[0]

# Image preprocessing: compute a mask from the EPI image so that background
# (non-brain) voxels are discarded, smooth with an 8 mm FWHM kernel and
# standardize the signals
masker = NiftiMasker(smoothing_fwhm=8, memory='nilearn_cache', memory_level=1,
                     mask_strategy='epi', standardize=True)

# Fit the mask and extract the masked data as a 2D array
# (presumably time points x voxels - see the NiftiMasker documentation)
data_masked = masker.fit_transform(file_name)


In [None]:
from sklearn.decomposition import FastICA
import numpy as np

# Extract 10 independent components. ICA is fitted on the transposed data
# and the result is transposed back, giving one row per component.
ica = FastICA(n_components=10, random_state=42)
sources = ica.fit_transform(data_masked.T)
components_masked = sources.T

# Normalize along axis 0 (subtract the mean, divide by the standard
# deviation), then zero every value whose magnitude is below 0.8 in these
# standardized units
components_masked -= components_masked.mean(axis=0)
components_masked /= components_masked.std(axis=0)
below_threshold = np.abs(components_masked) < .8
components_masked[below_threshold] = 0

# Invert the masking transformation to recover the 3D structure
component_img = masker.inverse_transform(components_masked)

In [None]:
# Finally, visualize the results of the dimensionality reduction operations
from nilearn import image
from nilearn.plotting import plot_stat_map, show

# Mean functional image, used as the background of each map
mean_img = image.mean_img(file_name)

# Overlay the first two independent components on the mean image
for component_index in range(2):
    plot_stat_map(image.index_img(component_img, component_index), mean_img)

# Render the figures explicitly so the cell also displays them when run
# outside an inline notebook backend
show()

In [None]:
# Export the data for analysis with another tool...
from pathlib import Path

cancer_data = load_breast_cancer()
df = pd.DataFrame(data=cancer_data.data, columns=cancer_data.feature_names)

# "sample_data" exists by default on Colab but not in other environments;
# create it if needed so that to_csv does not fail on a missing directory
output_path = Path("sample_data") / "cancer_data.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)