<font color='lime'>

# Data preparation and normalization

</font> 

<font color='yellow'>

## A. Visualization of significant features (scatter plots, box plots, histograms)

</font> 

<font color='orange'>

###  Load dataset

</font> 

In [None]:
import pandas as pd

# Location of the input CSV, relative to the notebook's working directory.
raw_csv_path = 'data/spotify_songs_corrupted.csv'

# Parse the CSV into the DataFrame that every later cell reads as `data`.
data = pd.read_csv(raw_csv_path)

<font color='orange'>

### Histogram for track_popularity

</font> 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the white-grid seaborn theme to this and all later figures.
sns.set_style("whitegrid")

# Histogram of track_popularity (20 bins) with a KDE curve overlaid,
# drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(25, 10))
sns.histplot(data['track_popularity'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Track Popularity')
ax.set_xlabel('Popularity')
ax.set_ylabel('Count')
plt.show()

<font color='orange'>

### Scatter plot of energy versus danceability

</font> 

In [None]:
# Danceability on the x-axis against energy on the y-axis, one point per row of `data`.
fig, ax = plt.subplots(figsize=(25, 10))
sns.scatterplot(data=data, x='danceability', y='energy', ax=ax)
ax.set_title('Danceability vs Energy')
ax.set_xlabel('Danceability')
ax.set_ylabel('Energy')
plt.show()

<font color='orange'>

### Box plot for loudness

</font> 

In [None]:
# Horizontal box plot summarising the spread of the loudness column.
fig, ax = plt.subplots(figsize=(25, 8))
sns.boxplot(x=data['loudness'], ax=ax)
ax.set_title('Distribution of Loudness')
ax.set_xlabel('Loudness')
plt.show()

<font color='yellow'>

## B. Data cleaning

</font>

<font color='orange'>

### Missing-value removal, normalization, duplicate removal

</font>

In [None]:
import numpy as np

# Cleaning pipeline for `data` (loaded in an earlier cell):
#   1. drop rows containing any missing value,
#   2. min-max scale every numeric column,
#   3. drop exact duplicate rows,
# then report the shapes and write the result to CSV.
# Produces `data_cleaned` and `numerical_features`, which the correlation cell reuses.

# 1. Removing rows with missing values
original_shape = data.shape
# .copy() makes data_cleaned an independent frame, so the column assignments
# below can never write through to `data` or raise SettingWithCopyWarning.
data_cleaned = data.dropna().copy()
missing_rows_removed = original_shape[0] - data_cleaned.shape[0]
print(f"Removed {missing_rows_removed} rows with missing values.")

# 2. Normalizing numerical features (using Min-Max scaling for this demonstration)
numerical_features = data_cleaned.select_dtypes(include=[np.number]).columns
if len(numerical_features) > 0:
    numeric_block = data_cleaned[numerical_features]
    feature_min = numeric_block.min()
    feature_range = numeric_block.max() - feature_min
    # A constant column has range 0; dividing by it would turn the whole column
    # into NaN (0/0). Dividing by 1 instead maps such a column to 0.0.
    feature_range = feature_range.replace(0, 1)
    # Whole-column replacement (not .loc[:, col] = ...) lets integer columns
    # become float without an in-place incompatible-dtype assignment.
    data_cleaned[numerical_features] = (numeric_block - feature_min) / feature_range
print("Numerical features normalized.")

# 3. Removing duplicate rows
rows_before = data_cleaned.shape[0]
data_cleaned = data_cleaned.drop_duplicates()
duplicates_removed = rows_before - data_cleaned.shape[0]
print(f"Removed {duplicates_removed} duplicate rows.")

# Display the shape of the original and cleaned dataframes to see the difference
cleaned_shape = data_cleaned.shape

print(f"Original shape: {original_shape}")
print(f"Cleaned shape: {cleaned_shape}")

# Save cleaned subset to a CSV file
file_path = "data/spotify_songs_cleaned.csv"
data_cleaned.to_csv(file_path, index=False)
print(f"Cleaned data saved to: {file_path}")


<font color='orange'>

### Remove zero values

</font>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'data' is a pandas DataFrame that has already been loaded.
#
# This cell treats every 0 in `data` as invalid: it visualises where the zeros
# are, drops every row that contains a zero (or an already-missing value),
# visualises the result, and writes the surviving rows to CSV.

# Boolean mask of cells that are exactly zero. NaN == 0 is False, so values
# that were already missing are not counted as zeros.
zero_mask = data.eq(0)

# Heatmap before removing zero values
plt.figure(figsize=(10, 5))
sns.heatmap(zero_mask, cmap="viridis", cbar=False)
plt.title("Heatmap of Zero Values Before")
plt.savefig("heatmap_before.png")
plt.show()

# Count the zeros from the mask itself. Counting NaNs after the replacement
# below would also include values that were missing before this cell ran.
zero_values_count = int(zero_mask.sum().sum())
print(f"Number of zero values in the dataset: {zero_values_count}")

# Replace zero values with NaN so that dropna() can remove their rows
data_no_zero = data.replace(0, np.nan)

# Removing rows with NaN values (replaced zeros plus pre-existing missing values)
rows_before_removal = data_no_zero.shape[0]
data_no_zero_cleaned = data_no_zero.dropna()
rows_after_removal = data_no_zero_cleaned.shape[0]

# Calculate how many rows were deleted. dropna() cannot tell a replaced zero
# from an original NaN, so the message names both causes.
rows_deleted = rows_before_removal - rows_after_removal
print(f"Rows deleted due to zero or missing values: {rows_deleted}")

# Heatmap after removing zero values
if data_no_zero_cleaned.empty:
    # seaborn cannot draw a heatmap of a zero-size frame, so skip the figure
    # instead of letting the cell raise.
    print("No rows remain after removing zero values; skipping the 'after' heatmap.")
else:
    plt.figure(figsize=(10, 5))
    sns.heatmap(data_no_zero_cleaned.eq(0), cmap="viridis", cbar=False)
    plt.title("Heatmap of Zero Values After")
    plt.savefig("heatmap_after.png")
    plt.show()

# Save the dataset with zero values removed to a CSV file
file_path_no_zeros = "spotify_songs_no0.csv"
data_no_zero_cleaned.to_csv(file_path_no_zeros, index=False)
print(f"Data without zero values saved to: {file_path_no_zeros}")

<font color='yellow'>

## C. Data correlation (correlation matrix)

</font> 

In [None]:
# Pairwise correlation between the numeric columns of the cleaned frame.
# Relies on `data_cleaned` and `numerical_features` from the cleaning cell.
correlation_matrix = data_cleaned[numerical_features].corr()

# Annotated heatmap with the colour scale pinned to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()