In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the group dataset from the CSV file that sits next to the notebook.
DATASET_PATH = 'group_22.csv'
data = pd.read_csv(DATASET_PATH)

In [5]:
# Basic structure overview: the raw shape tuple, then rows (samples) and
# columns (features) reported separately.
sample_count, feature_count = data.shape

print(f"Shape: {(sample_count, feature_count)}")
print(f"Número de samples: {sample_count}")
print(f"Número de features: {feature_count}")

Shape: (3000, 49)
Número de samples: 3000
Número de features: 49


In [7]:
# List every column name with its dtype.
# NOTE(review): read the full listing before concluding that all dtypes are
# valid -- the categorical summary further down in this notebook reports
# `focus_factor` as a text column with numeric-looking values such as
# '0,00006' (decimal comma), so that column appears to need cleaning.
column_dtypes = data.dtypes
print(column_dtypes)

duration_1                       float64
duration_2                       float64
duration_3                       float64
duration_4                       float64
duration_5                       float64
loudness_level                   float64
popularity_level                 float64
tempo_class                      float64
time_signature                   float64
key_mode                         float64
artist_song_count                float64
album_freq                       float64
movement_index                   float64
intensity_level                  float64
verbal_density                   float64
purity_score                     float64
positivity_index                 float64
activity_rate                    float64
loudness_intensity               float64
happy_dance                      float64
acoustics_instrumental           float64
artists_avg_popularity           float64
tempo_vs_genre                   float64
energy_rank_pct                  float64
loud_energy_rati

In [13]:
# Missing-value analysis: null count per column and the percentage of rows
# that count represents. Only columns with at least one null are printed.
total_rows = len(data)

missing_data = data.isna().sum()
missing_percent = missing_data / total_rows * 100
missing_summary = pd.DataFrame(
    {
        'Contagem de valores em falta': missing_data,
        'Percentagem de valores em falta': missing_percent,
    }
)

has_missing = missing_summary['Contagem de valores em falta'] > 0
print(missing_summary.loc[has_missing])

Empty DataFrame
Columns: [Contagem de valores em falta, Percentagem de valores em falta]
Index: []


In [15]:
# Count fully duplicated rows -- duplicates hurt the analysis by inflating
# whatever patterns they happen to repeat.
duplicate_row_count = data.duplicated().sum()
print(f"Número de linhas duplicadas: {duplicate_row_count}")

Número de linhas duplicadas: 0


In [17]:
# Split the columns by dtype so numeric and categorical features can be
# analysed separately in the cells that follow.
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric features: count, mean, standard deviation, min/max and quartiles.
print("Sumário das features numéricas - quantidade, média, dispersão, intervalo de valores, distribuição e outliers")
# Fixed: this line used an undefined name `df`; the frame loaded by this
# notebook is `data`, so the cell failed with NameError on a fresh kernel.
numerical_summary = data[numerical_cols].describe()
print(numerical_summary)
print("\n")

Sumário das features numéricas - quantidade, média, ----
        duration_1   duration_2   duration_3   duration_4   duration_5  \
count  3000.000000  3000.000000  3000.000000  3000.000000  3000.000000   
mean      0.067333     0.171333     0.319000     0.426333     0.016000   
std       0.250640     0.376863     0.466167     0.494626     0.125496   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     0.000000     1.000000     1.000000     0.000000   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

       loudness_level  popularity_level  tempo_class  time_signature  \
count     3000.000000       3000.000000  3000.000000     3000.000000   
mean         1.714000          2.043000     1.018000        0.076971   
std          1.347643          0.976457     0.247042        

In [18]:
# Skewness and kurtosis of every numeric feature.
# - Skewness measures the asymmetry of a distribution: 0 means symmetric,
#   positive means the peak sits to the left (long right tail), negative
#   means the peak sits to the right (long left tail).
# - Kurtosis describes how heavy the tails are around the mean: positive
#   suggests more outliers, negative suggests fewer.
numeric_frame = data[numerical_cols]

skewness = numeric_frame.skew()
kurtosis = numeric_frame.kurt()

additional_stats = pd.DataFrame({'Skewness': skewness, 'Kurtosis': kurtosis})
print(additional_stats)

                                Skewness    Kurtosis
duration_1                      3.454796    9.942242
duration_2                      1.745388    1.047077
duration_3                      0.777064   -1.397103
duration_4                      0.298067   -1.912432
duration_5                      7.718538   57.614238
loudness_level                  0.321106   -1.036907
popularity_level               -0.303235   -1.356962
tempo_class                     1.241528   15.261093
time_signature                 -4.964392   36.835497
key_mode                        0.029530   -1.290927
artist_song_count               3.690076   12.898883
album_freq                      7.635744   93.513645
movement_index                 -0.449840   -0.024884
intensity_level                -1.183599    2.599923
verbal_density                  4.226974   24.413565
purity_score                    0.939049   -0.675281
positivity_index                0.355152   -0.792055
activity_rate                   0.602400    1.

In [27]:
# Summary of the categorical features: cardinality, most frequent values and
# their share of the rows.
#
# Fixed: the loop body used an undefined name `df`; the frame loaded by this
# notebook is `data`, so the cell failed with NameError on a fresh kernel.
#
# NOTE(review): the recorded output of this cell lists `focus_factor` with
# numeric-looking values, one of them written with a decimal comma
# ('0,00006'). It is probably a numeric feature that was parsed as text and
# should be cleaned rather than treated as categorical -- confirm.

print("\n=== CATEGORICAL FEATURES SUMMARY ===")

# Look at each categorical feature one by one
for col in categorical_cols:
    print(f"\n--- Analyzing: {col} ---")

    # Count how many different categories exist
    num_categories = data[col].nunique()
    print(f"This feature has {num_categories} different categories")

    # Show the most common categories and how often they appear
    print("Most common values and their counts:")
    value_counts = data[col].value_counts()
    print(value_counts.head())  # Show top 5 most frequent

    # Also show percentages to understand distribution
    print("As percentages:")
    percentages = data[col].value_counts(normalize=True) * 100
    print(percentages.head().round(2))


=== CATEGORICAL FEATURES SUMMARY ===

--- Analyzing: focus_factor ---
This feature has 1555 different categories
Most common values and their counts:
focus_factor
0.0        735
0.00255     11
0,00006     10
0.896        9
0.865        9
Name: count, dtype: int64
As percentages:
focus_factor
0.0        24.50
0.00255     0.37
0,00006     0.33
0.896       0.30
0.865       0.30
Name: proportion, dtype: float64

--- Analyzing: target_class ---
This feature has 3 different categories
Most common values and their counts:
target_class
class_1     1000
class_18    1000
class_24    1000
Name: count, dtype: int64
As percentages:
target_class
class_1     33.33
class_18    33.33
class_24    33.33
Name: proportion, dtype: float64


In [30]:
# Conclusion drawn from the target_class counts shown in the previous cell.
# NOTE(review): balanced classes only remove class-imbalance as a source of
# bias; the printed statement is stronger than that -- consider softening it.
class_balance_note = "As categorias estão perfeitamente balanceadas, o que indica que o modelo não será biased"
print(class_balance_note)

As categorias estão perfeitamente balanceadas, o que indica que o modelo não será biased


In [31]:
# Explore the regression target, when the dataset provides one: summary
# statistics followed by an outlier check using the 1.5 * IQR rule.
if 'target_regression' in data.columns:
    print("\n📈 REGRESSION TASK: Predicting Popularity Score")

    # Look the column up once instead of re-indexing the frame on every line.
    target = data['target_regression']

    # Basic statistics about the popularity scores
    popularity_stats = target.describe()
    print("Popularity Score Statistics:")
    print(f"  Average: {popularity_stats['mean']:.2f}")
    print(f"  Range: {popularity_stats['min']:.2f} to {popularity_stats['max']:.2f}")
    print(f"  Standard Deviation: {popularity_stats['std']:.2f}")

    # Outlier fences: anything further than 1.5 * IQR outside the quartiles
    Q1 = target.quantile(0.25)  # 25th percentile
    Q3 = target.quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Middle 50% range

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows whose target falls below the lower fence or above the upper fence
    outliers = data[(target < lower_bound) | (target > upper_bound)]

    print("\nOUTLIER CHECK:")
    print(f"Found {len(outliers)} songs with unusual popularity scores")
    # Fixed: the denominator used an undefined name `df`; the frame loaded by
    # this notebook is `data`, so this line failed on a fresh kernel.
    print(f"That's {len(outliers)/len(data)*100:.1f}% of our data")

    # Why care about outliers?
    if len(outliers) > 0:
        print("NOTE: Outliers can make our regression models less accurate")


📈 REGRESSION TASK: Predicting Popularity Score
Popularity Score Statistics:
  Average: 0.31
  Range: -1.49 to 2.54
  Standard Deviation: 0.77

OUTLIER CHECK:
Found 0 songs with unusual popularity scores
That's 0.0% of our data
