## IDEA


1) Analyze one feature at a time to see whether it can be used to discriminate one or more classes from the rest.
2) Analyze the feature vector as a whole – check whether there is any correlation between the vector elements.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Load the tab-separated feature table.
data = pd.read_csv('data/GenreClassData_30s.txt', sep='\t')

# Split the data into training and testing sets using the 'Type' column.
train = data[data['Type'] == 'Train']
test = data[data['Type'] == 'Test']

# Feature columns analysed in this notebook.
# (An earlier four-feature list -- 'spectral_rolloff_mean', 'mfcc_1_mean',
# 'spectral_centroid_mean', 'tempo' -- used to be assigned here and was
# immediately overwritten; that dead assignment has been removed.)
features = [
    'zero_cross_rate_mean', 'zero_cross_rate_std', 'rmse_mean', 'rmse_var',
    'spectral_centroid_mean', 'spectral_centroid_var', 'spectral_bandwidth_mean', 'spectral_bandwidth_var',
    'spectral_rolloff_mean', 'spectral_rolloff_var', 'spectral_contrast_mean', 'spectral_contrast_var',
    'spectral_flatness_mean', 'spectral_flatness_var',
    'chroma_stft_7_mean',

    'tempo',
    'mfcc_1_mean', 'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_4_mean', 'mfcc_5_mean', 'mfcc_6_mean',

    'mfcc_2_std', 'mfcc_3_std', 'mfcc_4_std', 'mfcc_5_std', 'mfcc_7_std'
]

# Target column(s). Kept as a list, so y_train / y_test are single-column
# DataFrames rather than Series.
targets = ['Genre']

# Feature matrix and genre labels for each split.
X_train, y_train = train[features], train[targets]
X_test, y_test = test[features], test[targets]

In [None]:

# Report the value range of a few features on the training split.
features_to_check = ['spectral_rolloff_mean', 'spectral_centroid_mean', 'tempo']
checked_columns = train[features_to_check]
max_values = checked_columns.max()
min_values = checked_columns.min()

# Display the results (one item per line, blank line before the minima).
print("Max values:", max_values, "\nMin values:", min_values, sep="\n")

In [None]:
plt.close('all')  # Clear any existing figures

# One sub-frame per genre. The list order fixes the top-to-bottom order of
# the subplots in every figure.
genres = [
    'hiphop', 'rock', 'jazz', 'classical', 'reggae',
    'blues', 'disco', 'metal', 'country', 'pop',
]
data_dict = {genre: data[data['Genre'] == genre] for genre in genres}

# Smaller genre selection; it is not used by the plotting loop below.
data_dict_prev = {
    genre: data_dict[genre] for genre in ('classical', 'disco', 'metal', 'pop')
}

# One figure per feature, one histogram row per genre, shared x-axis so the
# genre distributions can be compared directly.
for feature in features:
    fig, axes = plt.subplots(
        len(data_dict), 1,
        figsize=(8, len(data_dict) * 3),
        sharex=True,
    )
    # The loop variable is deliberately NOT called `data`: that name would
    # overwrite the full DataFrame with the last genre's rows and silently
    # break every later use of `data` (and any re-run of this cell).
    for ax, (genre, genre_df) in zip(axes, data_dict.items()):
        ax.hist(genre_df[feature], bins=30, label=feature)
        ax.legend()
        ax.set_title(genre)
    # Lay out each figure, not only the last one created.
    fig.tight_layout()

plt.show()

To create a title or section in markdown, use the `#` symbol followed by a space and the title text. The number of `#` symbols determines the heading level.

```markdown
# Title for the Section

## Subsection Title

### Sub-subsection Title
```

For example:

```markdown
# Data Analysis

## Feature Distribution

### Tempo Analysis
```

## MUTUAL INFORMATION BETWEEN FEATURES AND BETWEEN FEATURES AND THE TARGET

In [None]:
# Define the features and targets for the mutual-information analysis

from sklearn.feature_selection import mutual_info_classif

# Feature columns included in the MI analysis.
# Two earlier definitions of `all_features` used to precede this one and were
# immediately overwritten, so they have been removed. To analyse every feature
# column instead, build the list from `data.columns` while excluding the
# metadata columns: 'Track ID', 'TrackID', 'File', 'GenreID', 'Genre', 'Type'.
all_features = [
    'zero_cross_rate_mean', 'zero_cross_rate_std', 'rmse_mean', 'rmse_var',
    'spectral_centroid_mean', 'spectral_centroid_var', 'spectral_bandwidth_mean', 'spectral_bandwidth_var',
    'spectral_rolloff_mean', 'spectral_rolloff_var', 'spectral_contrast_mean', 'spectral_contrast_var',
    'spectral_flatness_mean', 'spectral_flatness_var',
    'chroma_stft_7_mean',

    'tempo',
    'mfcc_1_mean', 'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_4_mean', 'mfcc_5_mean', 'mfcc_6_mean',

    'mfcc_2_std', 'mfcc_3_std', 'mfcc_4_std', 'mfcc_5_std', 'mfcc_7_std'
]
targets = ['Genre']

# Feature matrix and genre labels for each split, restricted to `all_features`.
X_train, y_train = train[all_features], train[targets]
X_test, y_test = test[all_features], test[targets]

In [None]:
from sklearn.feature_selection import mutual_info_classif

# 1) compute MI between every feature and the genre label.
#    y_train is a single-column DataFrame, so flatten it to 1-D with .ravel().
#    A fixed random_state keeps the scores identical across re-runs.
#    (A second, unseeded call whose result was immediately overwritten used to
#    run before this one; it has been removed.)
mi_scores = mutual_info_classif(
    X_train,
    y_train.values.ravel(),
    discrete_features=False,
    random_state=0
)

# 2) wrap in a Series and sort *ascending*: a horizontal bar plot draws the
#    first row at the bottom, so the largest scores end up at the top.
mi = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=True)

# 3) horizontal barplot, with the figure height scaled to the number of bars
row_height = 0.3
fig, ax = plt.subplots(figsize=(10, len(mi) * row_height))
mi.plot(kind='barh', ax=ax)
ax.set_title('Mutual Information Scores by Feature')
ax.set_xlabel('Mutual Information')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mutual_info_score
import pandas as pd
import numpy as np

# 1) partition the feature list: everything up to and including "tempo"
#    forms the first group, whatever follows it forms the second
idx = all_features.index("tempo")
feat1, feat2 = all_features[:idx + 1], all_features[idx + 1:]

def plot_clustered_mi(features, title, data=None, n_bins=10):
    """Draw a hierarchically clustered heatmap of pairwise mutual information.

    Every feature is discretised into quantile bins, the mutual information
    between each pair of binned features is computed with
    ``sklearn.metrics.mutual_info_score``, and the resulting square matrix is
    rendered with ``seaborn.clustermap`` (rows and columns both clustered).

    Parameters
    ----------
    features : list of str
        Names of the columns to compare with each other.
    title : str
        Title placed above the figure.
    data : pandas.DataFrame, optional
        Frame that holds the feature columns. When omitted, the notebook-level
        ``X_train`` is used, which is what the original implementation did.
    n_bins : int, default 10
        Number of quantile bins requested from ``pd.qcut``. Duplicate bin
        edges are dropped, so a column may end up with fewer bins.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    if data is None:
        data = X_train

    # 2) bin every feature into quantile bins and keep the integer bin codes
    binned = pd.DataFrame({
        f: pd.qcut(data[f], q=n_bins, duplicates="drop").cat.codes
        for f in features
    })

    # 3) build MI matrix. Mutual information is symmetric, so each pair is
    #    computed once and mirrored across the diagonal.
    n = len(features)
    M = np.zeros((n, n))
    for i, f1 in enumerate(features):
        for j in range(i, n):
            M[i, j] = M[j, i] = mutual_info_score(binned[f1], binned[features[j]])

    g = sns.clustermap(
        M,
        row_cluster=True,
        col_cluster=True,
        figsize=(12, 10),
        xticklabels=features,
        yticklabels=features,
        cmap="viridis",
        square=True
    )
    # give it a title, lifted slightly above the top of the figure
    g.fig.suptitle(title, y=1.02)
    plt.show()


# now call it for the two halves: features up to and including "tempo",
# then the features listed after it
# (the first title previously ended in a stray apostrophe)
plot_clustered_mi(feat1, "Mutual Information Between Features")
plot_clustered_mi(feat2, "Clustered MI for MFCC features")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mutual_info_score
import pandas as pd
import numpy as np

# 1) discretise every feature into quantile bins (10 requested, duplicate
#    edges dropped) and keep only the integer bin codes
binned = pd.DataFrame({
    name: pd.qcut(X_train[name], 10, duplicates='drop').cat.codes
    for name in all_features
})

# 2) pairwise MI between all binned features, as a square float matrix
M = np.array(
    [
        [mutual_info_score(binned[row_name], binned[col_name]) for col_name in all_features]
        for row_name in all_features
    ],
    dtype=float,
).reshape(len(all_features), len(all_features))

# 3) clustered heatmap with rows and columns reordered by similarity
clustermap_options = dict(
    row_cluster=True,
    col_cluster=True,
    figsize=(16, 12),
    xticklabels=all_features,
    yticklabels=all_features,
    cmap="viridis",
    square=True,
)
g = sns.clustermap(M, **clustermap_options)
plt.suptitle("Clustered MI Between Features", y=1.02)
plt.show()


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mutual_info_score


# 1) Cells whose MI is below 0.5 are masked out (left blank in the heatmap)
mask = np.less(M, 0.5)

# 2) Square figure: 0.2 inches per feature along each side, never smaller
#    than 8 inches (e.g. 60 features => 12 inches)
n = len(all_features)
size = max(8, 0.2 * n)
plt.figure(figsize=(size, size))

# 3) Draw the heatmap on the figure just created
heatmap_options = dict(
    mask=mask,
    cmap="viridis",
    xticklabels=all_features,
    yticklabels=all_features,
    square=True,
)
sns.heatmap(M, **heatmap_options)

# 4) Title and tick-label orientation
plt.title("Only MI ≥ 0.5")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 1) compute pairwise correlations between the training features
corr = X_train.corr()

# 2) display the raw matrix (you’ll see a pandas DataFrame)
print("Feature–Feature Correlation Matrix:")
display(corr)

# 3) plot a heat-map
plt.figure(figsize=(10, 8))
plt.pcolor(corr)   # default colormap
# pcolor draws cell i over the interval [i, i + 1], so tick labels must sit at
# i + 0.5 to be centred on their cell instead of on the cell boundary.
tick_centres = np.arange(len(corr.columns)) + 0.5
plt.xticks(tick_centres, corr.columns, rotation=90)
plt.yticks(tick_centres, corr.index)
plt.title("Feature Correlation Heatmap")
plt.colorbar()
plt.tight_layout()
plt.show()

# 4) (optional) list out pairs with |corr| above the threshold
threshold = 0.8
# Keep only the strict upper triangle: this removes the diagonal (self
# correlation) and the mirrored copy of every pair by position. Filtering on
# values (`< 1.0`, `drop_duplicates()`) would also discard distinct pairs that
# are perfectly correlated or that happen to share the same coefficient.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pairs = (
    corr.abs()
    .where(upper_triangle)
    .unstack()
    .dropna()
    .sort_values(ascending=False)
)
high_corr = pairs[pairs > threshold]
print(f"\nHighly correlated feature pairs (|corr| > {threshold}):")
print(high_corr)
