In [None]:
# Necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


# Read the music-genre table from the CSV file in the working directory
df = pd.read_csv('music_genre.csv')


# a. Attribute types and their semantics
attribute_types = df.dtypes                  # dtype pandas assigned to each column
semantics = df.describe(include='all')       # describe() over every column, not only numeric ones
for heading, summary in (('Attribute Types: \n', attribute_types),
                         ('Semantics: \n', semantics)):
    print(heading, summary)

In [None]:
# b. Statistical properties and correlations
# Restrict to the numeric columns and compute their pairwise correlation matrix
numeric_columns = df.select_dtypes(include=[np.number])
correlations_numeric = numeric_columns.corr()

# Render the matrix as an annotated heatmap, save it as a PNG, then show it
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations_numeric, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix for Numeric Columns')
fig.savefig('correlation_matrix.png')
plt.show()


In [None]:
# c. Data quality aspects
# Count of null cells per column, and the number of rows per target class
missing_values = df.isnull().sum()
data_distribution = df['music_genre'].value_counts()
print( 'Missing values: \n', missing_values)

# Share of null cells per column, expressed in percent of all rows
missing_percentage = (missing_values / len(df)) * 100

# Bar chart of the per-column missing-value percentage, saved as a PNG
fig, ax = plt.subplots(figsize=(10, 6))
missing_percentage.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Missing Values in Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Percentage of Missing Values')
fig.savefig('distribution_missing_values.png')
plt.show()


In [None]:
# d. Visual exploration of data properties and hypotheses
# Same correlation heatmap as in part b., shown again here (not written to disk)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations_numeric, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Count plot of the target variable: number of rows per music genre
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='music_genre', ax=ax)
ax.set_title('Data Distribution of Music Genres')
plt.xticks(rotation=45)  # tilt the genre labels on the x axis
fig.savefig('data_distribution_music_genres.png')
plt.show()


In [None]:
# e. Evaluate potentially ethically sensitive attributes and unbalanced distributions
# No ethically sensitive attributes were found; the classes of the dataset are balanced.

In [None]:
# f. Potential risks and biases
# Domain experts should be consulted to understand biases related to music genres, for example when the same artist appears under different music genres (described in the report).

# g. Actions likely required in data preparation
# Clean missing values 
# Analyse the outliers and remove them if needed
# Consider the correlated attributes during the preprocessing
# Remove the instance id
# Check if the same artist has songs for different genres

In [None]:
# 3.a Analyze and perform necessary actions based on analysis performed in the Data Understanding phase.
# Produces df (raw table with numeric tempo), df_cleaned (after steps 3.a.2-3.a.4)
# and df_cleaned_outliers (after steps 3.a.5-3.a.7), then prints how many rows were removed.
numerical_attributes = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
                        'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']


# 3.a.1 set attribute tempo to float (already done for task d)
# The file is read again so this cell starts from the raw table.
df = pd.read_csv('music_genre.csv')
df['tempo'] = pd.to_numeric(df['tempo'], errors='coerce')  # 'coerce' will turn non-numeric values to NaN

# 3.a.2 Clean missing values (task a)
# Drops every row with a NaN in any column, including the tempo values coerced to NaN above.
df_cleaned = df.dropna()

# 3.a.3 Remove the instance id (task a)
df_cleaned = df_cleaned.drop(columns=['instance_id'])

# 3.a.4 filter out duration of -1 (4.340 rows) (already done for task d)
df_cleaned = df_cleaned[df_cleaned['duration_ms'] != -1.0]


# 3.a.5 Analyze outliers and remove them if needed (done on numerical attributes) (task a)
# A row is kept only if EVERY numerical attribute lies within that attribute's
# 1st-99th percentile range. The percentiles are taken from df_cleaned, and the
# per-attribute conditions are accumulated in one mask; assigning the filtered
# frame inside the loop would keep only the last attribute's filter.
# Bounds are inclusive so rows tied with a percentile value are not discarded.
within_bounds = pd.Series(True, index=df_cleaned.index)
for attribute in numerical_attributes:
    q_low = df_cleaned[attribute].quantile(0.01)
    q_high = df_cleaned[attribute].quantile(0.99)
    within_bounds &= df_cleaned[attribute].between(q_low, q_high)
df_cleaned_outliers = df_cleaned[within_bounds]

# 3.a.6 filter out artist "empty_field" (already done for task d)
# NOTE(review): row counts quoted in the report for 3.a.6 and 3.a.7 depend on the
# outlier filter above - re-check them after running this cell.
df_cleaned_outliers = df_cleaned_outliers[df_cleaned_outliers['artist_name'] != 'empty_field']


# 3.a.7 Check if the same artist has songs for more than two genres (task a)
# Artists with more than two distinct genres are removed in a single vectorised step.
artist_genre_counts = df_cleaned_outliers.groupby('artist_name')['music_genre'].nunique()
artists_with_multiple_genres = artist_genre_counts[artist_genre_counts > 2].index.tolist()
df_cleaned_outliers = df_cleaned_outliers[
    ~df_cleaned_outliers['artist_name'].isin(artists_with_multiple_genres)
]

# Consider the correlated attributes during the preprocessing (task a, but is done during task b)
# Total number of rows removed relative to the raw table
print('filtered rows:')
print(len(df) - len(df_cleaned_outliers))

In [None]:
# 3.b Analyze options and potential for derived attributes (note: if the potential is considered low,
# these obviously do not necessarily have to be applied for your analysis, but options should be documented)
# Builds df_derrived from df_cleaned_outliers: scaled numerical attributes,
# three interaction columns and one-hot encoded 'key' / 'mode'.

from sklearn.preprocessing import MinMaxScaler

# 3.b.1 Feature Scaling (for numerical data)
    # Example: Scaling numerical features to a standard range (e.g., Min-Max scaling).
    # Potential: Ensures that features contribute equally to the model and improves convergence.
# NOTE(review): the scaler is fitted on all rows, before the train/validation/test
# split done in the modelling cells, so the held-out sets influence the scaling.
# Consider fitting it on the training split only (e.g. inside a sklearn Pipeline).
scaler = MinMaxScaler()
df_derrived = df_cleaned_outliers.copy()
df_derrived[numerical_attributes] = scaler.fit_transform(df_derrived[numerical_attributes])

# 3.b.2 Consider correlated attributes during preprocessing
    # Attribute creation
    # Example: Combining two or more features to create an interaction term (e.g., loudness * energy).
    # Potential: Captures relationships between features that may have a combined effect on the target variable.

# Absolute correlations between the numeric columns; only the strict upper triangle
# is kept so that each pair appears once and self-correlations are excluded.
corr_matrix = df_cleaned_outliers.select_dtypes(include=[np.number]).corr().abs()
upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
most_correlated_pairs = (corr_matrix.where(upper_triangle)
                         .stack().sort_values(ascending=False)).head(3).reset_index()

# One product column per pair, computed from the already scaled attributes
correlated_attributes = []
for index, row in most_correlated_pairs.iterrows():
    new_column_name = f"{row['level_0']}_{row['level_1']}_interaction"
    correlated_attributes.append(new_column_name)
    df_derrived[new_column_name] = df_derrived[row['level_0']] * df_derrived[row['level_1']]

# A separate scaler is used for the interaction columns so that `scaler` keeps
# the fit for numerical_attributes instead of being overwritten by a second fit.
interaction_scaler = MinMaxScaler()
df_derrived[correlated_attributes] = interaction_scaler.fit_transform(df_derrived[correlated_attributes])


# 3.b.3 Categorical Feature Encoding:
    # Example: One-hot encoding categorical variables (e.g., genre) to convert them into numerical format.
    # Potential: Enables the use of categorical data in machine learning models

categorical_features = ['key', 'mode']
df_derrived = pd.get_dummies(df_derrived, columns=categorical_features)
pd.set_option('display.max_columns', None)  # show every column of the preview below

# Preview the first rows only instead of rendering the whole frame
df_derrived.head()

# NOT DONE, low potential!
# 3.b.4 Ratios and Proportions:
    # Example: Creating ratios between two numerical features (e.g., energy divided by loudness).
    # Potential: Highlights relative importance or relationships between features.

# 3.b.5 Moving Averages:
    # Example: Calculating the moving average of a numerical feature over a specified window.
    # Potential: Smoothens trends and captures changes over time.


In [None]:
# 3.c Analyze options for additional external data sources, attributes that might be useful to better
    # address the business objectives or data mining goals (Note: this description may be
    # hypothetical, i.e. you are not necessarily required to actually obtain and integrate the external
    # data for the analysis)

    # Not needed, because our data set already contains all the necessary data; the source below is hypothetical (it would come from our platform).
    # 3.c.1 User Preferences:
    # Source: User surveys, feedback forms, or interaction logs.
    # Attributes: Explicit user preferences, liked genres, and feedback on existing recommendations.

In [None]:
# 3.d Describe other pre-processing steps considered, specifying which ones were applied or not
    # applied due to which reason. (e.g. data cleansing, transformations, binning, scaling, outlier
    # removal, attribute removal, transcoding, …) at a level of detail that ensures reproducibility of
    # changes to the data.

    # 3.a.1 transformation of attribute to float

    # 3.a.3 attribute removal (instance id)
        # Applied: Remove instance id - irrelevant or redundant attributes that do not contribute significantly to the analysis or model.
        # Reason: Reduces dimensionality, focuses on essential features, and may improve model performance.
    
    # 3.a.4 data cleansing
    
    # 3.a.5 outlier removal

    # 3.a.6 data cleansing

    # 3.a.7 data cleansing

    # 3.b.1 feature scaling
    # 3.b.2 attribute creation
    # 3.b.3 feature encoding


# Not applied:

# Binning:
    # Not Applied: Did not bin numerical values into discrete intervals.
    # Reason: Depending on the nature of the data, binning may or may not be suitable. In some cases, it can lead to loss of information.

In [None]:
# 4.Training and evaluating the SVM model
# Tunes C for a degree-2 polynomial SVC with 5-fold cross-validation on the training
# split, then reports accuracy / classification report on the validation and test
# splits and saves the test-set confusion matrix as 'confusion_matrix_SVM.png'.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Splitting the target variable
y = df_derrived['music_genre']

# Features (dropping the useless ones)
X = df_derrived.drop(['obtained_date', 'artist_name', 'track_name', 'music_genre'], axis=1)

# Split the dataset into training, validation, and test sets
# (60% train; the remaining 40% is halved into 20% validation and 20% test, stratified by genre)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, stratify=y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, stratify=y_temp, test_size=0.5, random_state=42)

# Defining a range of C values for hyperparameter tuning
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Creating the SVM classifier using quadratic kernel
svm_classifier = SVC(kernel='poly', degree=2)

# Performing grid search with cross-validation for receiving the best parameter for C
# error_score='raise' makes a failing fit stop the search instead of being scored as NaN
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, return_train_score=True, error_score='raise')
grid_search.fit(X_train, y_train)

# Retrieving results from the grid search
cv_results = pd.DataFrame(grid_search.cv_results_)

# Displaying all accuracies for each C value
# ('mean_test_score' is the mean score on the held-out cross-validation folds)
print("All accuracies for each C value:")
for index, row in cv_results.iterrows():
    print(f"C = {row['param_C']:.3f}: Train Accuracy = {row['mean_train_score']:.3f}, Validation Accuracy = {row['mean_test_score']:.3f}")

# Get the best C value
best_C = grid_search.best_params_['C']
print(f'\nBest C value: {best_C}')

# Final SVM model: GridSearchCV (refit=True by default) has already refitted the
# best configuration on the whole training split, so that estimator is reused
# instead of training an identical SVC a second time.
final_svm_classifier = grid_search.best_estimator_

# Predictions on the validation set
y_val_pred = final_svm_classifier.predict(X_val)

# Performance on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
classification_rep_val = classification_report(y_val, y_val_pred)

print('\nValidation Set Performance:')
print(f'Accuracy: {accuracy_val:.2f}')
print('Classification Report:\n', classification_rep_val)

# Predictions on the test set
y_test_pred = final_svm_classifier.predict(X_test)

# Performance on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)

# Calculating confusion matrix
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

# Plotting confusion matrix
# Tick labels come from the SVM fitted in this cell; the Random Forest model is
# only created in the next cell and must not be referenced here.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues',
            xticklabels=final_svm_classifier.classes_,
            yticklabels=final_svm_classifier.classes_)
plt.title('Confusion Matrix (Test Set)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix_SVM.png')
plt.show()

# Classification report for the test set
classification_rep_test = classification_report(y_test, y_test_pred)
print('\nTest Set Performance:')
print(f'Accuracy: {accuracy_test:.2f}')
print('Classification Report:\n', classification_rep_test)

In [None]:
# 4.Training and evaluating the Random Forest model
# Grid-searches Random Forest hyperparameters with 5-fold cross-validation on the
# training split, refits the best configuration, and reports validation and test
# performance; the test-set confusion matrix is saved as a PNG.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Target column, and the feature matrix without the columns not used as features
y = df_derrived['music_genre']
X = df_derrived.drop(columns=['obtained_date', 'artist_name', 'track_name', 'music_genre'])

# Stratified 60/20/20 split: 40% is held out first, then halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Hyperparameter grid explored by the search
param_grid = dict(
    n_estimators=[10, 25, 50, 75],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 50, 500],
    min_samples_leaf=[2, 4, 40, 100],
)

# Cross-validated grid search over the grid above, using a seeded base classifier
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

# Best parameter combination found by the search
best_params = grid_search.best_params_
print('Best Parameters:', best_params)

# Final model: a fresh seeded forest with the best parameters, fitted on the training split
final_rf_classifier = RandomForestClassifier(random_state=42, **best_params).fit(X_train, y_train)

# Validation split: predictions, accuracy and per-class report
y_val_pred = final_rf_classifier.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
classification_rep_val = classification_report(y_val, y_val_pred)

print('\nValidation Set Performance:')
print(f'Accuracy: {accuracy_val:.2f}')
print('Classification Report:\n', classification_rep_val)

# Test split: predictions, accuracy and confusion matrix
y_test_pred = final_rf_classifier.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

# Confusion matrix heatmap, labelled with the classes known to the fitted forest
genre_labels = final_rf_classifier.classes_
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues',
            xticklabels=genre_labels, yticklabels=genre_labels, ax=ax)
ax.set_title('Confusion Matrix (Test Set)')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
fig.savefig('confusion_matrix_random_forest.png')
plt.show()

# Per-class report for the test split
classification_rep_test = classification_report(y_test, y_test_pred)
print('\nTest Set Performance:')
print(f'Accuracy: {accuracy_test:.2f}')
print('Classification Report:\n', classification_rep_test)
