In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean as _mean, stddev as _stddev, col , collect_list
from IPython.display import display

import numpy
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [None]:
# findspark.init() is expected to make the local Spark installation importable
# from this kernel (it is the usual way to put pyspark on sys.path).
# NOTE(review): pyspark is already imported in the first cell, before this
# init() call runs — presumably pyspark is importable on its own in this
# environment; otherwise this cell would have to execute first. TODO confirm.
import findspark
findspark.init()
# Last expression of the cell, so whatever find() returns is echoed as output
# (presumably the detected Spark home path — confirm against findspark docs).
findspark.find()



In [None]:
# Start a Spark session named "My Spark App", or reuse the one that is
# already active in this kernel; every later cell reads data through `spark`.
spark = (
    SparkSession.builder
    .appName("My Spark App")
    .getOrCreate()
)

# load data

In [None]:
# Read the songs CSV (first line is the header, column types are inferred)
# and discard every row that has a null in any column.
features_genre_df = (
    spark.read
    .csv("cleaned_songs.csv", header=True, inferSchema=True)
    .na.drop()
)


# prepare data for classification

In [None]:
# Every column except the "genre" label is treated as a numeric feature.
features = [col_name for col_name in features_genre_df.columns if col_name != "genre"]

# Keep only the rows in which every feature can be read as a number.
# A single cast to double is enough: for the column types a CSV read produces,
# any value that casts to float or int also casts to double, so OR-ing the
# three casts together (as was done before) only repeated the same check.
for feature in features:
    features_genre_df = features_genre_df.filter(col(feature).cast("double").isNotNull())

# count() is an action, so this line actually runs the read and the filters
# and reports how many rows survived.
print(features_genre_df.count())


In [None]:

# for feature in features:
#     stats = cleaned_df.select(
#         _mean(col(feature)).alias("mean"), _stddev(col(feature)).alias("stddev")
#     ).collect()
#     mean = stats[0]["mean"]
#     stddev = stats[0]["stddev"]

#     # Normalize each feature using z-scoring
#     cleaned_df = cleaned_df.withColumn(feature, (col(feature) - mean) / stddev)

#     # make all values lie between -1 and 1

#     min_value = cleaned_df.agg({feature: "min"}).collect()[0][0]
#     max_value = cleaned_df.agg({feature: "max"}).collect()[0][0]

#     value = max(abs(min_value), abs(max_value))

#     cleaned_df = cleaned_df.withColumn(feature, (col(feature) / value))


In [None]:


# Function to map each value to a tuple of (value, value^2, 1)
def z_score_map_function(row):
    """Emit one value's contribution to the running z-score statistics.

    Despite the parameter name, ``row`` is used directly as a number: it is
    squared and placed in the output tuple as-is.

    Returns:
        A one-element list holding ``(value, value squared, 1)``. The list
        wrapper lets the function be passed to ``flatMap``.
    """
    squared = row ** 2
    return [(row, squared, 1)]

# Function to reduce tuples of (value, value^2, 1) to (sum, sum of squares, count)
def z_score_reduce_function(acc, value):
    """Add two ``(sum, sum of squares, count)`` triples component-wise.

    Args:
        acc: The accumulated triple so far.
        value: The next triple to fold in.

    Returns:
        A new 3-tuple whose entries are the pairwise sums of the first three
        entries of ``acc`` and ``value``.
    """
    total = acc[0] + value[0]
    total_of_squares = acc[1] + value[1]
    n_items = acc[2] + value[2]
    return (total, total_of_squares, n_items)
    
# Function to map a one-column row to a single-element list holding its absolute value
def abs_max_map_function(x):
    """Return the absolute value of the first entry of ``x`` in a list.

    Args:
        x: An indexable record; only ``x[0]`` is read.

    Returns:
        A one-element list ``[abs(x[0])]``, so the function can be passed to
        ``flatMap``.
    """
    first_entry = x[0]
    return [abs(first_entry)]

# Function to get absolute maximum value
def abs_max_reduce_function(acc, value):
    """Return the larger of the running maximum and the next value.

    Args:
        acc: The largest value seen so far.
        value: The next candidate.

    Returns:
        ``value`` when it is strictly greater than ``acc``, otherwise ``acc``.
    """
    if value > acc:
        return value
    return acc

In [None]:
from pyspark.sql.types import DoubleType

# Normalize every feature column in two passes, each written as an explicit
# map/reduce over the column's RDD:
#   1. z-score:  (x - mean) / stddev
#   2. rescale:  divide by the largest absolute value so results lie in [-1, 1]
for feature in features:
    # Cast to double so all arithmetic below operates on floats.
    features_genre_df = features_genre_df.withColumn(feature, col(feature).cast(DoubleType()))

    # Map step. The column is selected by NAME before dropping to the RDD, so
    # the statistics always belong to `feature`. Reading it by position
    # (index + 1) was only correct when "genre" was the first column.
    mapped_rdd = features_genre_df.select(feature).rdd.flatMap(lambda x: z_score_map_function(x[0]))

    # Reduce step: fold the (value, value^2, 1) tuples into
    # (sum, sum of squares, count).
    reduced_result = mapped_rdd.reduce(z_score_reduce_function)

    sum_value, sum_of_squares, count = reduced_result

    mean = sum_value / count
    # Population variance. Rounding can push the result slightly below zero,
    # and a negative float raised to 0.5 yields a complex number, so clamp at 0.
    variance = max((sum_of_squares - (sum_value**2 / count)) / count, 0.0)
    stddev = variance ** 0.5

    # Perform z-score normalization. A constant column has stddev 0; dividing
    # by it would turn the column into nulls and break the abs-max step below,
    # so such a column is only centred (it becomes all zeros).
    if stddev > 0:
        features_genre_df = features_genre_df.withColumn(feature, (col(feature) - mean) / stddev)
    else:
        features_genre_df = features_genre_df.withColumn(feature, col(feature) - mean)

    # Make all values lie between -1 and 1: find the largest absolute value...
    mapped_rdd = features_genre_df.select(feature).rdd.flatMap(abs_max_map_function)
    reduced_result = mapped_rdd.reduce(abs_max_reduce_function)

    max_value = reduced_result

    # ...and divide by it. An all-zero column is already inside the range and
    # dividing by 0 would null it out, so it is left untouched.
    if max_value > 0:
        features_genre_df = features_genre_df.withColumn(feature, (col(feature) / max_value))

    print("Standard Deviation:", stddev)
    print("Mean:", mean)
    print ("Max Value:", max_value)
    

In [None]:
# Export to CSV: the Spark DataFrame is first collected into a pandas
# DataFrame on the driver, then written with a header row and no index column.
(
    features_genre_df
    .toPandas()
    .to_csv("normalized_songs.csv", header=True, index=False)
)

In [None]:
# Encode the string label column 'genre' as the numeric column 'indexed_genre'.
stringIndexer = StringIndexer(inputCol="genre", outputCol="indexed_genre")
si_model = stringIndexer.fit(features_genre_df)

# Pack all feature columns into a single vector column named "features";
# handleInvalid="skip" is passed so rows with invalid feature values are
# dropped instead of raising.
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="skip")

# Apply both transformations, then keep only the label and the feature vector.
td = (
    assembler
    .transform(si_model.transform(features_genre_df))
    .select("indexed_genre", "features")
)

td.printSchema()


In [None]:

# Preview at most ten rows of the (label, feature vector) table, rendered
# through pandas for a readable notebook display.
preview_rows = td.limit(10)
display(preview_rows.toPandas())


In [None]:
# Seeded 80/20 split of the assembled data into training and testing sets.
train_data, test_data = td.randomSplit([0.8, 0.2], seed=42)

# Bring the training labels to the driver as a list of Row objects.
train_labels = train_data.select("indexed_genre").collect()

# Random forest over the "indexed_genre" label: 30 trees, depth capped at 20,
# fixed seed; leafCol asks the model to also output leaf indices as "leafId".
rf = RandomForestClassifier(
    labelCol="indexed_genre",
    numTrees=30,
    maxDepth=20,
    seed=42,
    leafCol="leafId",
)

# Fit the forest on the training split.
model = rf.fit(train_data)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Distinct training labels as a list; the list order defines the row/column
# order of the confusion matrix built and drawn below.
genres = list(set(train_labels))
def get_score_confusion_matrix(model, test_Data, test_labels, labels=None):
    """Build a row-normalized confusion matrix from per-sample predictions.

    Args:
        model: Object with a ``predict`` method that accepts a one-element
            list of samples and returns an indexable result whose first entry
            is the predicted label.
            NOTE(review): this is a batch-style (scikit-learn-like) call; the
            Spark random-forest model fitted earlier in this notebook appears
            to expose a different ``predict`` signature — confirm before
            passing it here.
        test_Data: Indexable collection of samples, aligned with
            ``test_labels``.
        test_labels: Sized, indexable collection of true labels.
        labels: Optional ordered list of all labels; position in this list is
            the row/column index in the matrix. Defaults to the module-level
            ``genres`` list.

    Returns:
        A list of lists of floats where entry ``[i][j]`` is the fraction of
        samples with true label ``labels[i]`` that were predicted as
        ``labels[j]``. A label with no samples gets a row of zeros.

    Raises:
        ValueError: If a true or predicted label is not present in ``labels``.
    """
    if labels is None:
        labels = genres
    size = len(labels)

    score_matrix = [[0] * size for _ in range(size)]

    for i in range(len(test_labels)):
        pred = model.predict([test_Data[i]])
        score_matrix[labels.index(test_labels[i])][labels.index(pred[0])] += 1

    # Normalize each row by its own total. The total is computed once per row,
    # and a label that never occurs in test_labels keeps an all-zero row
    # instead of raising ZeroDivisionError.
    normalized = []
    for row in score_matrix:
        row_total = sum(row)
        if row_total == 0:
            normalized.append([0.0] * size)
        else:
            normalized.append([cell / row_total for cell in row])
    return normalized

def draw_confusion_matrix(confusion_matrix):
    """Render a confusion matrix as an annotated blue heatmap.

    Args:
        confusion_matrix: 2-D matrix of values to plot; both axes are ticked
            with the module-level ``genres`` list, with true labels on the
            y-axis and predicted labels on the x-axis.
    """
    _fig, ax = plt.subplots(figsize=(16, 5))
    sns.heatmap(
        confusion_matrix,
        annot=True,
        fmt='g',
        cmap='Blues',
        xticklabels=genres,
        yticklabels=genres,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.show()
        

# SVM

### best score: 0.5236602052451539

# Random Forests

### best score: 0.5653010723123454

# Decision Tree

### best score: 0.47594171020071485