<a href="https://colab.research.google.com/github/basadhi/music_genre/blob/main/Music_Genre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification model

In [1]:
pip install pyspark pandas matplotlib streamlit

Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [3]:
spark = SparkSession.builder \
    .appName("MusicGenreClassifier") \
    .getOrCreate()

In [5]:
mendeley_df = spark.read.csv("/content/drive/MyDrive/tcc_ceds_music.csv", header=True, inferSchema=True)

In [6]:
columns_needed = ["artist_name", "track_name", "release_date", "genre", "lyrics"]
mendeley_df = mendeley_df.select(columns_needed)

In [7]:
train_df, test_df = mendeley_df.randomSplit([0.8, 0.2], seed=42)

In [8]:
tokenizer = Tokenizer(inputCol="lyrics", outputCol="words")
stopword_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10000)
idf = IDF(inputCol="raw_features", outputCol="features")
label_indexer = StringIndexer(inputCol="genre", outputCol="label")

In [11]:
rf_classifier = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)

In [12]:
pipeline = Pipeline(stages=[
    tokenizer,
    stopword_remover,
    hashing_tf,
    idf,
    label_indexer,
    rf_classifier])

In [13]:
model = pipeline.fit(train_df)

In [14]:
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.2467


In [16]:
class_counts = mendeley_df.groupBy("genre").count()
class_counts.show()

+-------+-----+
|  genre|count|
+-------+-----+
|    pop| 7042|
|  blues| 4604|
|country| 5445|
|   jazz| 3845|
|hip hop|  904|
|   rock| 4034|
| reggae| 2498|
+-------+-----+



In [18]:
majority_count = class_counts.agg(F.max("count")).collect()[0][0]
print(majority_count)

7042


In [19]:
def upsample_class(df, label_col, majority_count):
    genres = [row['genre'] for row in class_counts.collect()]
    balanced_dfs = []

    for g in genres:
        class_df = df.filter(F.col(label_col) == g)
        count = class_df.count()

        # Calculate how many times to repeat
        repeat_factor = majority_count // count
        remainder = majority_count % count

        # Repeat full dataset
        repeated_df = class_df
        for _ in range(repeat_factor - 1):
            repeated_df = repeated_df.union(class_df)

        # Sample additional remainder rows if needed
        if remainder > 0:
            sampled_df = class_df.sample(withReplacement=True, fraction=(remainder / count + 0.01), seed=42).limit(remainder)
            repeated_df = repeated_df.union(sampled_df)

        balanced_dfs.append(repeated_df)

    # Combine all balanced classes
    balanced_df = balanced_dfs[0]
    for bdf in balanced_dfs[1:]:
        balanced_df = balanced_df.union(bdf)

    return balanced_df


In [20]:
balanced_df = upsample_class(mendeley_df, label_col="genre", majority_count=majority_count)

In [21]:
balanced_df.groupBy("genre").count().show()

+-------+-----+
|  genre|count|
+-------+-----+
|    pop| 7042|
|  blues| 7042|
|country| 7042|
|   jazz| 7042|
|hip hop| 7032|
|   rock| 7042|
| reggae| 7042|
+-------+-----+



In [30]:
train_df_bl, test_df_bl = balanced_df.randomSplit([0.8, 0.2], seed=42)

In [23]:
model = pipeline.fit(train_df_bl)

In [24]:
predictions = model.transform(test_df_bl)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.3456


In [25]:
# Desired count per class
TARGET_COUNT = 1000

# Step 1: Get list of genres
genres = [row['genre'] for row in mendeley_df.select('genre').distinct().collect()]

# Step 2: For each genre, sample up to 1000 rows
balanced_dfs = []
for g in genres:
    class_df = mendeley_df.filter(F.col("genre") == g)

    # If class has more than TARGET_COUNT, downsample randomly
    class_count = class_df.count()
    if class_count >= TARGET_COUNT:
        sampled_df = class_df.sample(withReplacement=False, fraction=1.0, seed=42).limit(TARGET_COUNT)
    else:
        # If less, keep as is (OR: upsample if you want, but you didn’t ask for that here)
        sampled_df = class_df

    balanced_dfs.append(sampled_df)

#Union all classes together
downsampled_df = balanced_dfs[0]
for df in balanced_dfs[1:]:
    downsampled_df = downsampled_df.union(df)

downsampled_df.groupBy("genre").count().show()

+-------+-----+
|  genre|count|
+-------+-----+
|    pop| 1000|
|  blues| 1000|
|country| 1000|
|   jazz| 1000|
|hip hop|  904|
|   rock| 1000|
| reggae| 1000|
+-------+-----+



In [39]:
train_df_down, test_df_down = downsampled_df.randomSplit([0.8, 0.2], seed=42)

In [27]:
model = pipeline.fit(train_df_down)

In [28]:
predictions = model.transform(test_df_down)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.2995


In [29]:
from pyspark.ml.classification import LogisticRegression

lr_classifier = LogisticRegression(labelCol="label", featuresCol="features", maxIter=100, regParam=0.01)

pipeline_lr = Pipeline(stages=[
    label_indexer,
    tokenizer,
    stopword_remover,
    hashing_tf,
    idf,
    lr_classifier
])

In [32]:
model = pipeline_lr.fit(train_df)

In [34]:
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.3346


In [36]:
model = pipeline_lr.fit(train_df_bl)

In [37]:
predictions = model.transform(test_df_bl)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.6445


In [40]:
model = pipeline_lr.fit(train_df_down)

In [41]:
predictions = model.transform(test_df_down)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.3346


Highest accuracy is given by the logistic regression model trained on the upsampled mendely dataset.

In [42]:
lr_model = pipeline_lr.fit(train_df_bl)

In [52]:
lr_model.save("lr_model_music_genre")

In [43]:
save_path = "/content/drive/My Drive/pyspark_models/lr_model_music_genre"
lr_model.write().overwrite().save(save_path)

In [46]:
!zip -r /content/drive/My\ Drive/pyspark_models/lr_model_music_genre.zip /content/drive/My\ Drive/pyspark_models/lr_model_music_genre

  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/metadata/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/metadata/part-00000 (deflated 25%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/metadata/.part-00000.crc (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/metadata/_SUCCESS (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/metadata/._SUCCESS.crc (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/stages/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/stages/0_StringIndexer_717b58ae37aa/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/stages/0_StringIndexer_717b58ae37aa/metadata/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_music_genre/stages/0_St

In [51]:
labels = label_indexer.fit(balanced_df).labels
with open('label_mapping.txt', 'w') as f:
    for label in labels:
        f.write(label + '\n')

In [56]:
lr_model.save('lr_model_new')


# Dataset

In [64]:
import pandas as pd

mendely = pd.read_csv("/content/drive/MyDrive/tcc_ceds_music.csv")
student = pd.read_csv("/content/Student_dataset_clean_fixed (2).csv")

In [66]:
merged = pd.concat([student, mendely], ignore_index=True)

# Show the first few rows of the merged DataFrame
print(merged.head())

# Save the merged DataFrame as a CSV file
merged.to_csv("merged_music_data.csv", index=False)

       artist_name                        track_name  release_date genre  \
0         Al Green               Let's Stay Together          1972  Soul   
1      Marvin Gaye                   What's Going On          1971  Soul   
2  Aretha Franklin                           Respect          1967  Soul   
3     Otis Redding  (Sittin' On) The Dock of the Bay          1968  Soul   
4     Bill Withers                 Ain't No Sunshine          1971  Soul   

                                              lyrics  Unnamed: 0  len  dating  \
0  I, I'm so in love with you Whatever you want t...         NaN  NaN     NaN   
1  Mother, mother There's too many of you crying ...         NaN  NaN     NaN   
2  What you want Baby, I got it What you need Do ...         NaN  NaN     NaN   
3  Sittin' in the mornin' sun I'll be sittin' whe...         NaN  NaN     NaN   
4  Ain't no sunshine when she's gone. It's not wa...         NaN  NaN     NaN   

   violence  world/life  ...  sadness  feelings  danceab

In [69]:
columns = ['Unnamed: 0', 'len', 'dating', 'violence', 'world/life', 'night/time',
       'shake the audience', 'family/gospel', 'romantic', 'communication',
       'obscene', 'music', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
       'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
       'topic', 'age']

In [70]:
merged.drop(columns=columns, inplace=True)

In [71]:
merged.head()

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics
0,Al Green,Let's Stay Together,1972,Soul,"I, I'm so in love with you Whatever you want t..."
1,Marvin Gaye,What's Going On,1971,Soul,"Mother, mother There's too many of you crying ..."
2,Aretha Franklin,Respect,1967,Soul,"What you want Baby, I got it What you need Do ..."
3,Otis Redding,(Sittin' On) The Dock of the Bay,1968,Soul,Sittin' in the mornin' sun I'll be sittin' whe...
4,Bill Withers,Ain't No Sunshine,1971,Soul,Ain't no sunshine when she's gone. It's not wa...


In [72]:
merged.isnull().sum()

Unnamed: 0,0
artist_name,0
track_name,0
release_date,0
genre,0
lyrics,0


In [74]:
merged.to_csv("merged_music_data1.csv", index=False)

In [75]:
# Save the merged DataFrame to a specific folder in Google Drive
file_path = "/content/drive/My Drive/merged_music_data1.csv"
merged.to_csv(file_path, index=False)

print("File saved to Google Drive:", file_path)


File saved to Google Drive: /content/drive/My Drive/merged_music_data1.csv


# Classification for merged dataset

In [76]:
merged_df = spark.read.csv("/content/drive/MyDrive/merged_music_data1.csv", header=True, inferSchema=True)

In [77]:
from pyspark.sql.functions import col,isnan, when, count
df2 = merged_df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c
                           )).alias(c)
                    for c in merged_df.columns])
df2.show()


+-----------+----------+------------+-----+------+
|artist_name|track_name|release_date|genre|lyrics|
+-----------+----------+------------+-----+------+
|          0|         0|           0|    0|     0|
+-----------+----------+------------+-----+------+



In [79]:
train_df_merged, test_df_merged = merged_df.randomSplit([0.8, 0.2], seed=42)

In [80]:
lr_model_merged = pipeline_lr.fit(train_df_merged)

In [85]:
predictions_merged = lr_model_merged.transform(test_df_merged)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_merged)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.3234


In [82]:
balanced_df_merged = upsample_class(merged_df, label_col="genre", majority_count=majority_count)

In [83]:
train_df_merged_bl, test_df_merged_bl = balanced_df_merged.randomSplit([0.8, 0.2], seed=42)

In [84]:
lr_model_merged_bl = pipeline_lr.fit(train_df_merged_bl)

In [86]:
predictions_merged_bl = lr_model_merged_bl.transform(test_df_merged_bl)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_merged_bl)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.6432


In [87]:
lr_model.save("lr_model_merged_music_genre")

In [88]:
save_path = "/content/drive/My Drive/pyspark_models/lr_model_merged_music_genre"
lr_model.write().overwrite().save(save_path)

In [89]:
!zip -r /content/drive/My\ Drive/pyspark_models/lr_model_merged_music_genre.zip /content/drive/My\ Drive/pyspark_models/lr_model_merged_music_genre

  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/metadata/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/metadata/part-00000 (deflated 25%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/metadata/.part-00000.crc (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/metadata/_SUCCESS (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/metadata/._SUCCESS.crc (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/stages/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/stages/0_StringIndexer_717b58ae37aa/ (stored 0%)
  adding: content/drive/My Drive/pyspark_models/lr_model_merged_music_genre/stages/0_StringIndexer_717b58ae37aa/metadata/ (stored 0%)
  adding: content