In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower



In [10]:
spark = SparkSession.builder.appName("merge-lyrics").getOrCreate()



In [11]:
# ── 1. Load the two CSVs ───────────────────────────────────────────────
mend_path    = "data/mendeley.csv"      # adjust
student_path = "data/Student_dataset.csv"        # root-level as per rubric

mend_raw  = spark.read.option("header", True).csv(mend_path)
stud_raw  = spark.read.option("header", True).csv(student_path)



In [12]:
# ── 2. Keep only the five required columns & normalise column names ───
keep_cols = ["artist_name", "track_name", "release_date", "genre", "lyrics"]

mend = (mend_raw.select(keep_cols)
        .withColumn("genre",  trim(lower(col("genre"))))
        .withColumn("release_date", col("release_date").substr(1, 4)))  # keep YYYY

stud = (stud_raw.select(keep_cols)
        .withColumn("genre",  trim(lower(col("genre"))))
        .withColumn("release_date", col("release_date").substr(1, 4)))



In [13]:
# ── 3. Merge (union) ──────────────────────────────────────────────────
merged = mend.unionByName(stud)

# Optional: drop exact lyric duplicates (rare but nice)
# merged = merged.dropDuplicates(subset=["lyrics"])

print("Merged rows:", merged.count())



Merged rows: 28522


In [14]:

# ── 4. Write to CSV with header ───────────────────────────────────────
merged.coalesce(1).write.mode("overwrite") \
      .option("header", True) \
      .csv("Merged_dataset_tmp")



In [15]:
# Move/rename the single part-file to the final required name
import shutil, glob, os
tmp_file = glob.glob("Merged_dataset_tmp/part-*")[0]
shutil.move(tmp_file, "data/Merged_dataset.csv")
shutil.rmtree("Merged_dataset_tmp")

print("✅  Merged_dataset.csv written")
spark.stop()

✅  Merged_dataset.csv written


In [16]:
import pandas as pd

In [17]:
df1 = pd.read_csv("data/Merged_dataset.csv")

ParserError: Error tokenizing data. C error: Expected 5 fields in line 1147, saw 6


In [19]:
import pandas as pd
df = pd.read_csv("data/Merged_dataset.csv",
                 on_bad_lines="skip",        # pandas ≥1.3
                 engine="python")            # slower but more tolerant
print(df.shape)


(28366, 5)


# Using Pandas

In [22]:
import pandas as pd
import csv
from pathlib import Path

# ── 1.  Load Mendeley (robust parser settings) ─────────────────────────
mend_path    = Path("data/mendeley.csv")
student_path = Path("data/Student_dataset.csv")

keep = ["artist_name", "track_name", "release_date", "genre", "lyrics"]


In [23]:
mendeley = pd.read_csv(mend_path)
student = pd.read_csv(student_path)

In [25]:
mendeley.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [36]:
mendeley = mendeley[keep]
student = student[keep] 

In [37]:
mendeley.head()

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics
0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...
1,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...
2,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...
3,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...
4,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...


In [38]:
student.head()

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics
0,Anuel AA,Hasta Que Dios Diga,2020,Reggae,"[Intro: Anuel AA] Brr Hoy la noche se acaba, t..."
1,Bryant Myers,Gan-Ga - Remix,2019,Reggae,[Intro: Anuel AA & Bryant Myers] Brr Lanalizer...
2,Ele A El Dominio,Codeine Crazy - Spanish Remix,2019,Reggae,"[Intro] 'Toy bien loco Ajá Ah, ah Overdose Com..."
3,Yannc,Sigue Bailandome,2018,Reggae,"[Intro: Darkiel, Myke Towers & Brray] (Sheesh)..."
4,Hale,The Day You Said Goodnight,2005,Reggae,Take me as you are Push me off the road The sa...


In [39]:
# ── minimal normalisation (optional but recommended) ───────────────
for frame in (mendeley, student):
    frame["genre"]        = frame["genre"].str.strip().str.lower()
    frame["release_date"] = frame["release_date"].astype(str).str[:4]  # keep YYYY

In [40]:
# ------------------------------------------------------------------
# concatenate the two data-frames row-wise
# ------------------------------------------------------------------
merged = pd.concat([mendeley, student], ignore_index=True)

In [41]:
# optional: drop rows with identical lyric text
# merged = merged.drop_duplicates(subset=["lyrics"])

print("Merged shape:", merged.shape)

Merged shape: (28522, 5)


In [43]:
# ------------------------------------------------------------------
# write a clean, safely-quoted CSV for the assignment
# ------------------------------------------------------------------
merged.to_csv(
    "data/Merged_dataset.csv",
    index=False,
    quoting=csv.QUOTE_ALL,   # wrap every field in double quotes
    lineterminator="\n",
    encoding="utf-8"
)