In [405]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("taylor").getOrCreate()
df = spark.read.csv("taylor.csv", header=True, inferSchema=True, sep=";")
df.show()

+------------+--------------------+----+-------------------+-------+-------+-----+
|       album|               title|year|           duration|spotify|youtube|grade|
+------------+--------------------+----+-------------------+-------+-------+-----+
|Taylor Swift|          Tim McGraw|2006|2024-02-04 03:52:00|    102|     50|  7,9|
|Taylor Swift|     Picture To Burn|2008|2024-02-04 02:53:00|    143|    118| 8,85|
|Taylor Swift|Teardrops On My G...|2007|2024-02-04 03:23:00|    177|    166|  8,7|
|Taylor Swift|A Place in This W...|2006|2024-02-04 03:19:00|     30|    0,7| 9,25|
|Taylor Swift|         Cold As You|2006|2024-02-04 03:59:00|     31|   0,75| 7,85|
|Taylor Swift|         The Outside|2006|2024-02-04 03:27:00|     21|    0,5| 7,55|
|Taylor Swift|Tied Together wit...|2006|2024-02-04 04:08:00|     24|    0,5| 7,35|
|Taylor Swift|      Stay Beautiful|2006|2024-02-04 03:56:00|     26|    0,5|  6,5|
|Taylor Swift|   Should've Said No|2008|2024-02-04 04:02:00|     92|      2| 8,35|
|Tay

In [406]:
from pyspark.sql.functions import hour, minute, col
from pyspark.sql.functions import regexp_replace

df = df.withColumn("grade", regexp_replace("grade", ",", "."))
df = df.withColumn("grade", df["grade"].cast("double"))
df = df.withColumn("youtube", df["youtube"].cast("integer"))


df = df.withColumn("minutes", hour("duration"))
df = df.withColumn("seconds", minute("duration"))
df = df.withColumn("duration", col("minutes") * 60 + col("seconds")).drop("hour", "minute")
df = df.withColumn("duration", df["duration"].cast("integer"))
df = df.drop("minutes")
df = df.drop("seconds")


df.printSchema()

cols = df.columns
print(cols)

root
 |-- album: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- spotify: integer (nullable = true)
 |-- youtube: integer (nullable = true)
 |-- grade: double (nullable = true)

['album', 'title', 'year', 'duration', 'spotify', 'youtube', 'grade']


In [407]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="album", outputCol="album_index")
indexed = indexer.fit(df).transform(df)

In [409]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


assembler = VectorAssembler(inputCols=[
    'year',
'duration',
 'spotify',
 'youtube',
 'album_index'], 
outputCol="features",
handleInvalid="skip")

In [410]:
output = assembler.transform(indexed)
output.select("features", "grade").show()

+--------------------+-----+
|            features|grade|
+--------------------+-----+
|[2006.0,232.0,102...|  7.9|
|[2008.0,173.0,143...| 8.85|
|[2007.0,203.0,177...|  8.7|
|[2008.0,242.0,92....| 8.35|
|[2007.0,201.0,243...| 8.65|
|[2006.0,213.0,38....|  8.6|
|[2008.0,241.0,167...|9.725|
|[2008.0,294.0,106...|  7.6|
|[2008.0,235.0,766...| 9.45|
|[2008.0,234.0,104...|  8.2|
|[2008.0,231.0,554...| 9.74|
|[2008.0,261.0,54....|  7.2|
|[2008.0,243.0,309...| 9.65|
|[2008.0,245.0,39....|  6.9|
|[2008.0,279.0,34....| 7.65|
|[2008.0,237.0,31....| 7.35|
|[2008.0,263.0,28....| 7.45|
|[2021.0,277.0,284...|  9.2|
|[2010.0,237.0,38....| 8.75|
|[2010.0,231.0,107...|  9.2|
+--------------------+-----+
only showing top 20 rows



In [416]:
final_data = output.select("features", "grade")

In [417]:
train_data, test_data = final_data.randomSplit([0.75, 0.25])

In [1]:
train_data.describe().show()
test_data.describe().show()

NameError: name 'train_data' is not defined

In [420]:
from pyspark.ml.regression import LinearRegression

taylor_lr = LinearRegression(labelCol="grade")
trained_taylor_model = taylor_lr.fit(train_data)

taylor_results = trained_taylor_model.evaluate(test_data)

In [421]:
taylor_results.r2

0.3899618056470763

In [422]:
taylor_results.rootMeanSquaredError

0.5857403741129606

In [423]:
taylor_results.meanAbsoluteError

0.5032361843606562

In [424]:
taylor_results.meanSquaredError

0.34309178586599104

In [425]:
df.columns

['album', 'title', 'year', 'duration', 'spotify', 'youtube', 'grade']

In [426]:
from pyspark.sql.functions import corr
df.select(corr("grade", "spotify")).show()

+--------------------+
|corr(grade, spotify)|
+--------------------+
|  0.5363427965233678|
+--------------------+

