# Anime LR Problem

Import the data

In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName('Anime').getOrCreate()
sc = spark.sparkContext

22/12/04 22:33:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# read data
df = spark.read.option("header","true").csv("anime.csv",inferSchema = True)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

Preprocess the data

In [3]:
# drop NA value
df = df[df['Score'] != 'Unknown']
df = df[df['Source'] != 'Unknown']
df = df[df['Rating'] != 'Unknown']

In [4]:
# convert string to float & integer
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType
df = df.withColumn('Score',col('Score').cast('float'))
df = df.withColumn('Popularity',col('Popularity').cast('int'))

In [5]:
# create a new column 'Views'
df = df.withColumn('Views',df['Score-10']+df['Score-9']+df['Score-8']+df['Score-7']+df['Score-6']
                  +df['Score-5']+df['Score-4']+df['Score-3']+df['Score-2']+df['Score-1'])
df = df.dropna()

In [6]:
# select column
df = df.select(['Score','Type','Source','Rating','Popularity','Members','Favorites','Watching',
                'On-Hold','Dropped','Plan to Watch','Views'])
df.show(5)

22/12/04 22:34:04 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----+-----+--------+--------------------+----------+-------+---------+--------+-------+-------+-------------+--------+
|Score| Type|  Source|              Rating|Popularity|Members|Favorites|Watching|On-Hold|Dropped|Plan to Watch|   Views|
+-----+-----+--------+--------------------+----------+-------+---------+--------+-------+-------+-------------+--------+
| 8.78|   TV|Original|R - 17+ (violence...|        39|1251960|    61971|  105808|  71513|  26678|       329800|641705.0|
| 8.39|Movie|Original|R - 17+ (violence...|       518| 273145|     1174|    4143|   1935|    770|        57964|160349.0|
| 8.24|   TV|   Manga|PG-13 - Teens 13 ...|       201| 558913|    12944|   29113|  25465|  13925|       146918|286146.0|
| 7.27|   TV|Original|PG-13 - Teens 13 ...|      1467|  94683|      587|    4300|   5121|   5378|        33719| 39094.0|
| 6.98|   TV|   Manga|       PG - Children|      4369|  13224|       18|     642|    766|   1108|         3394|  5923.0|
+-----+-----+--------+----------

In [7]:
# x and y variables
df.printSchema()

root
 |-- Score: float (nullable = true)
 |-- Type: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- Members: integer (nullable = true)
 |-- Favorites: integer (nullable = true)
 |-- Watching: integer (nullable = true)
 |-- On-Hold: integer (nullable = true)
 |-- Dropped: integer (nullable = true)
 |-- Plan to Watch: integer (nullable = true)
 |-- Views: double (nullable = true)



In [8]:
# summary
df.summary().toPandas()

                                                                                

Unnamed: 0,summary,Score,Type,Source,Rating,Popularity,Members,Favorites,Watching,On-Hold,Dropped,Plan to Watch,Views
0,count,10123.0,10123,10123,10123,10123.0,10123.0,10123.0,10123.0,10123.0,10123.0,10123.0,10123.0
1,mean,6.618139882374102,,,,5730.270473179888,58803.960979946656,789.5186209621654,3820.6660081003656,1637.2161414600414,1996.3671836412127,13467.02212782772,30493.59379630544
2,stddev,0.8857704884898757,,,,3690.463058906929,160664.97255351138,5327.693221235331,18320.451425310617,5532.227784066705,6109.466350261791,29741.68165984968,96417.33167399192
3,min,1.85,Movie,4-koma manga,G - All Ages,1.0,172.0,0.0,0.0,0.0,0.0,13.0,101.0
4,25%,6.07,,,,2580.0,2243.0,3.0,87.0,54.0,77.0,789.0,745.0
5,50%,6.64,,,,5357.0,7999.0,17.0,363.0,210.0,198.0,2509.0,3110.0
6,75%,7.24,,,,8557.0,39050.0,116.0,1543.0,815.0,862.0,10587.0,17351.0
7,max,9.19,TV,Web manga,Rx - Hentai,15374.0,2589552.0,183914.0,887333.0,187919.0,174710.0,425531.0,1826691.0


Linear Regression

In [9]:
# convert string variables
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml import Pipeline
convert = [StringIndexer(inputCol = column, outputCol = column+"_index")
            .fit(df) for column in ['Type','Source','Rating']]
pipeline = Pipeline(stages = convert)
df = pipeline.fit(df).transform(df)
df = df.drop('Type','Source','Rating')
df.show(10)

                                                                                

+-----+----------+-------+---------+--------+-------+-------+-------------+--------+----------+------------+------------+
|Score|Popularity|Members|Favorites|Watching|On-Hold|Dropped|Plan to Watch|   Views|Type_index|Source_index|Rating_index|
+-----+----------+-------+---------+--------+-------+-------+-------------+--------+----------+------------+------------+
| 8.78|        39|1251960|    61971|  105808|  71513|  26678|       329800|641705.0|       0.0|         1.0|         3.0|
| 8.39|       518| 273145|     1174|    4143|   1935|    770|        57964|160349.0|       2.0|         1.0|         3.0|
| 8.24|       201| 558913|    12944|   29113|  25465|  13925|       146918|286146.0|       0.0|         0.0|         0.0|
| 7.27|      1467|  94683|      587|    4300|   5121|   5378|        33719| 39094.0|       0.0|         1.0|         0.0|
| 6.98|      4369|  13224|       18|     642|    766|   1108|         3394|  5923.0|       0.0|         0.0|         5.0|
| 7.95|      1003| 14825

In [10]:
# create vector
feature = VectorAssembler(inputCols = df.columns[1:],outputCol = "Features")
feature_vector = feature.transform(df)

In [11]:
# split data to train and test subset
(traindata,testdata) = feature_vector.randomSplit([0.8, 0.2],seed = 42)

In [12]:
# model
from pyspark.ml.regression import LinearRegression
score_lr = LinearRegression(featuresCol = 'Features',labelCol = 'Score')
train_model = score_lr.fit(traindata)
results = train_model.evaluate(traindata)

22/12/04 22:34:33 WARN Instrumentation: [1c3ae9dc] regParam is zero, which might cause numerical instability and overfitting.
22/12/04 22:34:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/04 22:34:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/12/04 22:34:33 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/12/04 22:34:33 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [14]:
# coeffcients
train_model.coefficients

DenseVector([-0.0002, -0.0, -0.0, 0.0, 0.0001, -0.0001, 0.0, 0.0, 0.0143, -0.0051, -0.0449])

In [13]:
# error rate
print('Mean Squared Error :',results.meanSquaredError)
print('Rsquared Error :',results.r2)

Mean Squared Error : 0.3685942532977168
Rsquared Error : 0.5320844620570304


In [14]:
# predictions
predict_data = testdata.select('Features')
predictions = train_model.transform(predict_data)
predictions.show(10)

+--------------------+-----------------+
|            Features|       prediction|
+--------------------+-----------------+
|[2216.0,52059.0,2...|7.215597854550731|
|[6805.0,4426.0,27...|6.266605410005332|
|[9448.0,1577.0,1....|5.898949599246521|
|[9409.0,1597.0,1....|5.905856513538575|
|[9967.0,1249.0,4....|5.845432882437087|
|[8817.0,2013.0,1....|6.208256058983739|
|[997.0,149948.0,2...|6.422304324758633|
|[8950.0,1908.0,0....|6.051613506792226|
|[10445.0,1016.0,1...|5.874748942111927|
|[6007.0,6051.0,18...|6.637592076915379|
+--------------------+-----------------+
only showing top 10 rows

