# Exercice 1

## Import 

In [1]:
from pyspark.sql import SparkSession
import pyspark
import findspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql import DataFrameReader

In [2]:
#Initialisation de findspark
findspark.init()

In [3]:
#Création du SparkSession
spark = SparkSession.builder.appName("Exemple Spark").getOrCreate()
sc = SparkContext.getOrCreate()

## Importer le jeu de données

In [4]:
schema = StructType([\
    StructField("id_film", IntegerType(), True),\
    StructField("id_util", IntegerType(), True),\
    StructField("note", IntegerType(), True),\
    StructField("timestamp", IntegerType(), True)])

In [5]:
movies_ratings = (spark.read.format("csv").option("header","false").option("sep","\t").schema(schema).load("./Exercices_Dataframe/u.data"))
movies_ratings.show()

+-------+-------+----+---------+
|id_film|id_util|note|timestamp|
+-------+-------+----+---------+
|    196|    242|   3|881250949|
|    186|    302|   3|891717742|
|     22|    377|   1|878887116|
|    244|     51|   2|880606923|
|    166|    346|   1|886397596|
|    298|    474|   4|884182806|
|    115|    265|   2|881171488|
|    253|    465|   5|891628467|
|    305|    451|   3|886324817|
|      6|     86|   3|883603013|
|     62|    257|   2|879372434|
|    286|   1014|   5|879781125|
|    200|    222|   5|876042340|
|    210|     40|   3|891035994|
|    224|     29|   3|888104457|
|    303|    785|   3|879485318|
|    122|    387|   5|879270459|
|    194|    274|   2|879539794|
|    291|   1042|   4|874834944|
|    234|   1184|   2|892079237|
+-------+-------+----+---------+
only showing top 20 rows



## Split des données 80/20

In [6]:
# Create test and train set
(train, test) = movies_ratings.randomSplit([0.8, 0.2], seed = 42)

## Création du modèle et évaluation

In [7]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Create ALS model
als = ALS(
         itemCol="id_film",
         userCol="id_util", 
         ratingCol="note", 
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

In [8]:
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="note", predictionCol="prediction") 

## Prédictions et score du Modèle 

In [9]:
#Fit model to the 'train' dataset
model = als.fit(train)

# View the predictions
test_predictions = model.transform(test)
test_predictions.show()

RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

+-------+-------+----+---------+----------+
|id_film|id_util|note|timestamp|prediction|
+-------+-------+----+---------+----------+
|      6|    463|   4|883601713| 3.3823757|
|      7|    463|   4|891353192| 3.6681702|
|      7|    496|   5|891351083| 4.5561175|
|     10|    496|   5|877889005| 4.4123445|
|     13|    471|   1|882140455|  3.126554|
|     16|    471|   3|877724845| 4.0041533|
|     21|    148|   1|874951482| 2.4498372|
|     42|    496|   5|881107718|  4.707486|
|     65|    471|   4|879217434| 3.5055668|
|     67|    833|   4|875379794| 3.2711296|
|     75|    833|   2|884051113| 3.2874758|
|     83|    471|   3|891182000| 3.7135022|
|     84|    148|   4|883452274| 3.5118935|
|     92|    148|   2|877383934| 2.8538518|
|     92|    463|   4|875656623| 3.8172255|
|     94|    496|   3|885873159|  4.060447|
|     95|    471|   5|884266051|   3.58706|
|     95|    496|   4|879198746|  4.300857|
|     99|    471|   4|885679091| 3.5642917|
|    119|    471|   4|886177338|

## Recommandations

In [13]:
rec = model.recommendForAllUsers(5)
rec.show()

+-------+--------------------+
|id_util|     recommendations|
+-------+--------------------+
|      1|[{688, 5.5186486}...|
|     12|[{688, 5.311125},...|
|     22|[{688, 5.770404},...|
|     26|[{688, 4.84113}, ...|
|     27|[{519, 4.728087},...|
|     28|[{688, 5.523353},...|
|     31|[{688, 4.940987},...|
|     34|[{628, 4.419863},...|
|     44|[{688, 4.4623766}...|
|     47|[{688, 4.7291183}...|
|     52|[{519, 4.594738},...|
|     53|[{366, 4.670725},...|
|     65|[{300, 4.927075},...|
|     76|[{688, 4.8965645}...|
|     78|[{688, 4.5673275}...|
|     81|[{118, 4.411306},...|
|     85|[{546, 4.3037753}...|
|     91|[{928, 5.133658},...|
|     93|[{4, 4.8318295}, ...|
|    101|[{636, 4.9133534}...|
+-------+--------------------+
only showing top 20 rows



# Exercice 2

In [19]:
df = (spark.read.format("csv").option("header","true").load("./Exercices_SparkML/realestate.csv"))
df.show()

+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
| No|TransactionDate|HouseAge|DistanceToMRT|NumberConvenienceStores|Latitude|Longitude|PriceOfUnitArea|
+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
|  1|       2012.917|      32|     84.87882|                     10|24.98298|121.54024|           37.9|
|  2|       2012.917|    19.5|     306.5947|                      9|24.98034|121.53951|           42.2|
|  3|       2013.583|    13.3|     561.9845|                      5|24.98746|121.54391|           47.3|
|  4|         2013.5|    13.3|     561.9845|                      5|24.98746|121.54391|           54.8|
|  5|       2012.833|       5|     390.5684|                      5|24.97937|121.54245|           43.1|
|  6|       2012.667|     7.1|      2175.03|                      3|24.96305|121.51254|           32.1|
|  7|       2012.667|    34.5|     623.4731|                    

In [20]:
df2 = df.selectExpr("cast(No as int) No",
    "cast(TransactionDate as float) TransactionDate",
    "cast(HouseAge as float) HouseAge",
    "cast(DistanceToMRT as float) DistanceToMRT",
    "cast(NumberConvenienceStores as int) NumberConvenienceStores",
    "cast(Latitude as float) Latitude", 
    "cast(Longitude as float) Longitude",
    "cast(PriceOfUnitArea as float) PriceOfUnitArea")               
df2.printSchema()
df2.show()

root
 |-- No: integer (nullable = true)
 |-- TransactionDate: float (nullable = true)
 |-- HouseAge: float (nullable = true)
 |-- DistanceToMRT: float (nullable = true)
 |-- NumberConvenienceStores: integer (nullable = true)
 |-- Latitude: float (nullable = true)
 |-- Longitude: float (nullable = true)
 |-- PriceOfUnitArea: float (nullable = true)

+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
| No|TransactionDate|HouseAge|DistanceToMRT|NumberConvenienceStores|Latitude|Longitude|PriceOfUnitArea|
+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
|  1|       2012.917|    32.0|     84.87882|                     10|24.98298|121.54024|           37.9|
|  2|       2012.917|    19.5|     306.5947|                      9|24.98034|121.53951|           42.2|
|  3|       2013.583|    13.3|     561.9845|                      5|24.98746|121.54391|           47.3|
|  4|         2013.5|    

In [21]:
features = df.columns[0:6]
label = df.columns[-1]
print("features: ", features)
print("label: ", label)

features:  ['No', 'TransactionDate', 'HouseAge', 'DistanceToMRT', 'NumberConvenienceStores', 'Latitude']
label:  PriceOfUnitArea


In [22]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = features, outputCol="features")
output = assembler.transform(df)
output.select("features", "clicked").show(truncate=False)

IllegalArgumentException: Data type string of column No is not supported.
Data type string of column TransactionDate is not supported.
Data type string of column HouseAge is not supported.
Data type string of column DistanceToMRT is not supported.
Data type string of column NumberConvenienceStores is not supported.
Data type string of column Latitude is not supported.

In [None]:
train, test = output.randomSplit([0.8, 0.2], seed = 42)
print(train.count())
print(test.count())

In [None]:
Decison_Tree = DecisionTreeRegressor(featuresCol = "features", labelCol = label)

In [None]:
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol=label, predictionCol="prediction") 

In [None]:
#Fit model to the 'train' dataset
model = Decison_Tree.fit(train)

# View the predictions
test_predictions = model.transform(test)
test_predictions.show()

RMSE = evaluator.evaluate(test_predictions)
print(RMSE)