# Exercice 1

## Import 

In [1]:
from pyspark.sql import SparkSession
import pyspark
import findspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql import DataFrameReader

In [2]:
#Initialisation de findspark
findspark.init()

In [3]:
#Création du SparkSession
spark = SparkSession.builder.appName("Exemple Spark").getOrCreate()
sc = SparkContext.getOrCreate()

## Importer le jeu de données

In [4]:
schema = StructType([\
    StructField("id_film", IntegerType(), True),\
    StructField("id_util", IntegerType(), True),\
    StructField("note", IntegerType(), True),\
    StructField("timestamp", IntegerType(), True)])

In [5]:
movies_ratings = (spark.read.format("csv").option("header","false").option("sep","\t").schema(schema).load("../Exercices_Dataframe/u.data"))
movies_ratings.show()

+-------+-------+----+---------+
|id_film|id_util|note|timestamp|
+-------+-------+----+---------+
|    196|    242|   3|881250949|
|    186|    302|   3|891717742|
|     22|    377|   1|878887116|
|    244|     51|   2|880606923|
|    166|    346|   1|886397596|
|    298|    474|   4|884182806|
|    115|    265|   2|881171488|
|    253|    465|   5|891628467|
|    305|    451|   3|886324817|
|      6|     86|   3|883603013|
|     62|    257|   2|879372434|
|    286|   1014|   5|879781125|
|    200|    222|   5|876042340|
|    210|     40|   3|891035994|
|    224|     29|   3|888104457|
|    303|    785|   3|879485318|
|    122|    387|   5|879270459|
|    194|    274|   2|879539794|
|    291|   1042|   4|874834944|
|    234|   1184|   2|892079237|
+-------+-------+----+---------+
only showing top 20 rows



## Split des données 80/20

In [6]:
# Create test and train set
(train, test) = movies_ratings.randomSplit([0.8, 0.2], seed = 42)

## Création du modèle et évaluation

In [7]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Create ALS model
als = ALS(
         itemCol="id_film",
         userCol="id_util", 
         ratingCol="note", 
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

In [8]:
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="note", predictionCol="prediction") 

## Prédictions et score du Modèle 

In [10]:
#Fit model to the 'train' dataset
model = als.fit(train)

# View the predictions
test_predictions = model.transform(test)
test_predictions.show()

RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

+-------+-------+----+---------+----------+
|id_film|id_util|note|timestamp|prediction|
+-------+-------+----+---------+----------+
|      6|    463|   4|883601713|   3.69308|
|      7|    463|   4|891353192| 3.8839319|
|      7|    496|   5|891351083|  4.479812|
|     10|    496|   5|877889005|  4.421669|
|     13|    471|   1|882140455|  3.233928|
|     16|    471|   3|877724845| 4.0094953|
|     21|    148|   1|874951482|  2.457415|
|     42|    496|   5|881107718| 4.7317567|
|     65|    471|   4|879217434|   3.67897|
|     67|    833|   4|875379794| 3.4029696|
|     75|    833|   2|884051113| 3.2552097|
|     83|    471|   3|891182000| 3.7587712|
|     84|    148|   4|883452274| 3.3591747|
|     92|    148|   2|877383934| 2.8067575|
|     92|    463|   4|875656623| 3.7974286|
|     94|    496|   3|885873159| 4.1130147|
|     95|    471|   5|884266051| 3.3027093|
|     95|    496|   4|879198746| 4.2307005|
|     99|    471|   4|885679091| 3.5876184|
|    119|    471|   4|886177338|

## Recommandations

In [11]:
rec = model.recommendForAllUsers(5)
rec.show()

+-------+--------------------+
|id_util|     recommendations|
+-------+--------------------+
|      1|[{688, 5.532333},...|
|     12|[{688, 5.5139046}...|
|     22|[{688, 5.622994},...|
|     26|[{849, 4.387957},...|
|     27|[{137, 4.839761},...|
|     28|[{688, 5.306454},...|
|     31|[{849, 4.9167933}...|
|     34|[{38, 5.372229}, ...|
|     44|[{688, 4.933082},...|
|     47|[{808, 4.6845784}...|
|     52|[{688, 4.6760197}...|
|     53|[{636, 4.443152},...|
|     65|[{688, 4.6812096}...|
|     76|[{849, 4.7106395}...|
|     78|[{38, 4.619017}, ...|
|     81|[{697, 4.775359},...|
|     85|[{519, 4.426957},...|
|     91|[{355, 4.8935}, {...|
|     93|[{928, 5.1379266}...|
|    101|[{519, 5.021376},...|
+-------+--------------------+
only showing top 20 rows



# Exercice 2

In [12]:
df = (spark.read.format("csv").option("header","true").load("../Exercices_SparkML/realestate.csv"))
df.show()

+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
| No|TransactionDate|HouseAge|DistanceToMRT|NumberConvenienceStores|Latitude|Longitude|PriceOfUnitArea|
+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
|  1|       2012.917|      32|     84.87882|                     10|24.98298|121.54024|           37.9|
|  2|       2012.917|    19.5|     306.5947|                      9|24.98034|121.53951|           42.2|
|  3|       2013.583|    13.3|     561.9845|                      5|24.98746|121.54391|           47.3|
|  4|         2013.5|    13.3|     561.9845|                      5|24.98746|121.54391|           54.8|
|  5|       2012.833|       5|     390.5684|                      5|24.97937|121.54245|           43.1|
|  6|       2012.667|     7.1|      2175.03|                      3|24.96305|121.51254|           32.1|
|  7|       2012.667|    34.5|     623.4731|                    

In [13]:
features = df.columns[0:6]
label = df.columns[-1]
print("features: ", features)
print("label: ", label)

features:  ['No', 'TransactionDate', 'HouseAge', 'DistanceToMRT', 'NumberConvenienceStores', 'Latitude']
label:  PriceOfUnitArea


In [14]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = features, outputCol="features")

In [15]:
df2 = df.selectExpr("cast(No as int) No",
    "cast(TransactionDate as float) TransactionDate",
    "cast(HouseAge as float) HouseAge",
    "cast(DistanceToMRT as float) DistanceToMRT",
    "cast(NumberConvenienceStores as int) NumberConvenienceStores",
    "cast(Latitude as float) Latitude", 
    "cast(Longitude as float) Longitude",
    "cast(PriceOfUnitArea as float) PriceOfUnitArea")               
df2.printSchema()
df2.show()

root
 |-- No: integer (nullable = true)
 |-- TransactionDate: float (nullable = true)
 |-- HouseAge: float (nullable = true)
 |-- DistanceToMRT: float (nullable = true)
 |-- NumberConvenienceStores: integer (nullable = true)
 |-- Latitude: float (nullable = true)
 |-- Longitude: float (nullable = true)
 |-- PriceOfUnitArea: float (nullable = true)

+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
| No|TransactionDate|HouseAge|DistanceToMRT|NumberConvenienceStores|Latitude|Longitude|PriceOfUnitArea|
+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
|  1|       2012.917|    32.0|     84.87882|                     10|24.98298|121.54024|           37.9|
|  2|       2012.917|    19.5|     306.5947|                      9|24.98034|121.53951|           42.2|
|  3|       2013.583|    13.3|     561.9845|                      5|24.98746|121.54391|           47.3|
|  4|         2013.5|    

In [18]:
output = assembler.transform(df2)
output.select("features", "PriceOfUnitArea").show(truncate=False)

+-----------------------------------------------------------------------------------+---------------+
|features                                                                           |PriceOfUnitArea|
+-----------------------------------------------------------------------------------+---------------+
|[1.0,2012.9169921875,32.0,84.87882232666016,10.0,24.982980728149414]               |37.9           |
|[2.0,2012.9169921875,19.5,306.5946960449219,9.0,24.98033905029297]                 |42.2           |
|[3.0,2013.5830078125,13.300000190734863,561.9844970703125,5.0,24.987459182739258]  |47.3           |
|[4.0,2013.5,13.300000190734863,561.9844970703125,5.0,24.987459182739258]           |54.8           |
|[5.0,2012.8330078125,5.0,390.5683898925781,5.0,24.9793701171875]                   |43.1           |
|[6.0,2012.6669921875,7.099999904632568,2175.030029296875,3.0,24.963050842285156]   |32.1           |
|[7.0,2012.6669921875,34.5,623.4730834960938,7.0,24.97933006286621]               

In [19]:
train, test = output.randomSplit([0.8, 0.2], seed = 42)
print(train.count())
print(test.count())

355
59


In [21]:
from pyspark.ml.regression import DecisionTreeRegressor
Decison_Tree = DecisionTreeRegressor(featuresCol = "features", labelCol = label)

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol=label, predictionCol="prediction") 

In [23]:
#Fit model to the 'train' dataset
model = Decison_Tree.fit(train)

# View the predictions
test_predictions = model.transform(test)
test_predictions.show()

RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+--------------------+------------------+
| No|TransactionDate|HouseAge|DistanceToMRT|NumberConvenienceStores|Latitude|Longitude|PriceOfUnitArea|            features|        prediction|
+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+--------------------+------------------+
|  3|       2013.583|    13.3|     561.9845|                      5|24.98746|121.54391|           47.3|[3.0,2013.5830078...|40.441538209181566|
|  7|       2012.667|    34.5|     623.4731|                      7|24.97933|121.53642|           40.3|[7.0,2012.6669921...| 35.48181811246005|
|  9|         2013.5|    31.7|     5512.038|                      1|24.95095|121.48458|           18.8|[9.0,2013.5,31.70...|21.100000381469727|
| 14|       2012.667|    20.4|     2469.645|                      4|24.96108|121.51046|           23.8|[14.0,2012.666992...| 25.99062493