In [47]:
#pyspark 
#Cargamos las funcionalidades de spark en el notebook
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    
flights = spark.read.load('flights.csv',format="csv", sep=",", inferSchema="true", header="true")

In [2]:
#Como vimos en clase, algunas variables necesitan ser modificadas para ser utilizadas en los modelos de ML
#la mayor parte de este código fue tomada directamente de las notas de clase y documentación
#Hacemos los arreglos necesarios a las varibles, hot encoder y tokenizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

variables = ["AIRLINE","ORIGIN_AIRPORT","DESTINATION_AIRPORT","DAY_OF_WEEK"]
variables_string_encoded=["AIRLINE_strenc","ORIGIN_AIRPORT_strenc","DESTINATION_AIRPORT_strenc","DAY_OF_WEEK_strenc"]

stage_string = [StringIndexer(inputCol= c, outputCol= c+"_strenc") for c in variables]
stage_one_hot = [OneHotEncoder(inputCol= c+"_strenc", outputCol= c+ "_one_hot") for c in variables]

ppl = Pipeline(stages= stage_string + stage_one_hot)
flights = ppl.fit(flights).transform(flights)

from pyspark.ml.feature import HashingTF, Tokenizer

tokenizer = Tokenizer(inputCol="CANCELLATION_REASON", outputCol="CANCELLATION_REASON_tok")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="CANCELLATION_REASON_hash")


flights = flights.dropna(how='any', thresh=None, subset=["YEAR","MONTH","DAY","FLIGHT_NUMBER","SCHEDULED_DEPARTURE","DEPARTURE_TIME","TAXI_OUT", "WHEELS_OFF", "SCHEDULED_TIME","ELAPSED_TIME","AIR_TIME","DISTANCE","WHEELS_OFF","SCHEDULED_TIME", "ARRIVAL_TIME","ARRIVAL_DELAY"])

In [3]:
#Antes de pasar mis datos a un cluster de AWS voy a probar mi código localmente utilizando una imagen de Spark en Jupyter
#https://medium.com/@suci/running-pyspark-on-jupyter-notebook-with-docker-602b18ac4494
flights = flights.sample(False, .0001, 103531)
flights.count()

579

In [4]:
#Preparo mis datos
#A diferencia del ejecicio visto en clase, es necesario definir un conjunto de variables explicativas 
#para poder utilizar los modelo de ML. Tomaremos las variables que ya transformamos junto con algunas  
#de las originales para formar el vector "features" que fungirá como variables explicativas de la regresión.
#En clase únicamente fue necesario genera un dataset con el nombre de las columnas que se necesita, 
#aquí tenemos que "empaquetar" algunas variables para utilizarlas como X en nuestros modelos de ML.

#Para los 2 modelos de ML que voy a utilizar(regresión lineal y random forest) utilizaré el mismo vector "features"
#Para definir el vetor de features para mi regresión lineal
from pyspark.ml.feature import VectorAssembler

variables_int=["YEAR","MONTH","DAY","FLIGHT_NUMBER","SCHEDULED_DEPARTURE","DEPARTURE_TIME","TAXI_OUT",
              "WHEELS_OFF", "SCHEDULED_TIME","ELAPSED_TIME","AIR_TIME","DISTANCE","WHEELS_OFF","SCHEDULED_TIME",
              "ARRIVAL_TIME","ARRIVAL_DELAY"]

variables_trans=["AIRLINE_one_hot","ORIGIN_AIRPORT_one_hot","DESTINATION_AIRPORT_one_hot","DAY_OF_WEEK_one_hot"]

#Este es el paquete de variables explicativas que voy a considerar en mi ejercicio:
features = variables_int + variables_trans

vector_assembler = VectorAssembler(inputCols = features, outputCol= "features")

flights = vector_assembler.transform(flights)

In [6]:
#Separamos en entrenamiento y prueba
train = flights.sample(False, 0.7, 103531)
test = flights.subtract(train)
train.count()#Tamaño de nuestro conjunto de entrenamiento

396

In [8]:
test.count()

183

In [9]:
#El primer modelo para la predicción del DEPARTURE_DELAY será el modelo de regresión lineal
#Nota: primero voy a probar el correcto funcionamiento de mis modelos de forma individual. 
# --- Regresión Lineal ---
from pyspark.ml.regression import LinearRegression

#lr =  LinearRegression(labelCol = "DEPARTURE_DELAY", featuresCol = "features", maxIter = 20)
#lr_model = lr.fit(train)



In [11]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator 

very_small_sample = flights.sample(False, 0.1).cache()

#pca_model = PCA(inputCol = "features", outputCol = "features_cv")
lr = LinearRegression(labelCol = "DEPARTURE_DELAY", featuresCol = "features")
ppl_cv = Pipeline(stages = [lr])

paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [5, 10]) \
    .build()

In [12]:
very_small_sample.count()

59

In [13]:
crossval = CrossValidator(estimator = ppl_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 3)

cv_model = crossval.fit(very_small_sample)

In [14]:
predictions = cv_model.transform(test)
evaluator= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
rmse = evaluator.evaluate(predictions)

In [15]:
predictions.select("DEPARTURE_DELAY","prediction").show(5)


+---------------+-------------------+
|DEPARTURE_DELAY|         prediction|
+---------------+-------------------+
|             10|-20.957390861362413|
|             -8| 4.1462204339933795|
|              4|-2.6883326485872256|
|              6|  24.95880530663274|
|             21| -4.354242736651701|
+---------------+-------------------+
only showing top 5 rows



In [16]:
print(rmse)

27.4639266305365


In [17]:
cv_model.bestModel.stages[0].extractParamMap()

{Param(parent='LinearRegression_4dec9ca4104234617391', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2,
 Param(parent='LinearRegression_4dec9ca4104234617391', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0,
 Param(parent='LinearRegression_4dec9ca4104234617391', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0.'): 1.35,
 Param(parent='LinearRegression_4dec9ca4104234617391', name='featuresCol', doc='features column name'): 'features',
 Param(parent='LinearRegression_4dec9ca4104234617391', name='fitIntercept', doc='whether to fit an intercept term'): True,
 Param(parent='LinearRegression_4dec9ca4104234617391', name='labelCol', doc='label column name'): 'DEPARTURE_DELAY',
 Param(parent='LinearRegression_4dec9ca4104234617391', name='loss', doc='The loss function to be optimized. Supported options: square

In [18]:
from pyspark.ml.regression import RandomForestRegressor

(training_data, test_data) = flights.randomSplit([0.7, 0.3])
rf = RandomForestRegressor(labelCol = "DEPARTURE_DELAY", featuresCol = "features", numTrees = 20)
rf_model = rf.fit(training_data)

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator 

very_small_sample = flights.sample(False, 0.1).cache()

#pca_model = PCA(inputCol = "features", outputCol = "features_cv")
rf = RandomForestRegressor(labelCol = "DEPARTURE_DELAY", featuresCol = "features")
ppl_cv = Pipeline(stages = [rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [20, 30]) \
    .addGrid(rf.maxDepth,[5,10]) \
    .build()
    
crossval = CrossValidator(estimator = ppl_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 3)

cv_model = crossval.fit(very_small_sample)



In [19]:
predictions = cv_model.transform(test_data)
evaluator= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
rmse = evaluator.evaluate(predictions)

In [28]:
predictions.select("DEPARTURE_DELAY","prediction").show(5)

+---------------+-------------------+
|DEPARTURE_DELAY|         prediction|
+---------------+-------------------+
|             -5| 24.303759340731663|
|              7|-0.8809879290405336|
|             -2| 2.8170616673917506|
|             -4|   16.7674110156983|
|             11| 23.025632126269063|
+---------------+-------------------+
only showing top 5 rows



In [29]:
print(rmse)

40.80727095918729


In [30]:
classifiers = {
         'RF': RandomForestRegressor(labelCol = "DEPARTURE_DELAY", featuresCol = "features"),
         'LR': LinearRegression(labelCol = "DEPARTURE_DELAY", featuresCol = "features")}

grid = {
    "RF":ParamGridBuilder() \
    .addGrid(rf.maxDepth, [ 10,5]) \
    .addGrid(rf.numTrees,[20, 30]) \
    .build(),
    
    "LR":ParamGridBuilder() \
    .addGrid(lr.regParam, [.05, .1]) \
    .addGrid(lr.maxIter,[5,10]) \
    .build()
     }

In [31]:
lista=["RF","LR"]


In [35]:
classifiers[lista[0]]
for i in range(1,3):
    modelo = classifiers[lista[i-1]]
    modelo_cv = modelo
    paramGrid=grid[lista[i-1]]
    
    crossval = CrossValidator(estimator = modelo_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 2)

    cv_model = crossval.fit(very_small_sample)
    
    
    predictions = cv_model.transform(test_data)
    evaluat< or= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
    rmse = evaluator.evaluate(predictions)
    print(lista[i-1])
    print(predictions.select("DEPARTURE_DELAY","prediction").show(5))
    print(rmse)
    print(cv_model.bestModel.extractParamMap())
    i=i+1
    


RF
+---------------+------------------+
|DEPARTURE_DELAY|        prediction|
+---------------+------------------+
|             -5|17.113580745341615|
|              7| 9.236200023421874|
|             -2| 3.865986367887004|
|             -4|18.035189940743912|
|             11|10.845072310679843|
+---------------+------------------+
only showing top 5 rows

None
40.80727095918729
{Param(parent='RandomForestRegressor_46bfb9012de17dc7b8fd', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False, Param(parent='RandomForestRegressor_46bfb9012de17dc7b8fd', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10, Para

In [36]:
crossval = CrossValidator(estimator = modelo_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 3)

cv_model = crossval.fit(very_small_sample)



In [38]:
predictions = cv_model.transform(test_data)
evaluator= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
rmse = evaluator.evaluate(predictions)
print(predictions.select("DEPARTURE_DELAY","prediction").show(5))
print(rmse)
#print(cv_model.bestModel.stages[0].extractParamMap())

+---------------+-------------------+
|DEPARTURE_DELAY|         prediction|
+---------------+-------------------+
|             -5| 24.303759340731663|
|              7|-0.8809879290405336|
|             -2| 2.8170616673917506|
|             -4|   16.7674110156983|
|             11| 23.025632126269063|
+---------------+-------------------+
only showing top 5 rows

None
35.28734126414375


In [39]:
    print(cv_model.bestModel.extractParamMap())


{Param(parent='LinearRegression_4756bcc6c22cf88bf1ae', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2, Param(parent='LinearRegression_4756bcc6c22cf88bf1ae', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0, Param(parent='LinearRegression_4756bcc6c22cf88bf1ae', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0.'): 1.35, Param(parent='LinearRegression_4756bcc6c22cf88bf1ae', name='featuresCol', doc='features column name'): 'features', Param(parent='LinearRegression_4756bcc6c22cf88bf1ae', name='fitIntercept', doc='whether to fit an intercept term'): True, Param(parent='LinearRegression_4756bcc6c22cf88bf1ae', name='labelCol', doc='label column name'): 'DEPARTURE_DELAY', Param(parent='LinearRegression_4756bcc6c22cf88bf1ae', name='loss', doc='The loss function to be optimized. Supported options: squaredError