In [62]:
from pyspark.sql import SparkSession

from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [63]:
flights = spark.read.load('flights.csv',
                     format="csv", sep=",", inferSchema="true", header="true")
flights.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null

 Con esto podemos ver que datos debemos transformar o modificar para poder usarlos en el modelo:  
        1.- Variables vategóricas en "integer" a numéricas  
        2.- One-hot encoding a las variables categóricas  
        3.- Ver variables como cancellation reason para ver como le hacemos token-hash.  
        4.- Meter en el modelo.  

1.- Transformar variables categóricas a numéricas con StringIndexer

In [64]:
# Las que vamos a cambiar van a ser: AIRLINE,ORIGIN_AIRPORT, DESTINATION_AIRPORT
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline


variables = ["AIRLINE","ORIGIN_AIRPORT","DESTINATION_AIRPORT","DAY_OF_WEEK"]
variables_string_encoded=["AIRLINE_string_encoded","ORIGIN_AIRPORT_string_encoded","DESTINATION_AIRPORT_string_encoded","DAY_OF_WEEK_string_encoded"]

stage_string = [StringIndexer(inputCol= c, outputCol= c+"_string_encoded") for c in variables]
stage_one_hot = [OneHotEncoder(inputCol= c+"_string_encoded", outputCol= c+ "_one_hot") for c in variables]

ppl = Pipeline(stages= stage_string + stage_one_hot)
flights = ppl.fit(flights).transform(flights)

In [None]:
## Tokenizer
from pyspark.ml.feature import HashingTF, Tokenizer

tokenizer = Tokenizer(inputCol="CANCELLATION_REASON", outputCol="CANCELLATION_REASON_tok")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="CANCELLATION_REASON_hash")


In [65]:
from pyspark.sql.functions import col,sum
flights.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in flights.columns)).show()
flights.count()

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 105, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
AttributeError: 'RegressionEvaluator' object has no attribute '_java_obj'


+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+----------------------+-----------------------------+----------------------------------+--------------------------+---------------+----------------------+---------------------------+-------------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|AIRLI

5819079

In [66]:
#flights1 = flights.dropna(thresh=1, subset=["YEAR","MONTH","DAY","FLIGHT_NUMBER","SCHEDULED_DEPARTURE","DEPARTURE_TIME","TAXI_OUT",
#              "WHEELS_OFF", "SCHEDULED_TIME","ELAPSED_TIME","AIR_TIME","DISTANCE","WHEELS_OFF","SCHEDULED_TIME",
#              "ARRIVAL_TIME","ARRIVAL_DELAY","AIRLINE_one_hot","ORIGIN_AIRPORT_one_hot","DESTINATION_AIRPORT_one_hot","DAY_OF_WEEK_one_hot"])

flights1 = flights.dropna(how='any', thresh=None, subset=["YEAR","MONTH","DAY","FLIGHT_NUMBER","SCHEDULED_DEPARTURE","DEPARTURE_TIME","TAXI_OUT",
              "WHEELS_OFF", "SCHEDULED_TIME","ELAPSED_TIME","AIR_TIME","DISTANCE","WHEELS_OFF","SCHEDULED_TIME",
              "ARRIVAL_TIME","ARRIVAL_DELAY"])
flights1.count()

5714008

In [67]:
from pyspark.ml.feature import VectorAssembler

variables_int=["YEAR","MONTH","DAY","FLIGHT_NUMBER","SCHEDULED_DEPARTURE","DEPARTURE_TIME","TAXI_OUT",
              "WHEELS_OFF", "SCHEDULED_TIME","ELAPSED_TIME","AIR_TIME","DISTANCE","WHEELS_OFF","SCHEDULED_TIME",
              "ARRIVAL_TIME","ARRIVAL_DELAY"]

variables_trans=["AIRLINE_one_hot","ORIGIN_AIRPORT_one_hot","DESTINATION_AIRPORT_one_hot","DAY_OF_WEEK_one_hot"]

features = variables_int + variables_trans

vector_assembler = VectorAssembler(inputCols = features, outputCol= "features")
data_training_and_test1 = vector_assembler.transform(flights1)


In [None]:
data_training_and_test1.count()

In [None]:
data_training_and_test1.show()

In [8]:
#from pyspark.ml.feature import PCA
#pca_model = PCA(k = 5,inputCol = "features", outputCol = "pca_features")
#model = pca_model.fit(data_training_and_test1)
#data_training_and_test1 = model.transform(data_training_and_test1)

In [68]:
from pyspark.ml.regression import RandomForestRegressor

(training_data, test_data) = data_training_and_test1.randomSplit([0.7, 0.3])
rf = RandomForestRegressor(labelCol = "DEPARTURE_DELAY", featuresCol = "features", numTrees = 20)
rf_model = rf.fit(training_data)



In [79]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator 

very_small_sample = data_training_and_test1.sample(False, 0.001).cache()

#pca_model = PCA(inputCol = "features", outputCol = "features_cv")
rf = RandomForestRegressor(labelCol = "DEPARTURE_DELAY", featuresCol = "features")
ppl_cv = Pipeline(stages = [rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [20, 30]) \
    .addGrid(rf.maxDepth,[5,10]) \
    .build()



In [80]:
crossval = CrossValidator(estimator = ppl_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 3)

cv_model = crossval.fit(very_small_sample)

In [82]:
predictions = cv_model.transform(test_data)
evaluator= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
rmse = evaluator.evaluate(predictions)

In [83]:
predictions.select("DEPARTURE_DELAY","prediction").show(5)
  

+---------------+------------------+
|DEPARTURE_DELAY|        prediction|
+---------------+------------------+
|             -5|0.2891211948259482|
|              0| -0.59855011451892|
|             -4|-2.945706573606688|
|              3| 8.867815057557763|
|              6|22.700427670343363|
+---------------+------------------+
only showing top 5 rows



In [84]:
print(rmse)

18.79777317492078


In [220]:
from pyspark.ml.regression import GeneralizedLinearRegression
very_small_sample = data_training_and_test1.sample(False, 0.001).cache()

#pca_model = PCA(inputCol = "features", outputCol = "features_cv")
glr = GeneralizedLinearRegression(labelCol = "DEPARTURE_DELAY", featuresCol = "features")
glr_cv = Pipeline(stages = [glr])

paramGrid=ParamGridBuilder() \
    .addGrid(glr.regParam, [.05, .1]) \
    .addGrid(glr.maxIter,[5,10]) \
    .build()



In [90]:
crossval = CrossValidator(estimator = glr_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 3)

cv_model = crossval.fit(very_small_sample)

In [91]:
predictions = cv_model.transform(test_data)
evaluator= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
rmse = evaluator.evaluate(predictions)

In [92]:
predictions.select("DEPARTURE_DELAY","prediction").show(5)

+---------------+--------------------+
|DEPARTURE_DELAY|          prediction|
+---------------+--------------------+
|             -5|  -4.526025993752646|
|              0|-0.10591200509961474|
|             -4| -4.0215540153038125|
|              3|   3.938270842650745|
|              6|   7.180563166981721|
+---------------+--------------------+
only showing top 5 rows



In [93]:
print(rmse)

1.3367365439572


In [103]:
cv_model.bestModel.stages[0].extractParamMap()

{Param(parent='GeneralizedLinearRegression_48989823353e63dbcf36', name='family', doc='The name of family which is a description of the error distribution to be used in the model. Supported options: poisson, binomial, gaussian, gamma, tweedie.'): 'gaussian',
 Param(parent='GeneralizedLinearRegression_48989823353e63dbcf36', name='featuresCol', doc='features column name'): 'features',
 Param(parent='GeneralizedLinearRegression_48989823353e63dbcf36', name='fitIntercept', doc='whether to fit an intercept term'): True,
 Param(parent='GeneralizedLinearRegression_48989823353e63dbcf36', name='labelCol', doc='label column name'): 'DEPARTURE_DELAY',
 Param(parent='GeneralizedLinearRegression_48989823353e63dbcf36', name='maxIter', doc='maximum number of iterations (>= 0)'): 5,
 Param(parent='GeneralizedLinearRegression_48989823353e63dbcf36', name='predictionCol', doc='prediction column name'): 'prediction',
 Param(parent='GeneralizedLinearRegression_48989823353e63dbcf36', name='regParam', doc='reg

In [219]:
classifiers = {
         'RF': RandomForestRegressor(labelCol = "DEPARTURE_DELAY", featuresCol = "features"),
         'GLR': GeneralizedLinearRegression(labelCol = "DEPARTURE_DELAY", featuresCol = "features")}

grid = {
    "RF":ParamGridBuilder() \
    .addGrid(rf.maxDepth, [ 10,5]) \
    .addGrid(rf.numTrees,[20, 30]) \
    .build(),
    
    "GLR":ParamGridBuilder() \
    .addGrid(glr.regParam, [.05, .1]) \
    .addGrid(glr.maxIter,[5,10]) \
    .build()
     }

In [229]:
lista=["RF","GLR"]
classifiers[lista[0]]

RandomForestRegressor_48869beebab65e2c4821

In [235]:
lista=["RF","GLR"]
classifiers[lista[0]]
for i in range(1,3):
    modelo = classifiers[lista[i-1]]
    modelo_cv = modelo
    paramGrid=grid[lista[i-1]]
    
    crossval = CrossValidator(estimator = modelo_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 3)

    cv_model = crossval.fit(very_small_sample)
    
    
    predictions = cv_model.transform(test_data)
    evaluator= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
    rmse = evaluator.evaluate(predictions)
    print(lista[i-1])
    print(predictions.select("DEPARTURE_DELAY","prediction").show(5))
    print(rmse)
    print(cv_model.bestModel.extractParamMap())
    i=i+1




RF
+---------------+------------------+
|DEPARTURE_DELAY|        prediction|
+---------------+------------------+
|             -5|1.3999266505974748|
|              0| 4.621446789157801|
|             -4|-1.468974901819656|
|              3| 6.694421276066909|
|              6|28.734671273107274|
+---------------+------------------+
only showing top 5 rows

None
20.64587610545861
{Param(parent='RandomForestRegressor_48869beebab65e2c4821', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False, Param(parent='RandomForestRegressor_48869beebab65e2c4821', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10, Para

In [224]:
crossval = CrossValidator(estimator = modelo_cv,
                        estimatorParamMaps=paramGrid,
                        evaluator = RegressionEvaluator(
    labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse"),
                        numFolds= 3)

cv_model = crossval.fit(very_small_sample)

In [225]:
predictions = cv_model.transform(test_data)
evaluator= RegressionEvaluator(labelCol = "DEPARTURE_DELAY", predictionCol="prediction", metricName= "rmse")
rmse = evaluator.evaluate(predictions)
print(predictions.select("DEPARTURE_DELAY","prediction").show(5))
print(rmse)
print(cv_model.bestModel.stages[0].extractParamMap())

+---------------+------------------+
|DEPARTURE_DELAY|        prediction|
+---------------+------------------+
|             -5|1.3999266505974748|
|              0| 4.621446789157801|
|             -4|-1.468974901819656|
|              3| 6.694421276066909|
|              6|28.734671273107274|
+---------------+------------------+
only showing top 5 rows

None
20.64587610545861
