In [1]:
import os, glob
import pandas as pd
import numpy as np
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import RFormula

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set(font_scale=1.5, rc={'text.usetex' : True})

# Create (or reuse) the local Spark session. Going through the builder with
# getOrCreate() keeps this cell idempotent: constructing SparkContext('local')
# directly raises ValueError when the cell is re-run in a live kernel, because
# only one SparkContext may exist at a time.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` available for any later cell that uses the raw context.
sc = spark.sparkContext

# Directory holding the cleaned training CSVs, relative to the notebook.
path = "./input/clean/training_data/"

In [2]:
# Read the NC daily-average training CSV; the first row is the header and
# column types are inferred by Spark.
nc_csv = path + "NOAA_NC_DAvg_training_data.csv"
NC_data = spark.read.csv(nc_csv, header=True, inferSchema=True)

# Discard every column whose name ends in 'T', together with 'date' and 'CLASS'.
cols_to_drop = [name for name in NC_data.columns if name[-1] == 'T']
cols_to_drop = cols_to_drop + ['date', 'CLASS']
NC_data = NC_data.drop(*cols_to_drop)
NC_data.show(5)

+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+------------------+------------------+
|             ATemp|           MaxTemp|           MinTemp|               RH|            MaxRH|            MinRH|                BP|             MaxBP|             MinBP|              WSpd|              Wdir|            SDWDir|           MaxWSpd|            MinWSpd|             TotPrcp|            TotPAR|           AvgVolt|              Temp|
+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+------------------+-------

In [3]:
# Build the modelling frame: the formula "Temp~." uses Temp as the label and
# every remaining column of NC_data as a feature.
rf = RFormula(formula="Temp~.")
rf_model = rf.fit(NC_data)

# Keep only the assembled feature vector and the label for the regressors.
vector_rf = rf_model.transform(NC_data).select("features", "label")
vector_rf.show(5)

+--------------------+------------------+
|            features|             label|
+--------------------+------------------+
|[8.29687499999999...|10.641145833333328|
|[11.0645833333333...|11.089062499999999|
|[15.5343749999999...|12.167708333333321|
|[19.1302083333333...|13.960937499999993|
|[19.6677083333333...|15.548437499999999|
+--------------------+------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Hold out 30% of the rows for evaluation. The split is seeded (same seed as
# the model below) so the reported RMSE is comparable between runs instead of
# changing with every resample.
train, test = vector_rf.randomSplit([0.7, 0.3], seed=1230)

# Gradient-boosted trees baseline: 10 boosting iterations, fixed seed.
gbt = GBTRegressor(featuresCol="features", maxIter=10, seed = 1230)

# Fit on the training split only.
model = gbt.fit(train)

# Score the held-out split; transform() appends a "prediction" column.
predictions = model.transform(test)

# Show a few example rows as a sanity check.
predictions.select("prediction", "label", "features").show(5)

# Compare prediction against the true label and report the test RMSE.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

+------------------+-----------------+--------------------+
|        prediction|            label|            features|
+------------------+-----------------+--------------------+
|  7.70186497124369|5.170833333333334|[-3.7364583333333...|
|  6.33642535087859|7.708333333333331|[0.36562500000000...|
| 6.033975011571917|6.215624999999999|[0.94270833333333...|
|6.3316424820201265|5.864583333333335|[1.20937500000000...|
|  5.40155232311758|7.271354166666668|[1.32083333333333...|
+------------------+-----------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 1.95293


In [12]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# NOTE(review): this import was recorded failing with
# "ModuleNotFoundError: No module named 'mmlspark.lightgbm._LightGBMRegressor'".
# Presumably the MMLSpark JAR is not attached to the Spark session (e.g. via
# the spark.jars.packages option set before the session is created) -- confirm
# the package coordinates that match the installed Spark/Scala version.
from mmlspark.lightgbm import LightGBMRegressor

# Uses the default "features" / "label" columns of `train`.
lgb = LightGBMRegressor(
  numIterations=500,
  learningRate=0.05
)

# Small, explicit search grid (2^6 = 64 combinations, each fitted 5 times by
# the cross-validator). The previous grid used unit-step ranges on every axis
# (including two ranges of 20,000,000 values), whose Cartesian product is
# roughly 10^24 parameter maps and can never be built, let alone trained.
# Fractions start above 0 because LightGBM requires them to be in (0, 1].
# Plain Python ints/floats are used so the values convert cleanly to the
# underlying JVM params.
# NOTE(review): in LightGBM, bagging only takes effect when the bagging
# frequency is > 0 -- confirm whether baggingFreq should be set on `lgb`.
paramGrid = (ParamGridBuilder()
             .addGrid(lgb.numLeaves, [15, 63])
             .addGrid(lgb.maxDepth, [4, 8])
             .addGrid(lgb.baggingFraction, [0.8, 1.0])
             .addGrid(lgb.featureFraction, [0.8, 1.0])
             .addGrid(lgb.minSumHessianInLeaf, [0.001])
             .addGrid(lgb.lambdaL1, [0.0, 1.0])
             .addGrid(lgb.lambdaL2, [0.0, 1.0])
             .build())

# RegressionEvaluator is imported in the GBT cell above; RMSE is minimised.
evaluator=RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
# Seed the fold assignment so the selected model is reproducible.
cv = CrossValidator(estimator=lgb, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5, seed=1230)
cvModel_lgb = cv.fit(train)

ModuleNotFoundError: No module named 'mmlspark.lightgbm._LightGBMRegressor'