<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Modelling" data-toc-modified-id="Modelling-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Modelling</a></span></li><li><span><a href="#Model-Predictions" data-toc-modified-id="Model-Predictions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model Predictions</a></span></li><li><span><a href="#Model-evaluation" data-toc-modified-id="Model-evaluation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model evaluation</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('dtree').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [35]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import (RandomForestRegressor, GBTRegressor,
                                  DecisionTreeRegressor)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load data

In [7]:
!head -2 ../data/sample_libsvm_data.txt

0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252 159:252 160:237 182:54 183:227 184:253 185:252 186:239 187:233 188:252 189:57 190:6 208:10 209:60 210:224 211:252 212:253 213:252 214:202 215:84 216:252 217:253 218:122 236:163 237:252 238:252 239:252 240:253 241:252 242:252 243:96 244:189 245:253 246:167 263:51 264:238 265:253 266:253 267:190 268:114 269:253 270:228 271:47 272:79 273:255 274:168 290:48 291:238 292:252 293:252 294:179 295:12 296:75 297:121 298:21 301:253 302:243 303:50 317:38 318:165 319:253 320:233 321:208 322:84 329:253 330:252 331:165 344:7 345:178 346:252 347:240 348:71 349:19 350:28 357:253 358:252 359:195 372:57 373:252 374:252 375:63 385:253 386:252 387:195 400:198 401:253 402:190 413:255 414:253 415:196 427:76 428:246 429:252 430:112 441:253 442:252 443:148 455:85 456:252 457:230 458:25 467:7 468:135 469:253 470:186 471:12 483:85 484:252 485:223 494:7 495:131 496:252 497:225 498:71 511:85 512:252 513:145 521:48 522:165 523:252 524:173 539:86

In [8]:
data = spark.read.format('libsvm').load('../data/sample_libsvm_data.txt')

In [10]:
print(data.count())
data.show()

100
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [11]:
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [36]:
train_data, test_data = data.randomSplit([0.7,0.3])

# Modelling

In [15]:
dtc = DecisionTreeRegressor()
rfc = RandomForestRegressor(numTrees=100)
gbt = GBTRegressor()

In [16]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

# Model Predictions

In [27]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [28]:
dtc_preds.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(692,[122,123,148...|       0.0|
|  0.0|(692,[123,124,125...|       0.0|
|  0.0|(692,[124,125,126...|       0.0|
|  0.0|(692,[124,125,126...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[127,128,129...|       0.0|
|  0.0|(692,[127,128,129...|       0.0|
|  0.0|(692,[152,153,154...|       0.0|
|  1.0|(692,[97,98,99,12...|       1.0|
|  1.0|(692,[99,100,101,...|       0.0|
|  1.0|(692,[123,124,125...|       1.0|
|  1.0|(692,[124,125,126...|       1.0|
|  1.0|(692,[125,126,127...|       1.0|
|  1.0|(692,[125,126,127...|       1.0|
|  1.0|(692,[126,127,128...|       1.0|
|  1.0|(692,[127,128,154...|       1.0|
|  1.0|(692,[128,129,130...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



In [30]:
gbt_preds.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(692,[122,123,148...|       0.0|
|  0.0|(692,[123,124,125...|       0.0|
|  0.0|(692,[124,125,126...|       0.0|
|  0.0|(692,[124,125,126...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[126,127,128...|       0.0|
|  0.0|(692,[127,128,129...|       0.0|
|  0.0|(692,[127,128,129...|       0.0|
|  0.0|(692,[152,153,154...|       0.0|
|  1.0|(692,[97,98,99,12...|       1.0|
|  1.0|(692,[99,100,101,...|       0.0|
|  1.0|(692,[123,124,125...|       1.0|
|  1.0|(692,[124,125,126...|       1.0|
|  1.0|(692,[125,126,127...|       1.0|
|  1.0|(692,[125,126,127...|       1.0|
|  1.0|(692,[126,127,128...|       1.0|
|  1.0|(692,[127,128,154...|       1.0|
|  1.0|(692,[128,129,130...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



# Model evaluation

In [31]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [32]:
print('dtc accuracy:')
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc_eval.evaluate(dtc_preds)

dtc accuracy:


0.9629629629629629

In [33]:
print('gbt accuracy:')
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc_eval.evaluate(gbt_preds)

gbt accuracy:


0.9629629629629629

In [34]:
rfc_model.featureImportances

SparseVector(692, {378: 0.01, 379: 0.0288, 404: 0.0011, 406: 0.0794, 407: 0.0578, 433: 0.0511, 434: 0.16, 435: 0.02, 455: 0.0006, 461: 0.0089, 462: 0.2, 463: 0.02, 489: 0.0395, 490: 0.1811, 491: 0.0011, 511: 0.04, 512: 0.01, 517: 0.0787, 518: 0.009, 519: 0.0006, 551: 0.0005, 565: 0.0006, 664: 0.0006, 665: 0.0005})