In [None]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [None]:
# Wrap the notebook-provided SparkContext (`sc`) in a SQLContext so that
# DataFrames can be read.
sqlContext = SQLContext(sc)

# Read the daily weather CSV through the spark-csv data source, treating the
# first line as a header and letting the reader infer column types.
weather_csv_path = 'file:///home/cloudera/Downloads/big-data-4/daily_weather.csv'
csv_reader = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
)
df = csv_reader.load(weather_csv_path)

# Show the column names that were loaded.
df.columns

In [None]:
# Names of the 9am columns that are assembled into the model's feature vector.
featureColumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [None]:
# Remove the 'number' column; it is not one of the featureColumns and is not
# used as the label source.
unused_column = 'number'
df = df.drop(unused_column)

In [None]:
# Discard every row that has a null in any column
# (DataFrame.dropna is the alias of DataFrame.na.drop).
df = df.dropna()

In [None]:
# Display (row count, column count) of the cleaned DataFrame.
row_count = df.count()
column_count = len(df.columns)
(row_count, column_count)

In [None]:
# Derive the binary 'label' column from the 3pm relative humidity:
# Binarizer emits 1.0 for values strictly greater than the threshold and 0.0
# otherwise.
humidity_threshold = 24.99999
binarizer = Binarizer(
    threshold=humidity_threshold,
    inputCol='relative_humidity_3pm',
    outputCol='label',
)
binarizedDF = binarizer.transform(df)

In [None]:
# Print the first 4 rows, placing the raw humidity next to its binarized label.
humidity_and_label = binarizedDF.select('relative_humidity_3pm', 'label')
humidity_and_label.show(4)

In [None]:
# Combine the individual feature columns into a single vector column named
# 'features', which is the input column the classifier reads.
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol='features',
)
assembled = assembler.transform(binarizedDF)

In [None]:
# Randomly partition the assembled rows into training and test sets with
# 0.8 / 0.2 weights; the fixed seed is passed so the split can be repeated.
split_weights = [0.8, 0.2]
split_seed = 13234
splits = assembled.randomSplit(split_weights, seed=split_seed)
trainingData, testData = splits

In [None]:
# Display (training row count, test row count) for the split.
training_rows = trainingData.count()
test_rows = testData.count()
(training_rows, test_rows)

In [None]:
# Configure the decision tree classifier: it reads the 'features' vector,
# predicts 'label', is limited to depth 5, requires at least 20 instances per
# node, and uses Gini impurity.
tree_params = {
    'labelCol': 'label',
    'featuresCol': 'features',
    'maxDepth': 5,
    'minInstancesPerNode': 20,
    'impurity': 'gini',
}
dt = DecisionTreeClassifier(**tree_params)

In [None]:
# Build a pipeline whose only stage is the decision tree, then fit it on the
# training split to obtain the trained model.
pipeline_stages = [dt]
pipeline = Pipeline(stages=pipeline_stages)
model = pipeline.fit(trainingData)

In [None]:
# Run the fitted model over the held-out test split to produce predictions.
predictions = model.transform(dataset=testData)

In [None]:
# Print the first 20 rows of predicted vs. actual label for a quick visual check.
predicted_vs_actual = predictions.select('prediction', 'label')
predicted_vs_actual.show(20)

In [None]:
# Persist the predicted and actual labels as CSV (with a header row) via the
# spark-csv data source.
# mode='overwrite' replaces output left by a previous run; with the default
# save mode the write raises when the target path already exists, so this cell
# would fail on every re-run of the notebook.
predictions.select('prediction', 'label').write.save(
    path='file:/home/cloudera/Downloads/big-data-4/predictions.csv',
    format='com.databricks.spark.csv',
    mode='overwrite',
    header='true',
)