In [None]:
#Group Project Classification

In [2]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import * 
from pyspark.sql.types import StringType
from pyspark.ml import Pipeline 
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import StringIndexer 
from pyspark.ml.classification import NaiveBayes 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, Tokenizer, StopWordsRemover, Word2Vec, VectorAssembler

#from FeatureImportanceSelector import ExtractFeatureImp, FeatureImpSelector

In [73]:
# Keep only rows with a positive previous sale price, then fill nulls with 0
# (a numeric fill value only affects numeric columns).
# NOTE(review): `df` is not defined in the visible cells; it must be loaded earlier.
class_df = (
    df
    .where(df['prev_sale_price'] > 0)
    .fillna(0)
)

In [100]:
# Target variable, step 1: raw profit = sale price minus previous sale price.
# The column is stored under 'profit_is_pos'.
class_df = class_df.withColumn(
    'profit_is_pos',
    col('sale_price') - col('prev_sale_price'),
)

In [103]:
# Binary target: 1 when the sale price is at least the previous sale price, else 0.
# - Computed directly from the two price columns instead of from the current
#   contents of 'profit_is_pos', so re-running this cell is idempotent (testing
#   an already-binarised 0/1 column with `>= 0` would relabel every row as 1).
# - Uses the star-imported `col`; no `F` alias is imported in the visible
#   import cell, so `F.col` fails on a fresh kernel.
class_df = class_df.withColumn(
    "profit_is_pos",
    when(col('sale_price') - col('prev_sale_price') >= 0, 1).otherwise(0),
)

In [105]:
# Restrict to the columns used for modelling: location fields, property
# attributes, price context, and the binary label.
df_2 = class_df.select([
    'state', 'city', 'zip5',
    'year_built', 'CPIHOSNS', 'prev_sale_price',
    'property_type', 'last_sale_days',
    'profit_is_pos',
])

In [106]:
# Convert city, state and zip to numeric categorical features:
# each column is string-indexed into "<col>Index" and then one-hot encoded
# into "<col>Vector".
# NOTE: this cell appends columns to df_2, so running it twice without
# rebuilding df_2 fails because the output columns already exist.
for y in ['city', 'state', 'zip5']:
    print(y)
    indexer = StringIndexer(inputCol=y, outputCol=y + "Index")
    ohe = OneHotEncoder(inputCol=y + "Index", outputCol=y + "Vector")
    df_2 = indexer.fit(df_2).transform(df_2)
    # On Spark 3.x OneHotEncoder is an Estimator and must be fitted before it
    # can transform; on Spark 2.x it is a plain Transformer without fit().
    # Support both so the notebook runs on either version.
    encoder = ohe.fit(df_2) if hasattr(ohe, "fit") else ohe
    df_2 = encoder.transform(df_2)

city
state
zip5


In [None]:
# Numeric predictors plus the one-hot vectors produced for city/state/zip5.
predictors = [
    'year_built', 'prev_sale_price', 'last_sale_days', 'CPIHOSNS',
    'cityVector', 'stateVector', 'zip5Vector',
]

# Assemble all predictors into a single 'features' vector and keep only the
# feature vector and the label for modelling.
vectorAssembler = VectorAssembler(inputCols=predictors, outputCol='features')
vinput_data = vectorAssembler.transform(df_2).select('features', 'profit_is_pos')
vinput_data.show(1)

+--------------------+-------------+
|            features|profit_is_pos|
+--------------------+-------------+
|(1491,[0,1,2,3,11...|            1|
+--------------------+-------------+
only showing top 1 row



In [108]:
# 70/30 train/test split with a fixed seed.
train_df, test_df = vinput_data.randomSplit(weights=[0.7, 0.3], seed=1234)

### Naive Bayes

In [109]:
# Class balance of the label in the training split.
train_df.groupBy('profit_is_pos').count().show()

+-------------+-------+
|profit_is_pos|  count|
+-------------+-------+
|            1|1394414|
|            0| 599969|
+-------------+-------+



In [115]:
# Fit a multinomial Naive Bayes classifier on the training split and score
# the held-out test split.
nb = NaiveBayes(
    labelCol='profit_is_pos',
    modelType='multinomial',
)
nbmodel = nb.fit(train_df)
predictions_nb = nbmodel.transform(test_df)

In [116]:
# Naive Bayes: accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol='profit_is_pos',
    predictionCol="prediction",
)
nbaccuracy = evaluator.evaluate(predictions_nb)
print("Test accuracy = " + str(nbaccuracy))

Test accuracy = 0.7067928508643422


In [117]:
# Naive Bayes: F1 score on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol='profit_is_pos',
    predictionCol="prediction",
)
nbf1 = evaluator.evaluate(predictions_nb)
print("Test f1 = " + str(nbf1))

Test f1 = 0.6803030399280371


In [118]:
# Naive Bayes: weighted recall on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol='profit_is_pos',
    predictionCol="prediction",
)
nbcall = evaluator.evaluate(predictions_nb)
print("Test recall = " + str(nbcall))

Test recall = 0.7067928508643423


In [122]:
# Naive Bayes: weighted precision on the test split.
# Stored in its own variable; the original wrote it to `nbcall`, which
# overwrote the recall value computed in the previous cell.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="weightedPrecision",
)
nbprecision = evaluator.evaluate(predictions_nb)
print("Precision = " + str(nbprecision))

Precision = 0.6787251686730787


In [120]:
# Naive Bayes: area under the ROC curve on the test split.
bcEvaluator = BinaryClassificationEvaluator(
    labelCol='profit_is_pos',
    metricName="areaUnderROC",
)
print("Area under ROC curve:", bcEvaluator.evaluate(predictions_nb))

Area under ROC curve: 0.5670512348298334


### Logistic Regression

In [125]:
# Logistic regression with regParam=0.3 and elasticNetParam=0 (pure L2
# penalty), limited to 20 iterations; fit on train, score on test.
lr = LogisticRegression(
    labelCol='profit_is_pos',
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

lrm = lr.fit(train_df)
predictions_lr = lrm.transform(test_df)

In [126]:
# Logistic regression: accuracy on the test split.
# Stored under an LR-specific name; the original assigned it to `nbaccuracy`,
# overwriting the Naive Bayes accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="accuracy",
)
lraccuracy = evaluator.evaluate(predictions_lr)
print("Test accuracy = " + str(lraccuracy))

Test accuracy = 0.6994104893055962


In [127]:
# Logistic regression: F1 score on the test split.
# Stored under an LR-specific name; the original assigned it to `nbf1`,
# overwriting the Naive Bayes F1 score.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="f1",
)
lrf1 = evaluator.evaluate(predictions_lr)
print("Test f1 = " + str(lrf1))

Test f1 = 0.5765969490729765


In [128]:
# Logistic regression: weighted recall on the test split.
# Stored under an LR-specific name; the original assigned it to `nbcall`,
# overwriting the value computed for Naive Bayes.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="weightedRecall",
)
lrrecall = evaluator.evaluate(predictions_lr)
print("Test recall = " + str(lrrecall))

Test recall = 0.6994104893055962


In [129]:
# Logistic regression: weighted precision on the test split.
# Stored under its own name; the original assigned it to `nbcall`, a variable
# that is also used for recall values.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="weightedPrecision",
)
lrprecision = evaluator.evaluate(predictions_lr)
print("Precision = " + str(lrprecision))

Precision = 0.7082292566625992


In [131]:
# Logistic regression: area under the ROC curve on the test split.
bcEvaluator = BinaryClassificationEvaluator(
    labelCol='profit_is_pos',
    metricName="areaUnderROC",
)
print("Area under ROC curve:", bcEvaluator.evaluate(predictions_lr))

Area under ROC curve: 0.6241186575452338


### Random Forest

In [None]:
# Header row for a score table; no later visible cell appends to it.
tf_score_all = [['profit_is_pos','f1_score','accuracy','Recall', 'ROC']]

# Random forest: 50 shallow trees (depth 3). The seed is pinned explicitly so
# that re-running the notebook does not depend on the estimator's default seed.
# Line continuations are unnecessary inside parentheses, so the backslashes
# were dropped.
rf = RandomForestClassifier(
    labelCol='profit_is_pos',
    featuresCol="features",
    numTrees=50,
    maxDepth=3,
    maxBins=32,
    seed=1234,
)
rfm = rf.fit(train_df)
predictions = rfm.transform(test_df)

# (prediction, label) rows, exposed as an RDD for the mllib metrics classes.
results = predictions.select(['prediction','profit_is_pos'])
predictionAndLabels = results.rdd

In [139]:
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics

# The mllib metrics classes take RDDs of (float, float) pairs. The label column
# is an integer, so both fields are cast to float explicitly.
prediction_and_label = predictionAndLabels.map(
    lambda row: (float(row[0]), float(row[1]))
)
metrics = MulticlassMetrics(prediction_and_label)

# BinaryClassificationMetrics expects (score, label) pairs. Feeding it the hard
# 0/1 predictions collapses the ROC curve to a single operating point, so use
# the predicted probability of class 1 as the score instead.
score_and_label = predictions.select('probability', 'profit_is_pos').rdd.map(
    lambda row: (float(row[0][1]), float(row[1]))
)
metrics_binary = BinaryClassificationMetrics(score_and_label)

In [143]:
# Random forest: accuracy on the test split.
# Stored under an RF-specific name; the original assigned it to `nbaccuracy`,
# a variable named for the Naive Bayes model.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="accuracy",
)
rfaccuracy = evaluator.evaluate(predictions)
print("Test accuracy = " + str(rfaccuracy))

Test accuracy = 0.69898740111339


In [144]:
# Random forest: F1 score on the test split.
# Stored under an RF-specific name; the original assigned it to `nbf1`,
# a variable named for the Naive Bayes model.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="f1",
)
rff1 = evaluator.evaluate(predictions)
print("Test f1 = " + str(rff1))

Test f1 = 0.575146568591468


In [142]:
# Random forest: weighted precision on the test split.
# Stored under its own name; the original assigned it to `nbcall`, a variable
# that is also used for recall values.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="weightedPrecision",
)
rfprecision = evaluator.evaluate(predictions)
print("Precision = " + str(rfprecision))

Precision = 0.48858338691525116


In [145]:
# Random forest: weighted recall on the test split.
# Stored under an RF-specific name; the original assigned it to `nbcall`,
# a variable named for the Naive Bayes model.
evaluator = MulticlassClassificationEvaluator(
    labelCol='profit_is_pos',
    predictionCol="prediction",
    metricName="weightedRecall",
)
rfrecall = evaluator.evaluate(predictions)
print("Test recall = " + str(rfrecall))

Test recall = 0.69898740111339


### Feature Importance

In [None]:
# Rank the assembled features by the random forest's importance scores.
# The original call could not run: `mod` and `ExtractFeatureImp` are not
# defined in the visible cells (the helper import is commented out), and df_2
# has no "features_subset" column. Of the models fitted above, the random
# forest (`rfm`) is the one used here for `featureImportances`.
# Feature names and vector indices are read from the ML attribute metadata
# that VectorAssembler attaches to the 'features' column.
feature_attrs = vinput_data.schema['features'].metadata['ml_attr']['attrs']
rf_importances = rfm.featureImportances
feature_importance = sorted(
    (
        (attr['name'], float(rf_importances[attr['idx']]))
        for attr_group in feature_attrs.values()
        for attr in attr_group
    ),
    key=lambda name_score: name_score[1],
    reverse=True,
)
# Show only the top entries; the assembled vector has well over a thousand slots.
feature_importance[:20]