In [1]:
cd ..

/home/hantoine/Documents/Cours/Concordia/2019-Winter/SOEN_691_Big_Data_Analytics/project/accident-prediction-montreal


In [2]:
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split

satimage = fetch_datasets()['satimage']
X, y = satimage.data, satimage.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=0)

In [3]:
%matplotlib inline
import sys
sys.path.insert(0, '/home/hantoine/concordia/bigdata/project/spark/python')
from utils import init_spark
spark = init_spark()

spark.version

Spark Session created
Parameters:
	spark.driver.extraClassPath: ./data/xgboost4j-spark-0.72.jar:./data/xgboost4j-0.72.jar
	spark.network.timeout: 300s
	spark.master: local[10]
	spark.executor.id: driver
	spark.app.id: local-1555784829140
	spark.driver.host: laptop-hantoine.wireless.concordia.ca
	spark.app.name: Accident prediction
	spark.cleaner.periodicGC.interval: 5min
	spark.driver.port: 46865
	spark.serializer: org.apache.spark.serializer.KryoSerializer
	spark.driver.memory: 7g
	spark.rdd.compress: True
	spark.serializer.objectStreamReset: 100
	spark.submit.pyFiles: 
	spark.submit.deployMode: client
	spark.ui.showConsoleProgress: true


'3.0.0-SNAPSHOT'

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import lit, monotonically_increasing_id

train_examples = map(lambda r: (Vectors.dense(r[0]), 0 if int(r[1]) == -1 else 1), zip(X_train, y_train))
train_set = spark.createDataFrame(train_examples, schema=["features", "label"])
test_examples = map(lambda r: (Vectors.dense(r[0]), 0 if int(r[1]) == -1 else 1), zip(X_test, y_test))
test_set = spark.createDataFrame(test_examples, schema=["features", "label"])
train_set = train_set.withColumn('id', monotonically_increasing_id())
test_set = test_set.withColumn('id', -1 * monotonically_increasing_id() - 1)

In [5]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml import Pipeline
from random_undersampler import RandomUnderSampler
from class_weighter import ClassWeighter
from pyspark.sql.functions import col

imbalance_ratio = (train_set.filter(col('label') == 0).count()
                    / train_set.filter(col('label') == 1).count())

print(imbalance_ratio)
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            cacheNodeIds=True,
                            maxDepth=30,
                            impurity='gini',
                            featureSubsetStrategy='sqrt',
                            minInstancesPerNode=2,
                            numTrees=50,
                            subsamplingRate=1.0,
                            maxMemoryInMB=768
                           )
brf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            cacheNodeIds=True,
                            maxDepth=30,
                            impurity='gini',
                            featureSubsetStrategy='sqrt',
                            weightCol='weight',
                            minInstancesPerNode=2,
                            numTrees=50,
                            subsamplingRate=1.0,
                            maxMemoryInMB=768
                           )
cw = ClassWeighter().setClassWeight([1/imbalance_ratio, 1.0])
ru = (RandomUnderSampler()
      .setIndexCol('id')
      .setTargetImbalanceRatio(1.0))

pipeline_urf = Pipeline().setStages([ru, rf])
pipeline_rf = Pipeline().setStages([rf])
pipeline_brf = Pipeline().setStages([cw, brf])

model_rf = pipeline_rf.fit(train_set)
model_urf = pipeline_urf.fit(train_set)
model_brf = pipeline_brf.fit(train_set)
pred_rf = model_rf.transform(test_set)
pred_urf = model_urf.transform(test_set)
pred_brf = model_brf.transform(test_set)

9.289978678038379


In [6]:
from pyspark.sql.functions import when
y_pred_rf_sp = pred_rf.select(when(col('prediction') == 0, -1).otherwise(1)).toPandas().values
y_pred_brf_sp = pred_brf.select(when(col('prediction') == 0, -1).otherwise(1)).toPandas().values
y_pred_urf_sp = pred_urf.select(when(col('prediction') == 0, -1).otherwise(1)).toPandas().values

In [7]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
brf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)

In [8]:
from sklearn.metrics import balanced_accuracy_score
print('imbalanced-learn rf')
balanced_accuracy_score(y_test, y_pred_rf)

imbalanced-learn rf


0.7266300819427629

In [9]:
print('imbalanced-learn brf')
balanced_accuracy_score(y_test, y_pred_brf)

imbalanced-learn brf


0.8721706058851397

In [10]:
print('spark brf')
balanced_accuracy_score(y_test, y_pred_brf_sp)

spark brf


0.8685581933989577

In [11]:
print('spark rf')
balanced_accuracy_score(y_test, y_pred_rf_sp)

spark rf


0.7178501868716114

In [12]:
print('spark urf')
balanced_accuracy_score(y_test, y_pred_urf_sp)

spark urf


0.4794967626467337

In [13]:
from evaluate import evaluate_binary_classifier
evaluate_binary_classifier(pred_rf)

Area Under PR = 0.7185115980077723
F1 score = 0.9239390618578491


(0.7185115980077723, 0.9239390618578491)

In [14]:
evaluate_binary_classifier(pred_brf)

Area Under PR = 0.7087076267293976
F1 score = 0.91170921022685


(0.7087076267293976, 0.91170921022685)

In [15]:
from evaluate import evaluate_binary_classifier
evaluate_binary_classifier(pred_urf)

Area Under PR = 0.7013375047838261
F1 score = 0.8909451530194905


(0.7013375047838261, 0.8909451530194905)