In [1]:
from pyspark import *

In [2]:
# All three inputs are space-separated, header-less text files, so they share
# one set of reader options.
_reader_options = dict(format="csv", sep=" ", inferSchema="true", header="false")

# '_c26'/'_c27' (and '_c1' in the truth file) are dropped right after loading;
# presumably these are empty columns produced by trailing separators -- TODO confirm.
df_train = spark.read.load("/FileStore/tables/train.txt", **_reader_options).drop('_c26', '_c27')
df_test = spark.read.load("/FileStore/tables/test.txt", **_reader_options).drop('_c26', '_c27')
df_truth = spark.read.load("/FileStore/tables/truth.txt", **_reader_options).drop('_c1')

In [3]:
# Snapshot of the auto-generated column names, taken before any renaming.
actual = df_train.columns[:]
# Readable names: two identifiers, three settings, then sensors s1..s21.
new = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + ['s%d' % sensor for sensor in range(1, 22)]

# Rename positionally: the i-th original column receives the i-th readable name.
# The test frame is renamed with the same original names as the training frame.
for position, readable_name in enumerate(new):
    original_name = actual[position]
    df_train = df_train.withColumnRenamed(original_name, readable_name)
    df_test = df_test.withColumnRenamed(original_name, readable_name)

# The single remaining truth column becomes the target 'ttf'.
df_truth = df_truth.withColumnRenamed('_c0', 'ttf')

In [4]:
# Highest observed cycle per 'id' in the training data.
maximum = (
    df_train.groupby('id')
    .agg({'cycle': 'max'})
    .withColumnRenamed('id', 'Ids')
    .withColumnRenamed('max(cycle)', 'Max')
)

# Left join so that every training row carries the maximum cycle of its 'id'.
step1 = df_train.join(maximum, df_train['id'] == maximum['Ids'], how='left').drop('Ids')

# Time to failure ('ttf'): maximum cycle minus the current cycle.
remaining_cycles = step1['Max'] - step1['cycle']
step2 = step1.withColumn("ttf", remaining_cycles).drop('Max')

In [5]:
from pyspark.ml.feature import Bucketizer, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline

# Columns that get min-max scaled: everything except the identifiers and the target.
_not_scaled = ['id', 'cycle', 'ttf']
_feature_cols = [name for name in step2.columns if name not in _not_scaled]
_scaled_cols = ['scaled_' + name for name in _feature_cols]

# Discretise 'ttf' into label columns: 'binary' splits at 30,
# 'ternary' splits at 15, 30 and 60 (i.e. four buckets).
bucketizer = [
    Bucketizer(splits=[-float("inf"), 30, float("inf")], inputCol="ttf", outputCol="binary"),
    Bucketizer(splits=[-float("inf"), 15, 30, 60, float("inf")], inputCol="ttf", outputCol="ternary"),
]

# Each column is wrapped in its own one-element vector ("Vect_<col>") and then
# scaled individually ("scaled_<col>").
assembler = [VectorAssembler(inputCols=[name], outputCol="Vect_" + name) for name in _feature_cols]
minmax = [MinMaxScaler(inputCol="Vect_" + name, outputCol='scaled_' + name) for name in _feature_cols]

# Final feature vector: all scaled columns followed by the raw 'id' and 'cycle'.
assembler1 = VectorAssembler(inputCols=_scaled_cols + ['id', 'cycle'], outputCol="features")
stage = bucketizer + assembler + minmax + [assembler1]

# Fit the whole pipeline on the training frame and keep only the useful columns.
_fitted_pipeline = Pipeline(stages=stage).fit(step2)
_kept_columns = _scaled_cols + ['id', 'cycle', 'features', 'ttf', 'binary', 'ternary']
stepf = _fitted_pipeline.transform(step2).select(*_kept_columns)

In [6]:
# One training frame per target ('ttf', 'binary', 'ternary'); each exposes its
# target under the common column name 'label'.
train1, train2, train3 = (
    stepf.select('features', target).withColumnRenamed(target, 'label')
    for target in ('ttf', 'binary', 'ternary')
)

In [7]:
import pyspark.sql.functions as sparkf
from pyspark.sql.window import Window

# Number the truth rows 1..N so they can be joined to the test ids.
# monotonically_increasing_id() is only guaranteed to be increasing and unique,
# NOT consecutive, so "id + 1" yields wrong join keys as soon as the data spans
# more than one partition. row_number() over that ordering gives a gap-free
# 1, 2, 3, ... sequence instead.
# NOTE(review): assumes the i-th row of truth.txt belongs to id i -- confirm
# against the data set description.
_file_order = Window.orderBy(sparkf.monotonically_increasing_id())
truth = df_truth.withColumn('Ids', sparkf.row_number().over(_file_order)).select('Ids', 'ttf')

# Highest observed cycle per 'id' in the test data.
# NOTE: `maximum` and `step1` deliberately keep the names used for the training
# data in an earlier cell, so those earlier values are overwritten here.
maximum = (
    df_test.groupby('id')
    .agg({'cycle': 'max'})
    .withColumnRenamed('id', 'Ids')
    .withColumnRenamed('max(cycle)', 'Max')
)
step1 = df_test.join(maximum, df_test.id == maximum.Ids, how='left').drop('Ids')

# Keep only the last recorded cycle of every id.
step1 = step1[step1['cycle'] == step1['Max']].drop('Max')

# Attach the ground-truth 'ttf' to each remaining row.
test = step1.join(truth, step1.id == truth.Ids, how='left').drop('Ids')
test.show()

# The preprocessing pipeline is fitted on the TRAINING frame (`step2`) and only
# applied to the test frame. Fitting the MinMaxScaler stages on the test rows
# would scale them with different min/max values than the data the models were
# trained on.
_test_feature_cols = [name for name in test.columns if name not in ['id', 'cycle', 'ttf', 'binary', 'ternary']]
_test_kept_columns = ['scaled_' + name for name in _test_feature_cols] + ['id', 'cycle', 'features', 'ttf', 'binary', 'ternary']
stept = Pipeline(stages=stage).fit(step2).transform(test).select(*_test_kept_columns)

In [8]:
# One test frame per target ('ttf', 'binary', 'ternary'); each exposes its
# target under the common column name 'label'.
test1, test2, test3 = (
    stept.select('features', target).withColumnRenamed(target, 'label')
    for target in ('ttf', 'binary', 'ternary')
)

# * MACHINE LEARNING

In [10]:
# All classifiers below work on the frames whose label is the 'binary' bucket.
trainDF, testDF = train2, test2

In [11]:
from pyspark.ml.classification import LogisticRegression

# Baseline logistic regression with default hyper-parameters.
lr = LogisticRegression()
lrModel = lr.fit(trainDF)

# Report the learned parameters.
for caption, value in (("Coefficients: ", lrModel.coefficients), ("Intercept: ", lrModel.intercept)):
    print(caption + str(value))

# Last expression of the cell: accuracy taken from the fitted model's summary.
lrModel.summary.accuracy

In [12]:
# Score the test frame with the baseline model and preview ten rows.
result = lrModel.transform(testDF)
_preview_columns = ["prediction", "label", "features"]
result.select(*_preview_columns).show(10)

In [13]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the test predictions with the evaluator's default settings.
evaluator = BinaryClassificationEvaluator()
_baseline_score = evaluator.evaluate(result)
print("AUC: %(result)s" % {"result": _baseline_score})

# `display` is not defined in this notebook; presumably the Databricks
# notebook built-in that renders the ROC table shown below -- TODO confirm.
display(lrModel, trainDF, "ROC")

False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.9999993808025834
0.0,0.0112359550561797,0.9999993808025834
0.0,0.0224719101123595,0.9999993647332792
0.0,0.0337078651685393,0.9999992472719322
0.0,0.0449438202247191,0.9999988588844384
0.0,0.0561797752808988,0.999998526772801
0.0,0.0674157303370786,0.9999983401197872
0.0,0.0786516853932584,0.999998272305648
0.0,0.0898876404494382,0.9999980411658588
0.0,0.1011235955056179,0.9999976765232944


In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Standard binary evaluator reading the 'label' column.
evaluator = BinaryClassificationEvaluator(labelCol='label')

# Two metrics are requested here: area under the ROC curve and under the PR curve.
auroc, auprc = (
    evaluator.evaluate(result, {evaluator.metricName: metric_name})
    for metric_name in ("areaUnderROC", "areaUnderPR")
)
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Refit the same estimator with a regularisation parameter of 0.08.
lr.setRegParam(0.08)
model = lr.fit(trainDF)

# `result` is re-bound to the predictions of the regularised model.
result = model.transform(testDF)
_regularised_score = BinaryClassificationEvaluator().evaluate(result)
print("evaluations %(result)s" % {"result": _regularised_score})

In [16]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

# Author's note: the label must be cast to float and the rows ordered by
# prediction, otherwise MulticlassMetrics does not work.
_label_as_float = F.col('label').cast(FloatType())
preds_and_labels = (
    result.select('prediction', 'label')
    .withColumn('label', _label_as_float)
    .orderBy('prediction')
    .select('prediction', 'label')
)

# Each row becomes a (prediction, label) tuple.
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())


In [17]:
# We now try every evaluation approach available.

In [18]:
from pyspark.ml.classification import LogisticRegression

# Fresh estimator for the cross-validated grid search; column names given explicitly.
logr = LogisticRegression(labelCol='label', featuresCol='features')


In [19]:
from pyspark.ml.tuning import ParamGridBuilder

# Search grid for `logr`: 4 regParam x 4 elasticNetParam x 3 maxIter values.
param_grid = (
    ParamGridBuilder()
    .addGrid(logr.regParam, [0, 0.1, 0.2, 0.5])
    .addGrid(logr.elasticNetParam, [0, 0.1, 0.2, 0.5])
    .addGrid(logr.maxIter, [5, 10, 20])
    .build()
)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluator used by later cells to score the cross-validated logistic-regression
# predictions. It reads scores from the 'rawPrediction' column. Later cells
# refer to it by this exact name, so do not rename it.
lrevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

In [21]:
from pyspark.ml.tuning import CrossValidator

# 3-fold cross-validation of `logr` over `param_grid`.
# The model-specific `lrevaluator` is used for model selection. The generic
# `evaluator` variable is re-bound by several other cells, so relying on it
# makes this cell depend on which cell happened to run last.
cv = CrossValidator(estimator=logr, evaluator=lrevaluator, estimatorParamMaps=param_grid, numFolds=3)
cv_model = cv.fit(trainDF)  # fit the cross-validator on the training frame

In [22]:
# Columns worth inspecting in a prediction frame.
show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']

# Predictions of the cross-validated model on the training frame; preview five rows untruncated.
pred_training_lr = cv_model.transform(trainDF)
pred_training_lr.select(*show_columns).show(n=5, truncate=False)

In [23]:
# Predictions of the cross-validated model on the test frame; preview five rows untruncated.
pred_test_lr = cv_model.transform(testDF)
pred_test_lr.select(*show_columns).show(n=5, truncate=False)

In [24]:
# Parameters of the model selected by cross-validation.
_best_lr = cv_model.bestModel
print('Intercept: ' + str(_best_lr.intercept) + "\n" + 'coefficients: ' + str(_best_lr.coefficients))

In [25]:
# Hyper-parameters of the selected model are read through its underlying Java object.
# The AUC printed here comes from the best model's own `summary`, not from the
# test predictions.
_best_lr_java = cv_model.bestModel._java_obj
print(
    'Logistic Regression', "\n",
    'The best RegParam is: ', _best_lr_java.getRegParam(), "\n",
    'The best ElasticNetParam is:', _best_lr_java.getElasticNetParam(), "\n",
    'The best Iteration is:', _best_lr_java.getMaxIter(), "\n",
    'Area under ROC is:', cv_model.bestModel.summary.areaUnderROC
)

In [26]:
# Test-set metrics of the cross-validated logistic regression.
# Both values are printed explicitly: a notebook cell only echoes its LAST
# expression, so a bare first `evaluate(...)` call would compute the ROC value
# and silently discard it.
lr_test_auroc = lrevaluator.evaluate(pred_test_lr, {lrevaluator.metricName: "areaUnderROC"})
lr_test_auprc = lrevaluator.evaluate(pred_test_lr, {lrevaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(lr_test_auroc))
print("Area under PR Curve: {:.4f}".format(lr_test_auprc))

In [27]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Baseline linear SVM with fixed hyper-parameters.
lsvc = LinearSVC(regParam=0.1, maxIter=10)
lsvcModel = lsvc.fit(trainDF)

# Report the learned parameters.
for caption, value in (("Coefficients: ", lsvcModel.coefficients), ("Intercept: ", lsvcModel.intercept)):
    print(caption + str(value))

# Score the test frame and preview ten rows.
lsvcresult = lsvcModel.transform(testDF)
lsvcresult.select(*["prediction", "label", "features"]).show(10)

# Evaluate the test predictions with the evaluator's default metric.
evaluator = BinaryClassificationEvaluator()
_svc_test_score = evaluator.evaluate(lsvcresult)
print("evaluation: %(result)s" % {"result": _svc_test_score})

In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Standard binary evaluator for the linear SVM predictions.
svmevaluator = BinaryClassificationEvaluator()

# Two metrics are requested here: area under the ROC curve and under the PR curve.
svmauroc, svmauprc = (
    svmevaluator.evaluate(lsvcresult, {svmevaluator.metricName: metric_name})
    for metric_name in ("areaUnderROC", "areaUnderPR")
)
print("Area under ROC Curve: {:.4f}".format(svmauroc))
print("Area under PR Curve: {:.4f}".format(svmauprc))

In [29]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Estimator
lsvm = LinearSVC(labelCol='label', featuresCol='features')

# Search grid: 4 regParam x 4 maxIter values.
param_grid_svm = (
    ParamGridBuilder()
    .addGrid(lsvm.regParam, [0, 0.1, 0.2, 0.5])
    .addGrid(lsvm.maxIter, [5, 10, 20, 50])
    .build()
)

# Evaluator reading the 'rawPrediction' column.
svmevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# 3-fold cross-validation fitted on the training frame.
cv_svm = CrossValidator(estimator=lsvm, evaluator=svmevaluator, estimatorParamMaps=param_grid_svm, numFolds=3)
cv_svm_model = cv_svm.fit(trainDF)

# No 'probability' column is selected for this model.
show_columns = ['features', 'label', 'prediction', 'rawPrediction']
pred_training_svm = cv_svm_model.transform(trainDF)
pred_training_svm.select(*show_columns).show(n=5, truncate=False)

pred_test_svm = cv_svm_model.transform(testDF)
pred_test_svm.select(*show_columns).show(n=5, truncate=False)

# Selected hyper-parameters (read through the Java object) and test AUC.
_best_svm_java = cv_svm_model.bestModel._java_obj
print(
    'Support Vector Machine', "\n",
    'The best RegParam is: ', _best_svm_java.getRegParam(), "\n",
    'The best Iteration is:', _best_svm_java.getMaxIter(), "\n",
    'Area under ROC is:', svmevaluator.evaluate(pred_test_svm, {svmevaluator.metricName: "areaUnderROC"})
)

In [30]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Baseline multinomial Naive Bayes with smoothing 1.0.
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# Train the model.
nbmodel = nb.fit(trainDF)

# Score the test frame with the Naive Bayes model itself. Using `model` here
# would score with the logistic regression fitted in an earlier cell, so every
# "Naive Bayes" number would actually describe that other model.
nbresult = nbmodel.transform(testDF)
nbresult.select("prediction", "label", "features").show(10)

# Evaluate the test predictions with the evaluator's default metric.
# NOTE: despite its name, `accuracy` holds the BinaryClassificationEvaluator
# result, not a classification accuracy; the name is kept for compatibility.
nbevaluator = BinaryClassificationEvaluator()
accuracy = nbevaluator.evaluate(nbresult)
print("evaluations: %(nbresult)s" % {"nbresult": accuracy})

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Standard binary evaluator for the `nbresult` predictions.
nbevaluator = BinaryClassificationEvaluator()

# Two metrics are requested here: area under the ROC curve and under the PR curve.
nbauroc, nbauprc = (
    nbevaluator.evaluate(nbresult, {nbevaluator.metricName: metric_name})
    for metric_name in ("areaUnderROC", "areaUnderPR")
)
print("Area under ROC Curve: {:.4f}".format(nbauroc))
print("Area under PR Curve: {:.4f}".format(nbauprc))

In [32]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Estimator
nb = NaiveBayes(labelCol='label', featuresCol='features')

# Search grid: 4 smoothing values, model type fixed to "multinomial".
param_grid_nb = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0, 0.5, 1, 2])
    .addGrid(nb.modelType, ["multinomial"])
    .build()
)

# Evaluator reading the 'rawPrediction' column.
nbevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# 3-fold cross-validation fitted on the training frame.
cv_nb = CrossValidator(estimator=nb, evaluator=nbevaluator, estimatorParamMaps=param_grid_nb, numFolds=3)
cv_nb_model = cv_nb.fit(trainDF)

show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']
pred_training_nb = cv_nb_model.transform(trainDF)
pred_training_nb.select(*show_columns).show(n=5, truncate=False)

pred_test_nb = cv_nb_model.transform(testDF)
pred_test_nb.select(*show_columns).show(n=5, truncate=False)

# Selected hyper-parameters (read through the Java object) and test AUC.
_best_nb_java = cv_nb_model.bestModel._java_obj
print(
    'Naive Bayes ', "\n",
    'The best Smoothening is: ', _best_nb_java.getSmoothing(), "\n",
    'The best model type is:', _best_nb_java.getModelType(), "\n",
    'Area under ROC is:', nbevaluator.evaluate(pred_test_nb, {nbevaluator.metricName: "areaUnderROC"})
)

##nbauroc = nbevaluator.evaluate(nbresult, {pred_test_nb.metricName: "areaUnderROC"})
#nbauprc = nbevaluator.evaluate(nbresult, {pred_test_nb.metricName: "areaUnderPR"})


In [34]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Baseline random forest with 10 trees, fitted on the training frame.
rf = RandomForestClassifier(numTrees=10)
rfmodel = rf.fit(trainDF)

# Score the test frame and preview ten rows.
rfresult = rfmodel.transform(testDF)
rfresult.select(*["prediction", "label", "features"]).show(10)

# Evaluate the test predictions with the evaluator's default metric.
rfevaluator = BinaryClassificationEvaluator()
_rf_test_score = rfevaluator.evaluate(rfresult)
print("evaluations: %(rfresult)s" % {"rfresult": _rf_test_score})



In [35]:
# Two metrics are requested here: area under the ROC curve and under the PR curve.
rfauroc, rfauprc = (
    rfevaluator.evaluate(rfresult, {rfevaluator.metricName: metric_name})
    for metric_name in ("areaUnderROC", "areaUnderPR")
)
print("Area under ROC Curve: {:.4f}".format(rfauroc))
print("Area under PR Curve: {:.4f}".format(rfauprc))

In [36]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Estimator
rf = RandomForestClassifier(labelCol='label', featuresCol='features')

# Search grid: 3 maxDepth x 4 minInfoGain x 7 numTrees values.
param_grid_rf = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 3, 4])
    .addGrid(rf.minInfoGain, [0.0, 0.1, 0.2, 0.3])
    .addGrid(rf.numTrees, [10, 20, 30, 40, 50, 60, 100])
    .build()
)

# Evaluator reading the 'rawPrediction' column.
rfevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# 3-fold cross-validation fitted on the training frame.
cv_rf = CrossValidator(estimator=rf, evaluator=rfevaluator, estimatorParamMaps=param_grid_rf, numFolds=3)
cv_rf_model = cv_rf.fit(trainDF)

show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']
pred_training_rf = cv_rf_model.transform(trainDF)
pred_training_rf.select(*show_columns).show(n=5, truncate=False)

pred_test_rf = cv_rf_model.transform(testDF)
pred_test_rf.select(*show_columns).show(n=5, truncate=False)

# Selected hyper-parameters (read through the Java object) and test AUC.
_best_rf_java = cv_rf_model.bestModel._java_obj
print(
    'Random forest ', "\n",
    'The best Max Depth is: ', _best_rf_java.getMaxDepth(), "\n",
    'The best min Info gain is:', _best_rf_java.getMinInfoGain(), "\n",
    'Area under ROC is:', rfevaluator.evaluate(pred_test_rf, {rfevaluator.metricName: "areaUnderROC"})
)
#rfmodel.trees
##nbauroc = rfevaluator.evaluate(pred_test_rf, {rfevaluator.metricName: "areaUnderROC"})
#nbauprc = rfevaluator.evaluate(pred_test_rf, {rfevaluator.metricName: "areaUnderPR"})

In [38]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Baseline gradient-boosted trees with default hyper-parameters, fitted on the training frame.
gb = GBTClassifier()
gbmodel = gb.fit(trainDF)

# Score the test frame and preview five rows.
gbresult = gbmodel.transform(testDF)
gbresult.select(*["prediction", "label", "features"]).show(5)

# Evaluate the test predictions with the evaluator's default metric.
gbevaluator = BinaryClassificationEvaluator()
_gb_test_score = gbevaluator.evaluate(gbresult)
print("evaluations: %(gbresult)s" % {"gbresult": _gb_test_score})


In [39]:
# Two metrics are requested here: area under the ROC curve and under the PR curve.
gbauroc, gbauprc = (
    gbevaluator.evaluate(gbresult, {gbevaluator.metricName: metric_name})
    for metric_name in ("areaUnderROC", "areaUnderPR")
)
print("Area under ROC Curve: {:.4f}".format(gbauroc))
print("Area under PR Curve: {:.4f}".format(gbauprc))

In [40]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Estimator
gbt = GBTClassifier(featuresCol='features', labelCol='label')

# Search grid: 2 maxDepth x 2 minInfoGain x 2 stepSize values.
param_grid_gbt = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 3])
    .addGrid(gbt.minInfoGain, [0.0, 0.1])
    .addGrid(gbt.stepSize, [0.02, 0.05])
    .build()
)

# Evaluator reading the 'rawPrediction' column.
gbtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Cross-validation with the default number of folds (numFolds is not set here).
cv_gbt = CrossValidator(estimator=gbt, evaluator=gbtevaluator, estimatorParamMaps=param_grid_gbt)
cv_gbt_model = cv_gbt.fit(trainDF)  # fit the cross-validator on the training frame

show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']
pred_training_gbt = cv_gbt_model.transform(trainDF)
pred_training_gbt.select(show_columns).show(5, truncate=False)

pred_test_gbt = cv_gbt_model.transform(testDF)
pred_test_gbt.select(show_columns).show(5, truncate=False)

# Selected hyper-parameters (read through the Java object) and test AUC.
# The AUC must be computed from the GBT evaluator and the GBT test predictions;
# using `rfevaluator` / `pred_test_rf` would report the random-forest score
# under the "Gradient Boosting" heading.
_best_gbt_java = cv_gbt_model.bestModel._java_obj
gbt_test_auroc = gbtevaluator.evaluate(pred_test_gbt, {gbtevaluator.metricName: "areaUnderROC"})
print(
    'Gradient Boosting ', "\n",
    'The best Max Depth is: ', _best_gbt_java.getMaxDepth(), "\n",
    'The best min Info gain is:', _best_gbt_java.getMinInfoGain(), "\n",
    'step size: ', _best_gbt_java.getStepSize(), "\n",
    'Area under ROC is:', gbt_test_auroc
)

In [41]:
from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics expects an RDD of (prediction, label) pairs, in that order.
# Passing (label, prediction) swaps the roles of the two columns, which
# transposes the confusion matrix and exchanges precision and recall.
scoreandlabels = gbresult.rdd.map(lambda row: (row["prediction"], row["label"]))
metric = MulticlassMetrics(scoreandlabels)


In [42]:
#model_stages = cv + cv_svm + cv_nb+cv_rf +cv_gbt
#pipelined_models = Pipeline(stages= model_stages).fit(trainDF)
#pipelined_result = pipelined_models.transform(testDF)

In [43]:
# Test-set area under the ROC curve for every cross-validated model.
# All five rows use the same recipe (the model's own evaluator on the model's
# own test predictions) so the numbers are comparable:
#  - logistic regression is scored on `pred_test_lr` rather than via
#    `bestModel.summary`, which is not computed from the test predictions;
#  - gradient boosting passes its own evaluator's `metricName` param rather
#    than the random-forest evaluator's.
print('Models and their Performance', "\n")
print('Logistic Regression', lrevaluator.evaluate(pred_test_lr, {lrevaluator.metricName: "areaUnderROC"}))
print('Support Vector Machine', svmevaluator.evaluate(pred_test_svm, {svmevaluator.metricName: "areaUnderROC"}))
print('Naive Bayes', nbevaluator.evaluate(pred_test_nb, {nbevaluator.metricName: "areaUnderROC"}))
print('Random forest', rfevaluator.evaluate(pred_test_rf, {rfevaluator.metricName: "areaUnderROC"}))
print('Gradient Boost', gbtevaluator.evaluate(pred_test_gbt, {gbtevaluator.metricName: "areaUnderROC"}))

In [44]:
from pyspark.sql.types import *

# Layout of the comparison table: one row per model, train/test AUC and PR.
schema = StructType([
    StructField('model', StringType(), False),
    StructField('train AUC', DoubleType(), True),
    StructField('test AUC', DoubleType(), True),
    StructField('train PR', DoubleType(), True),
    StructField('test PR', DoubleType(), True),
])

# (label, evaluator, training predictions, test predictions) for each
# cross-validated model. An explicit table replaces eval() on concatenated
# variable names, and avoids shadowing the builtin `list` or overwriting the
# `model` variable defined by an earlier cell.
_cv_outputs = [
    ('lr', lrevaluator, pred_training_lr, pred_test_lr),
    ('svm', svmevaluator, pred_training_svm, pred_test_svm),
    ('nb', nbevaluator, pred_training_nb, pred_test_nb),
    ('rf', rfevaluator, pred_training_rf, pred_test_rf),
    ('gbt', gbtevaluator, pred_training_gbt, pred_test_gbt),
]

_summary_rows = []
for model_name, evaluators, training_set, test_set in _cv_outputs:
    trnau = evaluators.evaluate(training_set, {evaluators.metricName: "areaUnderROC"})
    tstau = evaluators.evaluate(test_set, {evaluators.metricName: "areaUnderROC"})

    # The training PR value must come from the TRAINING predictions; evaluating
    # the test set twice would make the 'train PR' column a copy of 'test PR'.
    trnpr = evaluators.evaluate(training_set, {evaluators.metricName: "areaUnderPR"})
    tstpr = evaluators.evaluate(test_set, {evaluators.metricName: "areaUnderPR"})

    _summary_rows.append((model_name, trnau, tstau, trnpr, tstpr))

# Build the frame once from all rows instead of union-ing one-row frames in a loop.
df = spark.createDataFrame(_summary_rows, schema)
columns = df.columns
  

In [45]:
# Print the per-model comparison table assembled in the previous cell.
df.show()