# Entrenamiento de Modelos

### Alejandro Villanueva Noriega
#### 01 de septiembre de 2021
###### Aplicación de técnicas de Machine Learning a la predicción de fallos de discos mediante el uso de Spark

In [None]:
#Pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark import StorageLevel, SparkConf

from pyspark.ml.feature import StandardScaler, VectorAssembler, PCA
from pyspark.mllib.linalg import SparseVector, DenseVector, VectorUDT
from sklearn.metrics import classification_report
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import LogisticRegression, LinearSVC, LinearSVCModel, RandomForestClassifier

#Basics
import pandas as pd
from datetime import datetime, timedelta
import seaborn as sns
import copy
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# SMART raw-value columns used as model inputs, followed by the target column.
# Order matters: later code takes the first seven entries as features, so
# 'label' must stay last.
important_attributes = [
    'smart_Reallocated_Sector_Ct_raw_value',
    'smart_Power_Cycle_Count_raw_value',
    'smart_Reported_Uncorrect_raw_value',
    'smart_Command_Timeout_raw_value',
    'smart_High_Fly_Writes_raw_value',
    'smart_Offline_Uncorrectable_raw_value',
    'smart_UDMA_CRC_Error_Count_raw_value',
    'label',
]

In [None]:
# Load the ETL output from Parquet. `spark` is not created in this notebook;
# presumably it is the SparkSession injected by the notebook kernel - TODO confirm.
# Available datasets (swap the path to change the lag):
df = spark.read.parquet('etl/etldone_1.parquet')
#df = spark.read.parquet('etl/etldone.parquet') 7-day lag
#df = spark.read.parquet('etl/etldone_1.parquet') 1-day lag
#df = spark.read.parquet('etl/etldone_2.parquet') 2-day lag

In [None]:
# Total number of rows loaded.
df.count()

In [None]:
# Preview the first 10 rows as a pandas DataFrame.
df.limit(10).toPandas()

In [None]:
# Display the list of selected columns.
important_attributes

In [None]:
# Keep only the selected SMART attributes plus the target column.
# (Single-attribute experiment: select just
# "smart_Reallocated_Sector_Ct_raw_value" and "label" here and use
# important_attributes[:1] as the assembler inputs below.)
input_data = df.select(*important_attributes)

# Pack the first seven entries of important_attributes (everything except the
# trailing 'label') into one vector column called "features".
vecAssembler = VectorAssembler(inputCols=important_attributes[:7], outputCol="features")

# Downstream stages only need the label and the assembled vector.
output = vecAssembler.transform(input_data).select("label", "features")

In [None]:
# Alternative, RDD-based way to build the (label, features) DataFrame; kept for reference only.
#input_data = df.select(important_attributes)
#input_data = input_data.rdd.map(lambda x: (x[7], DenseVector(x[:7])))
#output = spark.createDataFrame(input_data, ["label", "features"])

In [None]:
output.show(5)

In [None]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)
scalerModel = scaler.fit(output)
scaledData = scalerModel.transform(output).select('label', 'scaledFeatures')

In [None]:
scaledData = scaledData.withColumnRenamed('scaledFeatures', 'features')

In [None]:
scaledData = scaledData.select( "features", scaledData.label.cast(LongType()))

In [None]:
scaledData.show(5)

In [None]:
scaledData.count()

In [None]:
train, test = scaledData.randomSplit([0.6,0.4], seed = 2)

### Undersampling

In [None]:
# Split the training data by class: label 1 goes to failDf, label 0 to nofailDf.
failDf = train.filter(train.label == 1)
nofailDf = train.filter(train.label == 0)

# Fraction of the label-0 rows to keep so that the sampled subset is roughly
# as large as the label-1 subset. The fraction is applied to nofailDf, so the
# denominator must be the label-0 count, not the size of the whole training
# set (which would under-sample slightly too much).
failCount = failDf.count()
nofailCount = nofailDf.count()
if nofailCount > 0:
    sampleRatio = float(failCount) / float(nofailCount)
else:
    # No label-0 rows at all: nothing to sample.
    sampleRatio = 0.0
# A sampling fraction without replacement cannot exceed 1.0.
# (The builtin min() is avoided: it is shadowed by pyspark.sql.functions.min.)
if sampleRatio > 1.0:
    sampleRatio = 1.0

# Sample without replacement; the fixed seed keeps the subset reproducible.
nofailDfSampleDf = nofailDf.sample(False, sampleRatio, seed=123)

In [None]:
# Number of label-0 rows kept by the undersampling step.
nofailDfSampleDf.count()

In [None]:
# Number of label-1 rows in the training split.
failDf.count()

In [None]:
# Size of the training split (at this point `train` is still the full split;
# it is only replaced by the undersampled union in a later cell).
train.count()

In [None]:
# Size of the test split.
test.count()

In [None]:
# Number of label-1 rows in the test split (the SQL expression compares
# label against the string "1").
test.where('label="1"').count()

In [None]:
train = failDf.unionAll(nofailDfSampleDf)

In [None]:
train = train.select(train.features, train.label.cast('double'))

In [None]:
# Register the training DataFrame as the temporary SQL view "data55".
train.createOrReplaceTempView("data55")

In [None]:
# Count of label-0 rows in the training view.
spark.sql("SELECT COUNT(*) as L0 FROM data55 where label=0").show()

In [None]:
# Count of label-1 rows in the training view.
spark.sql("SELECT COUNT(*) as L1 FROM data55 where label=1").show()

In [None]:
# Register the test DataFrame as the temporary SQL view "data56".
test.createOrReplaceTempView("data56")

In [None]:
# Count of label-0 rows in the test view.
spark.sql("SELECT COUNT(*) as L0 FROM data56 where label=0").show() 

In [None]:
# Count of label-1 rows in the test view.
spark.sql("SELECT COUNT(*) as L1 FROM data56 where label=1").show()

### PCA

In [None]:
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(train)
train_pca = model.transform(train).select("pcaFeatures", 'label')
train_pca = train_pca.withColumnRenamed("pcaFeatures", 'features')
train_pca.limit(10).show(truncate=False)

In [None]:
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(test)
test_pca = model.transform(test).select("pcaFeatures",'label')
test_pca = test_pca.withColumnRenamed("pcaFeatures", 'features')
test_pca.limit(10).show(truncate=False)

In [None]:
print train_pca.count()
print test_pca.count()

# Logistic Regression

In [None]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, featuresCol='features', labelCol='label')

# Fit the model
lrModel = lr.fit(train_pca)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

In [None]:
predictions_train = lrModel.transform(train_pca)

In [None]:
predictions_test = lrModel.transform(test_pca)

In [None]:
evaluator = BinaryClassificationEvaluator()
print 'Test Area Under ROC ' + str(evaluator.evaluate(predictions_test))

In [None]:
evaluator = BinaryClassificationEvaluator()
print 'Train Area Under ROC ' + str(evaluator.evaluate(predictions_train))

In [None]:
paramGrid = (ParamGridBuilder().addGrid(lr.regParam, [0.001,0.01])
             .addGrid(lr.elasticNetParam, [0.0,0.5,1.0])
             .addGrid(lr.maxIter, [20,30,100])
             .addGrid(lr.threshold, [0.5])
             .build())

In [None]:
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)

In [None]:
cvModel = cv.fit(train_pca)

In [None]:
predictions_test_2 = cvModel.transform(test_pca)
print 'Test Area Under ROC ' + str(evaluator.evaluate(predictions_test_2))

In [None]:
predictions_train_2 = cvModel.transform(train_pca)
print 'Train Area Under ROC ' + str(evaluator.evaluate(predictions_train_2))

In [None]:
best_model = cvModel.bestModel

In [None]:
print best_model._java_obj.getRegParam()
print best_model._java_obj.getMaxIter()
print best_model._java_obj.getElasticNetParam()

In [None]:
ccm_lg = predictions_train_2.select('label', 'prediction').toPandas()
confusion_matrix = pd.crosstab(ccm_lg['label'], ccm_lg['prediction'], rownames=['Actual'], colnames=['Prediccion'])
print (confusion_matrix)
sns.heatmap(confusion_matrix, annot=True, cmap="Blues", fmt='g', cbar_kws={"orientation": "horizontal"})

In [None]:
ccm_lg = predictions_test_2.select('label', 'prediction').toPandas()
confusion_matrix = pd.crosstab(ccm_lg['label'], ccm_lg['prediction'], rownames=['Actual'], colnames=['Prediccion'])
print (confusion_matrix)
sns.heatmap(confusion_matrix, annot=True, cmap="Blues", fmt='g', cbar_kws={"orientation": "horizontal"})
plt.savefig('results/CM_LG_2DAYS.pdf')


In [None]:
report = classification_report(ccm_lg['label'], ccm_lg['prediction'], output_dict=True)

In [None]:
pd.DataFrame(report).transpose()

In [None]:
plt.figure(figsize=(5,5))
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(best_model.summary.roc.select('FPR').collect(),
         best_model.summary.roc.select('TPR').collect())
plt.xlabel('FP')
plt.ylabel('TP')
plt.savefig('results/ROC_LG_2DAYS.pdf')

# SVM

In [None]:
lsvc = LinearSVC(maxIter=10, regParam=0.1)

In [None]:
lsvcModel = lsvc.fit(train_pca)

In [None]:
coefficients = lsvcModel.coefficients
intercept = lsvcModel.intercept
print("Some coefficients: " + str(coefficients[250:300]))
print("Intercept: " + str(intercept))

In [None]:
predictions_train_SVM = lsvcModel.transform(train_pca)

In [None]:
predictions_test_SVM = lsvcModel.transform(test_pca)

In [None]:
evaluator = BinaryClassificationEvaluator()
print 'Test Area Under ROC ' + str(evaluator.evaluate(predictions_test_SVM))

In [None]:
evaluator = BinaryClassificationEvaluator()
print 'Train Area Under ROC ' + str(evaluator.evaluate(predictions_train_SVM))

In [None]:
print 'Accuracy ' + str(predictions_test_SVM.filter(predictions_test_SVM.label == predictions_test_SVM.prediction).count()/float(predictions_test_SVM.count()))

In [None]:
paramGrid = (ParamGridBuilder().addGrid(lsvc.regParam, [ 0.1])
                                       .addGrid(lr.maxIter, [1,5,10])
             .build())

In [None]:
cv = CrossValidator(estimator=lsvc, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

In [None]:
cvModel = cv.fit(train_pca)

In [None]:
predictions_test_2_SVM = cvModel.transform(test_pca)
print 'Test Area Under ROC ' + str(evaluator.evaluate(predictions_test_2_SVM))

In [None]:
predictions_train_2_SVM = cvModel.transform(train_pca)
print 'Train Area Under ROC ' + str(evaluator.evaluate(predictions_train_2_SVM))

In [None]:
print 'Accuracy ' + str(predictions_test_2_SVM.filter(predictions_test_2_SVM.label == predictions_test_2_SVM.prediction).count()/float(predictions_test_2_SVM.count()))

In [None]:
best_model_SVM = cvModel.bestModel

In [None]:
print best_model_SVM._java_obj.getRegParam()
print best_model_SVM._java_obj.getMaxIter()

In [None]:
ccm_lg = predictions_train_2.select('label', 'prediction').toPandas()
confusion_matrix = pd.crosstab(ccm_lg['label'], ccm_lg['prediction'], rownames=['Actual'], colnames=['Prediccion'])
print (confusion_matrix)
sns.heatmap(confusion_matrix, annot=True, cmap="Blues", fmt='g', cbar_kws={"orientation": "horizontal"})

In [None]:
ccm_svm = predictions_test_2_SVM.select('label', 'prediction').toPandas()
confusion_matrix = pd.crosstab(ccm_svm['label'], ccm_svm['prediction'], rownames=['Actual'], colnames=['Prediccion'])
print (confusion_matrix)
sns.heatmap(confusion_matrix, annot=True, cmap="Blues", fmt='g', cbar_kws={"orientation": "horizontal"})
plt.savefig('results/CM_SVM_2DAYS.pdf')

In [None]:
report = classification_report(ccm_svm['label'], ccm_svm['prediction'], output_dict=True)

In [None]:
pd.DataFrame(report).transpose()

# Random Forest

In [None]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', numTrees=100 )
rfModel = rf.fit(train_pca)

In [None]:
predictions_train_RF = rfModel.transform(train_pca)

In [None]:
predictions_test_RF = rfModel.transform(test_pca)

In [None]:
evaluator = BinaryClassificationEvaluator()
print 'Test Area Under ROC ' + str(evaluator.evaluate(predictions_test_RF))

In [None]:
evaluator = BinaryClassificationEvaluator()
print 'Train Area Under ROC ' + str(evaluator.evaluate(predictions_train_RF))

In [None]:
print 'Accuracy ' + str(predictions_test_RF.filter(predictions_test_RF.label == predictions_test_RF.prediction).count()/float(predictions_test_RF.count()))

In [None]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [50,100,150,300])
             .addGrid(rf.maxDepth, [1,2,3])
             .build())

In [None]:
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=20)

In [None]:
cvModel = cv.fit(train_pca)

In [None]:
predictions_test_2 = cvModel.transform(test_pca)
print 'Test Area Under ROC ' + str(evaluator.evaluate(predictions_test_2))

In [None]:
predictions_train_2 = cvModel.transform(train_pca)
print 'Train Area Under ROC ' + str(evaluator.evaluate(predictions_train_2))

In [None]:
best_model_RF = cvModel.bestModel

In [None]:
print best_model_RF._java_obj.getMaxDepth()
print best_model_RF._java_obj.getNumTrees()

In [None]:
ccm_lg = predictions_train_2.select('label', 'prediction').toPandas()
confusion_matrix = pd.crosstab(ccm_lg['label'], ccm_lg['prediction'], rownames=['Actual'], colnames=['Prediccion'])
print (confusion_matrix)
sns.heatmap(confusion_matrix, annot=True, cmap="Blues", fmt='g', cbar_kws={"orientation": "horizontal"})

In [None]:
ccm_lg = predictions_test_2.select('label', 'prediction').toPandas()
confusion_matrix = pd.crosstab(ccm_lg['label'], ccm_lg['prediction'], rownames=['Actual'], colnames=['Prediccion'])
print (confusion_matrix)
sns.heatmap(confusion_matrix, annot=True, cmap="Blues", fmt='g', cbar_kws={"orientation": "horizontal"})
plt.savefig('results/CM_RF_UNDER.pdf')

In [None]:
report = classification_report(ccm_lg['label'], ccm_lg['prediction'], output_dict=True)

In [None]:
pd.DataFrame(report).transpose()