# Final predictions


In [None]:
import pyspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("testca")
    .getOrCreate()
)

# Schema of the training CSV, declared in file column order:
# the integer "id" and 14 integer columns, 13 float columns, 7 integer
# fault columns, and a final string "target_col". Every field is nullable.
steel_schema = StructType(
    [
        StructField(name, IntegerType(), True)
        for name in (
            "id",
            "X_minimum",
            "X_maximum",
            "Y_minimum",
            "Y_maximum",
            "Pixels_areas",
            "X_perimeter",
            "Y_perimeter",
            "Sum_of_luminosity",
            "Minimum_luminosity",
            "Maximum_luminosity",
            "Length_conveyer",
            "TypeSteel_A300",
            "TypeSteel_A400",
            "SteelPlate_thickness",
        )
    ]
    + [
        StructField(name, FloatType(), True)
        for name in (
            "Edges_index",
            "Empty_index",
            "Square_index",
            "OutsideX_index",
            "EdgesX_index",
            "EdgesY_index",
            "OutsideGlobal_index",
            "LogOfAreas",
            "LogX_index",
            "LogY_index",
            "Orientation_index",
            "Luminosity_index",
            "SigmoidOfAreas",
        )
    ]
    + [
        StructField(name, IntegerType(), True)
        for name in (
            "Pastry",
            "Z_scratch",
            "K_scratch",
            "Stains",
            "Dirtiness",
            "Bumps",
            "Other_faults",
        )
    ]
    + [StructField("target_col", StringType(), True)]
)
# Schema of the prediction CSV: the same "id" + feature columns as the
# training schema, i.e. steel_schema without its trailing 8 fields
# (the 7 fault columns and "target_col").
steelpred_schema = StructType(steel_schema.fields[:-8])

#import dataset
steeltrain_path = "dataset/train_clean.csv"
steelpred_path = "dataset/test.csv"
steel_train = spark.read.csv(steeltrain_path, header=True, schema=steel_schema)
steel_pred = spark.read.csv(steelpred_path, header=True, schema=steelpred_schema)

# cache() has to be *called*: a bare `steel_train.cache` only looks the
# method up and caches nothing. Both frames are reused once per target
# pipeline further down, so caching them avoids re-reading the CSV files.
steel_train.cache()
steel_pred.cache()

#extract col names
# The last column is "target_col"; the 7 columns before it are the targets.
target_cols = steel_train.columns[-8:-1]
# Everything between "id" (first column) and the targets is a feature.
feature_cols = steel_train.columns[1:-8]
print(feature_cols,"\n", target_cols)

In [None]:
##cross validator

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, udf

## MODEL CREATION
# Feature stages shared by every per-target pipeline: assemble the feature
# columns into one vector, then standardise it.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")


def _build_pipeline(label_col):
    """Return an unfitted pipeline that tunes a binary MLP for `label_col`.

    The pipeline assembles and scales the features, then runs a 3-fold
    cross-validation (scored by F1) over maxIter x stepSize for the MLP.
    """
    mlp = MultilayerPerceptronClassifier(
        featuresCol="scaled_features",
        labelCol=label_col,
        # input layer = number of features, two hidden layers, 2 output classes
        layers=[len(feature_cols), 13, 6, 2],
    )
    f1_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, metricName="f1")
    # NOTE(review): stepSize appears to be used only by the "gd" solver; with
    # the default solver this half of the grid may just repeat the same fits
    # -- confirm before relying on it.
    grid = (
        ParamGridBuilder()
        .addGrid(mlp.maxIter, [50, 100, 200])
        .addGrid(mlp.stepSize, [0.03, 0.1])
        .build()
    )
    tuner = CrossValidator(
        estimator=mlp,
        estimatorParamMaps=grid,
        evaluator=f1_evaluator,
        numFolds=3,
    )
    return Pipeline(stages=[assembler, scaler, tuner])


#one pipeline per target class, in target_cols order
pipelines = [_build_pipeline(label_col) for label_col in target_cols]

#fit every per-target pipeline on the training data (same order as pipelines)
models = [unfitted.fit(steel_train) for unfitted in pipelines]


##EXTRACTING & CONCATENATING PROBABILITIES FOR SUBMISSION
# Start from the ids of the rows to score; one probability column per target
# is joined onto this frame below.
combined_probs = steel_pred.select('id')

# models and target_cols are index-aligned, so walk them together.
for target_name, fitted in zip(target_cols, models):
    # Give each model's "probability" output a target-specific name so the
    # seven outputs can live side by side in one frame.
    prob_col_name = f"probability_{target_name}"
    scored = fitted.transform(steel_pred).withColumnRenamed("probability", prob_col_name)

    # Attach this target's probability vector to the combined frame by id.
    combined_probs = combined_probs.join(
        scored.select('id', prob_col_name),
        on='id',
        how='inner',
    )

#extract column vectors
# Every column after "id" holds one probability vector.
comb_cols = combined_probs.columns[1:]

# Convert each probability vector into a float32 array column named
# "<original>_vec". All conversions happen in one projection of
# combined_probs instead of one inner self-join on id per column: the
# resulting columns are the same, without the repeated joins (and without
# multiplying rows should an id ever appear more than once).
combined_df1 = combined_probs.select(
    "id",
    *[
        vector_to_array(col(vec_name), "float32").alias(vec_name + "_vec")
        for vec_name in comb_cols
    ],
)

#define function to return element index 1 = true probability from arrays
def get_true_prob(array):
    """Return the element at index 1 of `array`.

    Returns None when `array` is None or has fewer than two elements.
    """
    if array is None or len(array) <= 1:
        return None
    return array[1]
# Wrap the extractor as a Spark UDF with an explicit float return type.
# Without a declared type, udf() falls back to a string return type and every
# probability column would come out as text instead of a number.
get_udf = udf(get_true_prob, FloatType())

#array column names of probabilities
new_cols = combined_df1.columns[1:]

#pair each array column with its target name and add the index-1 probability
#as a new column named after the target
for array_col_name, target_name in zip(new_cols, target_cols):
    combined_df1 = combined_df1.withColumn(target_name, get_udf(col(array_col_name)))

#select relevant columns for submission in final new df: targets first, id last
final_cols = [*target_cols, "id"]
final = combined_df1.select(final_cols)

#check all is well
final.show(5)

In [None]:
#write the submission frame to CSV, with a header row
# NOTE(review): Spark's CSV writer normally produces a directory of part
# files at this path -- confirm that is what the submission step expects.
submission_writer = final.write.option("header", True)
submission_writer.csv("dataset/final_submission.csv")

In [None]:
# Shut down the `spark` session once the submission has been written.
spark.stop()