In [1]:
# upload data and print schema
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Clean Metadata').getOrCreate()

df_structured_raw = spark.read.parquet("/expanse/lustre/projects/uci150/hzhao16/clean_metadata_v2.parquet")

df_structured_raw.printSchema()

root
 |-- processid: string (nullable = true)
 |-- sampleid: string (nullable = true)
 |-- taxon: string (nullable = true)
 |-- phylum: string (nullable = true)
 |-- class: string (nullable = true)
 |-- order: string (nullable = true)
 |-- family: string (nullable = true)
 |-- dna_bin: string (nullable = true)
 |-- dna_barcode: string (nullable = true)
 |-- country: string (nullable = true)
 |-- province_state: string (nullable = true)
 |-- coord-lat: float (nullable = true)
 |-- coord-lon: float (nullable = true)
 |-- image_measurement_value: double (nullable = true)
 |-- area_fraction: double (nullable = true)
 |-- scale_factor: double (nullable = true)
 |-- inferred_ranks: integer (nullable = true)
 |-- split: string (nullable = true)
 |-- k_mer: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- one_hot_encoded: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: integer (containsNull = true)
 |-- sequence_length:

In [2]:
# select several columns to make predictions on 'order'
selected_columns = [
    'coord-lat', 'coord-lon', 
    'image_measurement_value', 
    'area_fraction', 'scale_factor',
    'sequence_length', 'inferred_ranks',
    'order'
]

# drop missing values
df_structured_model =df_structured_raw.select(*selected_columns).dropna()

In [3]:

from pyspark.ml.feature import StringIndexer
label_indexer = StringIndexer(inputCol= 'order', outputCol = 'label')

from pyspark.ml.feature import VectorAssembler

assmeber = VectorAssembler(
    inputCols = [
    'coord-lat', 'coord-lon', 
    'image_measurement_value', 
    'area_fraction', 'scale_factor', 
    'sequence_length', 'inferred_ranks',
],
    outputCol = 'features'
)

from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='label', featuresCol='features', numTrees=15, maxDepth=5)

In [4]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = [label_indexer, assmeber, rf])

# split data into training data and test data
df_structured_train, df_structured_test = df_structured_model.randomSplit([0.8, 0.2], seed = 42)
model_structured_rf = pipeline.fit(df_structured_train)

In [5]:
df_structured_pred = model_structured_rf.transform(df_structured_test)

In [6]:
# make predictions on test data
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator_acc = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='accuracy'
)

evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='f1'
)

# evaluate test accuracy and macro-F1
accuracy = evaluator_acc.evaluate(df_structured_pred)
f1 = evaluator_f1.evaluate(df_structured_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Macro F1 Score: {f1:.4f}')

Accuracy: 0.7556
Macro F1 Score: 0.6777


In [8]:
# make predictions on training data
evaluator_acc_train = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='accuracy'
)

evaluator_f1_train = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='f1'
)

# evaluate traing accuracy and macro-F1
train_predictions = model_structured_rf.transform(df_structured_train)

train_accuracy = evaluator_acc_train.evaluate(train_predictions)
train_f1 = evaluator_f1_train.evaluate(train_predictions)

print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Training Macro F1 Score: {train_f1:.4f}')


Training Accuracy: 0.7552
Training Macro F1 Score: 0.6772


In [None]:
# Fitting Graph Analysis:
# The training accuracy (0.7552) and test accuracy (0.7556) are very close,
# and the macro F1 scores are also similar. This indicates that the model is
# neither underfitting nor overfitting â€” it fits the data well.

# Our model is currently in the "Good Fit" zone on the fitting graph.
