In [1]:
# Import packages
import os
from google.colab import drive

# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

# Import Remaining Packages (Post-Install)
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql import functions as F
from pyspark.sql.functions import round, col
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [43.3 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [850 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,235 kB]
Hit:12 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get

In [2]:
# Create a SparkSession
spark = SparkSession.builder.appName("SparkML").getOrCreate()
spark

In [3]:
# Connect to google drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
# Read in the csv data from google drive
# Double check to make sure your pathway to the csv's is the same!!!!
# spark.sparkContext.addFile('/content/gdrive/MyDrive/Bootcamp/Project4/Data/test_timeseries.csv')
# test_timeseries_df = spark.read.csv(SparkFiles.get('/content/gdrive/MyDrive/Bootcamp/Project4/Data/test_timeseries.csv'), sep=",", header=True, inferSchema=True)

spark.sparkContext.addFile('/content/gdrive/MyDrive/Colab Notebooks/Project4/Data/test_timeseries.csv')
test_timeseries_df = spark.read.csv(SparkFiles.get('/content/gdrive/MyDrive/Colab Notebooks/Project4/Data/test_timeseries.csv'), sep=",", header=True, inferSchema=True)

In [8]:
# Columns that we do not want to include as features
exclude_columns = ['score', 'fips', 'date']

# Set feature columns (excluding non-feature columns)
feature_columns = [col_name for col_name in test_timeseries_df.columns if col_name not in exclude_columns]
feature_columns

['PRECTOT',
 'PS',
 'QV2M',
 'T2M',
 'T2MDEW',
 'T2MWET',
 'T2M_MAX',
 'T2M_MIN',
 'T2M_RANGE',
 'TS',
 'WS10M',
 'WS10M_MAX',
 'WS10M_MIN',
 'WS10M_RANGE',
 'WS50M',
 'WS50M_MAX',
 'WS50M_MIN',
 'WS50M_RANGE']

In [9]:
# PREPARE TEST DATA Part 1
# Drop NA in score column
test_timeseries_df = test_timeseries_df.na.drop(subset=["score"])

# Round score column
test_timeseries_df = test_timeseries_df.withColumn('score', round(test_timeseries_df['score']))

# Cast score column to integer
test_timeseries_df = test_timeseries_df.withColumn('score', col('score').cast('integer'))

# Show adjusted score column
test_timeseries_df.groupBy('score').count().orderBy(F.desc('count')).show()

+-----+------+
|score| count|
+-----+------+
|    0|247720|
|    1| 42486|
|    2| 22159|
|    3|  9590|
|    4|  3578|
|    5|   807|
+-----+------+



In [10]:
# PREPARE TEST DATA Part 2
# Create VectorAssembler instance to combine feature columns into a single dense vector column
assembler_test = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Apply VectorAssembler to create the dense vector column
test_df_with_features = assembler_test.transform(test_timeseries_df)

# Apply feature scaling using StandardScaler
test_scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)

# Fit the StandardScaler to compute mean and standard deviation
test_scaler_model = test_scaler.fit(test_df_with_features)

# Transform the DataFrame to get scaled feature vectors
test_df_with_scaled_features = test_scaler_model.transform(test_df_with_features)
test_df_with_scaled_features.show()

+----+----------+-------+------+-----+-----+------+------+-------+-------+---------+-----+-----+---------+---------+-----------+-----+---------+---------+-----------+-----+--------------------+--------------------+
|fips|      date|PRECTOT|    PS| QV2M|  T2M|T2MDEW|T2MWET|T2M_MAX|T2M_MIN|T2M_RANGE|   TS|WS10M|WS10M_MAX|WS10M_MIN|WS10M_RANGE|WS50M|WS50M_MAX|WS50M_MIN|WS50M_RANGE|score|            features|     scaled_features|
+----+----------+-------+------+-----+-----+------+------+-------+-------+---------+-----+-----+---------+---------+-----------+-----+---------+---------+-----------+-----+--------------------+--------------------+
|1001|2019-01-01|   2.25|100.51| 9.69|14.71| 13.55| 13.52|  17.38|  10.92|     6.46|14.63|  1.2|      1.5|     0.79|       0.71| 2.74|     4.01|     1.23|       2.78|    0|[2.25,100.51,9.69...|[-0.0834925938573...|
|1001|2019-01-08|   0.05|100.57| 8.75|13.15| 12.01| 11.98|  20.04|   9.36|    10.68|13.05| 2.33|      3.4|     1.68|       1.72| 5.13|     6

In [12]:
from pyspark.ml.classification import RandomForestClassificationModel
# model_path = "/content/gdrive/MyDrive/Bootcamp/Project4/Trained_Models/rf_model"
model_path = "/content/gdrive/MyDrive/Colab Notebooks/Project4/Trained_Models/rf_model"

# Load the saved model
rf_model = RandomForestClassificationModel.load(model_path)

In [14]:
# Make predictions
predictions = rf_model.transform(test_df_with_scaled_features)
predictions

DataFrame[fips: int, date: date, PRECTOT: double, PS: double, QV2M: double, T2M: double, T2MDEW: double, T2MWET: double, T2M_MAX: double, T2M_MIN: double, T2M_RANGE: double, TS: double, WS10M: double, WS10M_MAX: double, WS10M_MIN: double, WS10M_RANGE: double, WS50M: double, WS50M_MAX: double, WS50M_MIN: double, WS50M_RANGE: double, score: int, features: vector, scaled_features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [15]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="score", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.7590856162284734


In [16]:
# Get feature importance
importances = rf_model.featureImportances
for i in range(len(feature_columns)):
    print(f"Importance of feature {feature_columns[i]}: {importances[i]}")

Importance of feature PRECTOT: 0.02393133981054897
Importance of feature PS: 0.22410796588629708
Importance of feature QV2M: 0.04032705773102391
Importance of feature T2M: 0.09925200784951221
Importance of feature T2MDEW: 0.02695971018692035
Importance of feature T2MWET: 0.018289601649166758
Importance of feature T2M_MAX: 0.2671125308908474
Importance of feature T2M_MIN: 0.0027053454561875904
Importance of feature T2M_RANGE: 0.13774979316465383
Importance of feature TS: 0.1488162902104497
Importance of feature WS10M: 0.00018410829219831372
Importance of feature WS10M_MAX: 0.000622954668455873
Importance of feature WS10M_MIN: 0.0
Importance of feature WS10M_RANGE: 0.009524886096630906
Importance of feature WS50M: 5.3818632006306046e-05
Importance of feature WS50M_MAX: 0.00036258947510059533
Importance of feature WS50M_MIN: 0.0
Importance of feature WS50M_RANGE: 0.0
