In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [1]:
# read training and test data from csv files
TRAINING_PATH = "./train.csv"
TESTING_PATH = "./test.csv"

In [4]:

spark = SparkSession.builder.appName("RandomForestExample").getOrCreate()
sc = spark.sparkContext

In [27]:
training_rdd = spark.read.csv(TRAINING_PATH, header=True, inferSchema=True)
testing_rdd = spark.read.csv(TESTING_PATH, header=True, inferSchema=True)

In [10]:
# training_rdd.show(10)
# testing_rdd.show(10)

# print dataset length
print("Training data length: ", training_rdd.count())
print("Testing data length: ", testing_rdd.count())

# print the count of label values in training data
training_rdd.groupBy("label").count().show()
testing_rdd.groupBy("label").count().show()


Training data length:  177041
Testing data length:  58329
+-----+------+
|label| count|
+-----+------+
|    0| 75561|
|    1|101480|
+-----+------+

+-----+-----+
|label|count|
+-----+-----+
|    0|24959|
|    1|33370|
+-----+-----+



In [26]:
# print column data types
training_rdd.printSchema()

root
 |-- URLLength: integer (nullable = true)
 |-- URLSimilarityIndex: double (nullable = true)
 |-- CharContinuationRate: double (nullable = true)
 |-- TLDLegitimateProb: double (nullable = true)
 |-- URLCharProb: double (nullable = true)
 |-- NoOfSubDomain: integer (nullable = true)
 |-- LetterRatioInURL: double (nullable = true)
 |-- DegitRatioInURL: double (nullable = true)
 |-- SpacialCharRatioInURL: double (nullable = true)
 |-- LineOfCode: integer (nullable = true)
 |-- LargestLineLength: integer (nullable = true)
 |-- URLTitleMatchScore: double (nullable = true)
 |-- NoOfURLRedirect: integer (nullable = true)
 |-- NoOfSelfRedirect: integer (nullable = true)
 |-- NoOfPopup: integer (nullable = true)
 |-- NoOfiFrame: integer (nullable = true)
 |-- NoOfImage: integer (nullable = true)
 |-- NoOfCSS: integer (nullable = true)
 |-- NoOfJS: integer (nullable = true)
 |-- NoOfSelfRef: integer (nullable = true)
 |-- NoOfEmptyRef: integer (nullable = true)
 |-- NoOfExternalRef: integer 

In [16]:
# #convert all columns to integers
# for column in training_rdd.columns:
#     training_rdd = training_rdd.withColumn(column, training_rdd[column].cast("int"))
    
# for column in testing_rdd.columns:
#     testing_rdd = testing_rdd.withColumn(column, testing_rdd[column].cast("int"))
    
# training_rdd.printSchema()
# testing_rdd.printSchema()


root
 |-- URLLength: integer (nullable = true)
 |-- URLSimilarityIndex: integer (nullable = true)
 |-- CharContinuationRate: integer (nullable = true)
 |-- TLDLegitimateProb: integer (nullable = true)
 |-- URLCharProb: integer (nullable = true)
 |-- NoOfSubDomain: integer (nullable = true)
 |-- LetterRatioInURL: integer (nullable = true)
 |-- DegitRatioInURL: integer (nullable = true)
 |-- SpacialCharRatioInURL: integer (nullable = true)
 |-- LineOfCode: integer (nullable = true)
 |-- LargestLineLength: integer (nullable = true)
 |-- URLTitleMatchScore: integer (nullable = true)
 |-- NoOfURLRedirect: integer (nullable = true)
 |-- NoOfSelfRedirect: integer (nullable = true)
 |-- NoOfPopup: integer (nullable = true)
 |-- NoOfiFrame: integer (nullable = true)
 |-- NoOfImage: integer (nullable = true)
 |-- NoOfCSS: integer (nullable = true)
 |-- NoOfJS: integer (nullable = true)
 |-- NoOfSelfRef: integer (nullable = true)
 |-- NoOfEmptyRef: integer (nullable = true)
 |-- NoOfExternalRef: 

In [17]:
training_rdd.show(10)

+---------+------------------+--------------------+-----------------+-----------+-------------+----------------+---------------+---------------------+----------+-----------------+------------------+---------------+----------------+---------+----------+---------+-------+------+-----------+------------+---------------+-------+--------+----------+------+------------+--------------+---------------------+------------+---------------+---------------+----------------+----+---+------+----------------+-----+
|URLLength|URLSimilarityIndex|CharContinuationRate|TLDLegitimateProb|URLCharProb|NoOfSubDomain|LetterRatioInURL|DegitRatioInURL|SpacialCharRatioInURL|LineOfCode|LargestLineLength|URLTitleMatchScore|NoOfURLRedirect|NoOfSelfRedirect|NoOfPopup|NoOfiFrame|NoOfImage|NoOfCSS|NoOfJS|NoOfSelfRef|NoOfEmptyRef|NoOfExternalRef|IsHTTPS|HasTitle|HasFavicon|Robots|IsResponsive|HasDescription|HasExternalFormSubmit|HasSocialNet|HasSubmitButton|HasHiddenFields|HasPasswordField|Bank|Pay|Crypto|HasCopyrightIn

In [14]:
# Specify the feature columns, excluding the label column
feature_columns = [col for col in training_rdd.columns if col != 'label']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Assemble features for both train and test datasets
train_df = assembler.transform(training_rdd).select("features", "label")
test_df = assembler.transform(testing_rdd).select("features", "label")


# Create and train the Random Forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
rf_model = rf.fit(train_df)

# Make predictions on the test set
predictions = rf_model.transform(test_df)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.4f}")



IllegalArgumentException: Data type string of column URLLength is not supported.
Data type string of column URLSimilarityIndex is not supported.
Data type string of column CharContinuationRate is not supported.
Data type string of column TLDLegitimateProb is not supported.
Data type string of column URLCharProb is not supported.
Data type string of column NoOfSubDomain is not supported.
Data type string of column LetterRatioInURL is not supported.
Data type string of column DegitRatioInURL is not supported.
Data type string of column SpacialCharRatioInURL is not supported.
Data type string of column LineOfCode is not supported.
Data type string of column LargestLineLength is not supported.
Data type string of column URLTitleMatchScore is not supported.
Data type string of column NoOfURLRedirect is not supported.
Data type string of column NoOfSelfRedirect is not supported.
Data type string of column NoOfPopup is not supported.
Data type string of column NoOfiFrame is not supported.
Data type string of column NoOfImage is not supported.
Data type string of column NoOfCSS is not supported.
Data type string of column NoOfJS is not supported.
Data type string of column NoOfSelfRef is not supported.
Data type string of column NoOfEmptyRef is not supported.
Data type string of column NoOfExternalRef is not supported.
Data type string of column IsHTTPS is not supported.
Data type string of column HasTitle is not supported.
Data type string of column HasFavicon is not supported.
Data type string of column Robots is not supported.
Data type string of column IsResponsive is not supported.
Data type string of column HasDescription is not supported.
Data type string of column HasExternalFormSubmit is not supported.
Data type string of column HasSocialNet is not supported.
Data type string of column HasSubmitButton is not supported.
Data type string of column HasHiddenFields is not supported.
Data type string of column HasPasswordField is not supported.
Data type string of column Bank is not supported.
Data type string of column Pay is not supported.
Data type string of column Crypto is not supported.
Data type string of column HasCopyrightInfo is not supported.