In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [24]:
spark = SparkSession.builder.getOrCreate()

In [29]:
# Load Data
train_df = spark.read.option("inferschema", "true").csv("BigFootTraining.csv", header=True)
test_df = spark.read.option("inferschema", "true").csv("BigFootTesting.csv", header=True)

# Select Features
train_df = train_df.select("fur_color", "weight", "eye_color", "prehistoric")
test_df = test_df.select("fur_color", "weight", "eye_color", "prehistoric")

# Data Preprocessing
train_df = train_df.na.drop()
test_df = test_df.na.drop()

# Data Transform
train_df = train_df.withColumn("fur_color", when(train_df["fur_color"]=="orange",0).
                              when(train_df["fur_color"]=="grey",1).
                              when(train_df["fur_color"]=="calico",2).
                              when(train_df["fur_color"]=="white",3).
                              when(train_df["fur_color"]=="red",4).
                              when(train_df["fur_color"]=="black",5).
                              when(train_df["fur_color"]=="brown",6).
                              when(train_df["fur_color"]=="dark grey",7)
                              )

train_df = train_df.withColumn("eye_color", when(train_df["eye_color"]=="dark brown",0).
                              when(train_df["eye_color"]=="green",1).
                              when(train_df["eye_color"]=="black",2).
                              when(train_df["eye_color"]=="brown",3).
                              when(train_df["eye_color"]=="blue",4)
                              )

train_df = train_df.withColumn("prehistoric", when(train_df["prehistoric"]=="false",0).
                              when(train_df["prehistoric"]=="true",1)
                              )

train_df = train_df.withColumn("weight", regexp_replace(train_df["weight"], " kg", "").cast("int"))

test_df = test_df.withColumn("fur_color", when(test_df["fur_color"]=="orange",0).
                              when(test_df["fur_color"]=="grey",1).
                              when(test_df["fur_color"]=="calico",2).
                              when(test_df["fur_color"]=="white",3).
                              when(test_df["fur_color"]=="red",4).
                              when(test_df["fur_color"]=="black",5).
                              when(test_df["fur_color"]=="brown",6).
                              when(test_df["fur_color"]=="dark grey",7)
                              )

test_df = test_df.withColumn("eye_color", when(test_df["eye_color"]=="dark brown",0).
                              when(test_df["eye_color"]=="green",1).
                              when(test_df["eye_color"]=="black",2).
                              when(test_df["eye_color"]=="brown",3).
                              when(test_df["eye_color"]=="blue",4)
                            )

test_df = test_df.withColumn("prehistoric", when(test_df["prehistoric"]=="false",0).
                              when(test_df["prehistoric"]=="true",1)
                            )

test_df = test_df.withColumn("weight", regexp_replace(test_df["weight"], " kg", "").cast("int"))


# Normalization
col = train_df.columns
col.remove("prehistoric")

train_df = VectorAssembler(inputCols=col, outputCol="features").transform(train_df)
train_df = StandardScaler(inputCol="features", outputCol="scaled_feature").fit(train_df).transform(train_df)

test_df = VectorAssembler(inputCols=col, outputCol="features").transform(test_df)
test_df = StandardScaler(inputCol="features", outputCol="scaled_feature").fit(test_df).transform(test_df)

# Model
model = LogisticRegression(featuresCol="scaled_feature", labelCol="prehistoric", maxIter=10).fit(train_df)

prediction = model.transform(test_df)
prediction.select("prehistoric", "prediction").show()

# Accuracy
accuracy = BinaryClassificationEvaluator(labelCol="prehistoric")
print(accuracy.evaluate(prediction)*100)


+-----------+----------+
|prehistoric|prediction|
+-----------+----------+
|          0|       0.0|
|          1|       1.0|
|          1|       1.0|
|          0|       0.0|
|          1|       1.0|
|          0|       0.0|
|          1|       1.0|
|          0|       0.0|
|          0|       1.0|
|          0|       1.0|
|          0|       0.0|
|          0|       0.0|
|          1|       1.0|
|          0|       0.0|
|          0|       0.0|
|          0|       0.0|
|          0|       1.0|
|          1|       1.0|
|          1|       1.0|
|          1|       1.0|
+-----------+----------+
only showing top 20 rows

89.72011270264959


In [8]:
train_df.groupBy("eye_color").count().show()
train_df.groupBy("fur_color").count().show()

+----------+-----+
| eye_color|count|
+----------+-----+
|dark brown|  912|
|     green| 1648|
|     black| 1918|
|     brown| 1819|
|      blue| 1671|
+----------+-----+

+---------+-----+
|fur_color|count|
+---------+-----+
|   orange|  843|
|     grey| 1063|
|   calico|  907|
|    white| 1061|
|      red|  864|
|    black| 1058|
|    brown| 1071|
|dark grey| 1101|
+---------+-----+



In [11]:
train_df.show()
test_df.show()

+---------+------+---------+-----------+
|fur_color|weight|eye_color|prehistoric|
+---------+------+---------+-----------+
|        1|   122|        1|          1|
|        4|   135|        4|          1|
|        3|    94|        2|          1|
|        1|    69|        3|          0|
|        6|   126|        4|          1|
|        5|   111|        2|          1|
|        6|   104|        1|          0|
|        1|   162|        3|          1|
|        4|   210|        3|          0|
|        4|   105|        2|          0|
|        6|   212|        2|          1|
|        5|   271|        2|          1|
|        6|   238|        4|          1|
|        4|   280|        3|          1|
|        5|    80|        4|          1|
|        0|    79|        4|          1|
|        6|   171|        1|          1|
|        7|   216|        1|          0|
|        6|   205|        2|          1|
|        7|   179|        3|          1|
+---------+------+---------+-----------+
only showing top