In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the training CSV (first row is the header, column types are inferred),
# keep the three predictor columns plus the "Habitable" label, and drop every
# row that contains a null.
df_train = (
    spark.read.csv("Classification/Planet_Training.csv", header=True, inferSchema=True)
    .select("Temperature", "Water", "Atmosphere Color", "Habitable")
    .na.drop()
)

In [4]:
# Load the testing CSV the same way as the training data: header row, inferred
# column types, the same four columns, and no rows containing nulls.
df_test = (
    spark.read.csv("Classification/Planet_Testing.csv", header=True, inferSchema=True)
    .select("Temperature", "Water", "Atmosphere Color", "Habitable")
    .na.drop()
)

In [7]:
# df_train.show()

In [8]:
def parse(df):

    df = df.withColumn("Water", when(df["Water"] == "Low", 0).
                      when(df["Water"] == "Medium",1).
                      when(df["Water"] == "High",2))
    
    df = df.withColumn("Atmosphere Color", when(df["Atmosphere Color"] == "Red", 0).
                      when(df["Atmosphere Color"] == "Blue",1).
                      when(df["Atmosphere Color"] == "Yellow",2))
    cols = df.columns
    cols.remove("Habitable")
    df = VectorAssembler(inputCols = cols, outputCol = "Features").transform(df)
    
    scaler = StandardScaler(inputCol = "Features", outputCol = "Scaled_Features")
    df = scaler.fit(df).transform(df)
    
    return df

In [9]:
# df_train.show()
# df_test.show()

In [10]:
df_train = parse(df_train)
df_test = parse(df_test)

In [11]:
# Train a logistic-regression classifier on the scaled training features,
# with "Habitable" as the label and at most 10 optimizer iterations.
model = LogisticRegression(
    featuresCol="Scaled_Features",
    labelCol="Habitable",
    maxIter=10,
).fit(df_train)

# Score the held-out test set with the fitted model.
prediction = model.transform(df_test)

In [12]:
# BinaryClassificationEvaluator's default metric is areaUnderROC, not
# accuracy. The metric is named explicitly so the printed label matches the
# value that is actually computed.
evaluator = BinaryClassificationEvaluator(labelCol="Habitable", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(prediction) * 100
print("Area under ROC : {}%".format(area_under_roc))

Accuracy : 91.71043337232418%
