In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [6]:
def transform_data(data) :
    data=data.withColumn("Water", when (data["Water"]=="Low",0).when(data["Water"]=="Medium",1).otherwise(2))
    data=data.withColumn("Atmosphere Color", when (data["Atmosphere Color"]=="Red",0).when(data["Atmosphere Color"]=="Blue",1).otherwise(2))
    
    
    
    cols = data.columns
    cols.remove("Habitable")
    data=VectorAssembler(inputCols=cols, outputCol="Features").transform(data)
    
    scaler = StandardScaler(inputCol="Features", outputCol="Scaler_Features")
    data = scaler.fit(data).transform(data)
    
    return data
    

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
train_data = spark.read.option("inferSchema","true").csv("Classification/Planet_Training.csv", header=True)
test_data = spark.read.option("inferSchema","true").csv("Classification/Planet_Testing.csv", header=True)

In [4]:
train_data = train_data.select("Temperature", "Water", "Atmosphere Color", "Habitable")
test_data = test_data.select("Temperature", "Water", "Atmosphere Color", "Habitable")

In [5]:
train_data = train_data.na.drop()
test_data = test_data.na.drop()

In [7]:
train_data = transform_data(train_data)
test_data = transform_data(test_data)

In [8]:
model = LogisticRegression(featuresCol="Scaler_Features", labelCol="Habitable", maxIter=10).fit(train_data)

In [9]:
prediction = model.transform(test_data)

In [10]:
evaluator = BinaryClassificationEvaluator(labelCol="Habitable")

In [11]:
print("Accuracy : {}".format(evaluator.evaluate(prediction)))

Accuracy : 0.9171043337232417
