In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf /content/spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
import os
# Export the install locations of the JDK and the unpacked Spark distribution.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.4.5-bin-hadoop2.7",
)

In [None]:
import findspark
# Initialise findspark before any pyspark import.
# NOTE(review): presumably this uses SPARK_HOME to put pyspark on sys.path — confirm against the findspark docs.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
def transform_data(data):
    """Encode the categorical columns and append assembled and scaled feature vectors.

    Encoding applied, column by column:
      * "Education Level": "Low" -> 0, "Intermediate" -> 1, anything else -> 2
      * "Married":         "Yes" -> 0, anything else -> 1
      * "Depressed":       "Yes" -> 0, anything else -> 1

    Every column except "Depressed" is packed into a "Features" vector, and a
    StandardScaler fitted on that same frame writes "Scaler_Features".

    Args:
        data: Spark DataFrame containing "Education Level", "Married" and
            "Depressed"; all remaining columns are used as features.

    Returns:
        The encoded DataFrame with "Features" and "Scaler_Features" added.
    """
    education = data["Education Level"]
    education_code = (
        when(education == "Low", 0)
        .when(education == "Intermediate", 1)
        .otherwise(2)
    )
    encoded = data.withColumn("Education Level", education_code)

    married_code = when(encoded["Married"] == "Yes", 0).otherwise(1)
    encoded = encoded.withColumn("Married", married_code)

    depressed_code = when(encoded["Depressed"] == "Yes", 0).otherwise(1)
    encoded = encoded.withColumn("Depressed", depressed_code)

    # Everything except the label column goes into the feature vector.
    feature_cols = [name for name in encoded.columns if name != "Depressed"]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="Features")
    assembled = assembler.transform(encoded)

    # The scaler is fitted on the frame passed in, so each call uses its own statistics.
    scaler = StandardScaler(inputCol="Features", outputCol="Scaler_Features")
    scaler_model = scaler.fit(assembled)
    return scaler_model.transform(assembled)

In [None]:
# Obtain the SparkSession (existing one if present, otherwise a new one) with default builder settings.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load both CSV files, using the first row as the header and inferring column types.
train_data = spark.read.csv("Classification_Train.csv", header=True, inferSchema="true")
test_data = spark.read.csv("Classification_Test.csv", header=True, inferSchema="true")

In [None]:
# Convert the whole training frame to pandas and print it (pandas truncates the view).
# NOTE(review): toPandas() pulls every row to the driver — fine at this size, costly on large data.
print(train_data.toPandas())

                      Name  Gender  Height  ... Married Salary Income Depressed
0            Sax Tesseyman  Female     174  ...     Yes      85000000        No
1              Niels Greet    Male     165  ...      No      14000000        No
2           Minetta Santry  Female     160  ...      No     148000000       Yes
3           Sherm Gossipin  Female     144  ...      No      50000000       Yes
4         Cathie Blackmuir    Male     168  ...     Yes     101000000        No
...                    ...     ...     ...  ...     ...           ...       ...
9995  Dorotea Sonnenschein  Female     162  ...      No     104000000        No
9996        Nevile Alecock  Female     162  ...      No      51000000       Yes
9997           Arleta Epps  Female     165  ...      No      25000000       Yes
9998      Moyna Leftbridge    Male     165  ...      No      56000000       Yes
9999     Rosemaria Elleray  Female     152  ...     Yes      31000000        No

[10000 rows x 8 columns]


In [None]:
# Keep only the columns used for modelling: three features plus the "Depressed" label.
train_data = train_data.select("Education Level", "Married", "Salary Income", "Depressed")
# The test projection must come from test_data itself; selecting from train_data
# would make the model be evaluated on the very rows it was trained on.
test_data = test_data.select("Education Level", "Married", "Salary Income", "Depressed")

In [None]:
# Discard every row that contains a null in any of the selected columns.
train_data = train_data.dropna()
test_data = test_data.dropna()

In [None]:
# Encode the categorical columns and add the "Features" / "Scaler_Features" vectors to both frames.
# NOTE(review): transform_data fits a new StandardScaler on whichever frame it receives,
# so the test frame is scaled with its own statistics rather than the training ones.
train_data = transform_data(train_data)
test_data = transform_data(test_data)

In [None]:
# Fit a logistic regression on the scaled features, capped at 10 optimiser iterations.
model = LogisticRegression(
    featuresCol="Scaler_Features",
    labelCol="Depressed",
    maxIter=10,
).fit(train_data)

In [None]:
# Score the test frame with the fitted model; the printed result below includes a "prediction" column.
prediction = model.transform(test_data)

In [None]:
# Convert the scored frame to pandas and print it (pandas truncates the view).
# NOTE(review): toPandas() pulls every row to the driver — fine at this size, costly on large data.
print(prediction.toPandas())

      Education Level  ...  prediction
0                   1  ...         1.0
1                   1  ...         0.0
2                   0  ...         0.0
3                   2  ...         0.0
4                   1  ...         1.0
...               ...  ...         ...
9995                1  ...         0.0
9996                1  ...         0.0
9997                2  ...         0.0
9998                0  ...         0.0
9999                0  ...         1.0

[10000 rows x 9 columns]


In [None]:
# Evaluator comparing the model output against the "Depressed" label.
# NOTE(review): no metricName is passed, so the evaluator's default metric applies
# (areaUnderROC per the PySpark docs, not accuracy) — confirm this is the intended metric.
evaluator = BinaryClassificationEvaluator(labelCol="Depressed")

In [None]:
# Label the score with the metric the evaluator actually computes. The evaluator is
# built without a metricName, so it reports its default metric rather than accuracy;
# reading the name from the evaluator keeps the label truthful if that setting changes.
print("{} : {}".format(evaluator.getMetricName(), evaluator.evaluate(prediction)))

Accuracy : 0.8809802485148361
