# Use PySpark to view the dataset

In [2]:
# Environment setup: Spark runs on the JVM, so a Java runtime is installed
# next to the Python package. Quiet flags keep the install log out of the
# notebook output.
!apt-get -qq -y install openjdk-8-jre-headless
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO(review): pin the pyspark version once the target version is agreed.
%pip install -q pyspark

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  libnss-mdns fonts-dejavu-extra fonts-ipafont-gothic fonts-ipafont-mincho
  fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  openjdk-8-jre-headless
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 28.2 MB of archives.
After this operation, 104 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre-headless amd64 8u292-b10-0ubuntu1~18.04 [28.2 MB]
Fetched 28.2 MB in 2s (16.2 MB/s)
Selecting previously unselected package openjdk-8-jre-headless:amd64.
(Reading database ... 160772 files and directories currently installed.)
Preparing to unpack .../openjdk-8-jre-headless_8u292-b10-0ubuntu1~18.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u292-b10-0ubuntu1~18.04) ...
Setting up openjdk-8-jre-headless:amd64 (8u292-b10-0ubunt

In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every later cell.
session_builder = SparkSession.builder.appName('Churn_Modelling')
spark = session_builder.getOrCreate()

# Read the public CSV with a header row and let Spark infer column types,
# then print the inferred schema.
df = spark.read.options(header=True, inferSchema=True).csv('public.csv')
df.printSchema()

root
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



# Training Part

In [4]:
# Spark ML building blocks used for feature preparation and training.
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
# scikit-learn / NumPy are used only to score the collected predictions.
from sklearn import metrics
import numpy as np


In [5]:
# Keep the customer id, the model inputs and the 'Exited' label;
# 'Surname' and 'Geography' are not selected.
customer_df = df.select([
    'CustomerId',
    'CreditScore',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    'Exited',
])

In [6]:
# Turn the string 'Gender' column into a numeric index, then into a one-hot
# vector ('GenderVec') that keeps every category (dropLast=False).
sI1 = StringIndexer(inputCol='Gender', outputCol='GenderIndex')
en1 = OneHotEncoder(inputCol='GenderIndex', outputCol='GenderVec', dropLast=False)

# Fit both stages on customer_df and append the encoded columns to it.
gender_pipeline = Pipeline(stages=[sI1, en1])
gender_pipeline_model = gender_pipeline.fit(customer_df)
customer_final_df = gender_pipeline_model.transform(customer_df)

In [7]:
# Fixed seed so the train/test split and the random forest are repeatable
# from run to run (given the same input data and partitioning).
SEED = 42

# Columns packed into the single 'features' vector the classifier consumes.
required_features = ['CreditScore', 'GenderVec', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember','EstimatedSalary']
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(customer_final_df)

# 80/20 split into training and hold-out data.
(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2], seed=SEED)

# Random forest predicting the 'Exited' label from the assembled features.
rf = RandomForestClassifier(labelCol='Exited', featuresCol='features', maxDepth=5, seed=SEED)
model = rf.fit(training_data)

# Evaluation Part

In [8]:

# Score the hold-out split once and collect label and prediction together,
# so the two arrays are paired row by row instead of relying on two separate
# Spark jobs returning rows in the same order.
scored_holdout = model.transform(test_data)
label_and_prediction = np.array(
    scored_holdout.select('Exited', col('prediction').cast('int')).collect()
).reshape(-1, 2)

# Column vectors of shape (n, 1): true labels and predicted labels.
data_array = label_and_prediction[:, :1]
predict_array = label_and_prediction[:, 1:]

# Submission-style frame: customer id plus the predicted 'Exited' flag.
predict = scored_holdout.select(col('CustomerId').cast('int').alias('CustomerId'),
          col('prediction').cast('int').alias('Exited'))

# Micro-averaged F1; for a single-label problem this is the same as accuracy.
metrics.f1_score(data_array, predict_array, average='micro')

0.839924670433145

## Load the private dataset (same structure as the public dataset)

In [9]:
# NOTE(review): the heading above refers to a private dataset, but the path
# read here is 'public.csv' — presumably a placeholder; confirm before
# trusting the final score, since these rows include the training data.
df_private = spark.read.options(header=True, inferSchema=True).csv('public.csv')

In [10]:
# Same column subset as the training frame.
customer_test_df = df_private.select(
    'CustomerId',
    'CreditScore',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    'Exited',
)

# Fit the Gender encoders on the training-side frame (customer_df), not on
# the data being scored. StringIndexer assigns indices from the data it is
# fitted on, so re-fitting on a different dataset could swap or resize the
# 'GenderVec' slots relative to what the model was trained with.
sI1 = StringIndexer(inputCol='Gender', outputCol='GenderIndex')
en1 = OneHotEncoder(dropLast=False, inputCol='GenderIndex', outputCol='GenderVec')
gender_encoder_model = Pipeline(stages=[sI1, en1]).fit(customer_df)

# This rebinds customer_final_df to the encoded private data; the scoring
# cell below reads its 'Exited' column as the ground truth.
customer_final_df = gender_encoder_model.transform(customer_test_df)

# Assemble the feature vector with the same columns, in the same order, as in training.
required_features = ['CreditScore', 'GenderVec', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember','EstimatedSalary']
assembler_test = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_test_data = assembler_test.transform(customer_final_df)

# Predict with the trained model and keep only the id and the predicted flag.
predict_test = model.transform(transformed_test_data)
predict_test = predict_test.select(col('CustomerId').cast('int').alias('CustomerId'), col('prediction').cast('int').alias('Exited'))

## Use the following code to get your prediction result (F1 score)

In [11]:

# Ground-truth labels come from the encoded private frame, predictions from
# predict_test. NOTE(review): the two are paired by row position across two
# separate collects — confirm both frames keep the same row order.
actual_rows = customer_final_df.select('Exited').collect()
predicted_rows = predict_test.select('Exited').collect()

data_array = np.array(actual_rows)
predict_array = np.array(predicted_rows)

# Micro-averaged F1 of predictions against the true labels.
metrics.f1_score(y_true=data_array, y_pred=predict_array, average='micro')

0.8576250000000001