# Dataset Overview

In [None]:
import pandas as pd

# Read the public CSV into a pandas DataFrame for a quick visual overview.
# The bare name on the last line makes the notebook render the frame.
df = pd.read_csv(filepath_or_buffer='public.csv')
df

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15565701,Ferri,698,Spain,Female,39,9,161993.89,1,0,0,90212.38,0
1,15565706,Akobundu,612,Spain,Male,35,1,0.00,1,1,1,83256.26,1
2,15565796,Docherty,745,Germany,Male,48,10,96048.55,1,1,0,74510.65,0
3,15565806,Toosey,532,France,Male,38,9,0.00,2,0,0,30583.95,0
4,15565878,Bates,631,Spain,Male,29,3,0.00,2,1,1,197963.46,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,15815628,Moysey,711,France,Female,37,8,113899.92,1,0,0,80215.20,0
7996,15815645,Akhtar,481,France,Male,37,8,152303.66,2,1,1,175082.20,0
7997,15815656,Hopkins,541,Germany,Female,39,9,100116.67,1,1,1,199808.10,1
7998,15815660,Mazzi,758,France,Female,34,1,154139.45,1,1,1,60728.89,0


# Use PySpark to view the dataset

In [None]:
# This setup section was written for Windows. On Ubuntu, remember to adjust
# how pyspark is imported; the findspark lines are left commented out as an option.
# ----
# import findspark
# findspark.init()
# findspark.find()
import pyspark
# ----
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Churn_Modelling")
    .getOrCreate()
)

# Load the public dataset: the first row is the header and column types are inferred.
df = spark.read.csv(
    'public.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



# Do your work here

In [None]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression

In [None]:
# Combine the individual feature columns into a single vector column, 'Features'.
# The assembler is applied right away, so `df_feature` only ever refers to the
# resulting DataFrame.
df_feature = VectorAssembler(
    inputCols=[
        'CreditScore', 'Age', 'Tenure', 'Balance',
        'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    ],
    outputCol='Features',
).transform(df)
df_feature.show()

+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+
|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|            Features|
+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+
|  15565701|   Ferri|        698|    Spain|Female| 39|     9|161993.89|            1|        0|             0|       90212.38|     0|[698.0,39.0,9.0,1...|
|  15565706|Akobundu|        612|    Spain|  Male| 35|     1|      0.0|            1|        1|             1|       83256.26|     1|[612.0,35.0,1.0,0...|
|  15565796|Docherty|        745|  Germany|  Male| 48|    10| 96048.55|            1|        1|             0|       74510.65|     0|[745.0,48.0,10.0,...|
|  15565806|  Toosey|        532|   France|  Male| 38|     9|      0.0

In [None]:
# Normalize the assembled feature vector with min-max scaling.
# Keep only the id, the feature vector and the label for the modelling steps.
temp_select = df_feature.select('CustomerId', 'Features', 'Exited')

nor = MinMaxScaler(inputCol='Features', outputCol='Features_nor')

# Fit the scaler on the selected rows, then add the scaled 'Features_nor' column.
nor_model = nor.fit(temp_select)
data_nor = nor_model.transform(temp_select)
data_nor.show()

+----------+--------------------+------+--------------------+
|CustomerId|            Features|Exited|        Features_nor|
+----------+--------------------+------+--------------------+
|  15565701|[698.0,39.0,9.0,1...|     0|[0.69600000000000...|
|  15565706|[612.0,35.0,1.0,0...|     1|[0.524,0.22972972...|
|  15565796|[745.0,48.0,10.0,...|     0|[0.79,0.405405405...|
|  15565806|[532.0,38.0,9.0,0...|     0|[0.364,0.27027027...|
|  15565878|[631.0,29.0,3.0,0...|     0|[0.562,0.14864864...|
|  15565879|[845.0,28.0,9.0,0...|     0|[0.99,0.135135135...|
|  15565996|[653.0,44.0,8.0,0...|     0|[0.606,0.35135135...|
|  15566030|[497.0,41.0,5.0,8...|     1|[0.294,0.31081081...|
|  15566091|[545.0,32.0,4.0,0...|     0|[0.39,0.189189189...|
|  15566111|[596.0,39.0,9.0,0...|     0|[0.492,0.28378378...|
|  15566139|[526.0,37.0,5.0,5...|     0|[0.352,0.25675675...|
|  15566251|[618.0,37.0,5.0,9...|     1|[0.536,0.25675675...|
|  15566253|[580.0,44.0,9.0,1...|     1|[0.46,0.351351351...|
|  15566

In [None]:
# Configure the multilayer perceptron classifier (an estimator; it is not fitted here).
from pyspark.ml.classification import MultilayerPerceptronClassifier

train = MultilayerPerceptronClassifier(
    featuresCol='Features_nor',
    labelCol='Exited',
    maxIter=100,
    # Layer sizes: 8 inputs (one per assembled feature), three hidden layers,
    # and 2 outputs (one per 'Exited' class).
    layers=[8, 28, 66, 88, 2],
    blockSize=128,
    seed=1200,
)

# Evaluation Part

## Load the private dataset (same structure as the public dataset)

In [None]:
# Private evaluation data. The TA uses the public dataset here as an example.
df_private = spark.read.csv(
    'public.csv',
    header=True,
    inferSchema=True,
)

## Do prediction with your PySpark model here

In [None]:
# Train the classifier on the normalized public data, then predict on the
# private dataset. The original cell scored `data_nor` (the training frame), so
# `df_private` never reached the model; here the private rows go through the
# same assemble -> min-max scale steps before being scored.
feature_cols = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]

# Fit the multilayer perceptron on the training data.
mlp_model = train.fit(data_nor)

# Build the private feature vector from the same columns, in the same order,
# as the training features.
private_features = VectorAssembler(
    inputCols=feature_cols,
    outputCol='Features',
).transform(df_private)

# Fit the scaler on the training features only, so the private rows are scaled
# with the training min/max rather than their own. Only 'Features' is selected
# so the scaler's output column name does not collide with the existing
# 'Features_nor' column in `data_nor`.
scaler_model = MinMaxScaler(
    inputCol='Features',
    outputCol='Features_nor',
).fit(data_nor.select('Features'))
private_nor = scaler_model.transform(private_features)

# transform() adds the 'prediction' column and does not need the label column.
private_pred = mlp_model.transform(private_nor)
model_pre = private_pred.select('CustomerId', 'prediction')
model_pre.show()

# Names from the original cell, kept in case later cells reference them.
temp1, temp2 = mlp_model, private_pred

+----------+----------+
|CustomerId|prediction|
+----------+----------+
|  15565701|       0.0|
|  15565706|       0.0|
|  15565796|       1.0|
|  15565806|       0.0|
|  15565878|       0.0|
|  15565879|       0.0|
|  15565996|       0.0|
|  15566030|       0.0|
|  15566091|       0.0|
|  15566111|       0.0|
|  15566139|       0.0|
|  15566251|       0.0|
|  15566253|       0.0|
|  15566269|       0.0|
|  15566295|       0.0|
|  15566312|       0.0|
|  15566378|       0.0|
|  15566380|       0.0|
|  15566467|       0.0|
|  15566494|       0.0|
+----------+----------+
only showing top 20 rows



## Print your result in the following format

In [None]:
# Example of the expected layout: one row per customer id with its 'Exited' value.
df_private.select(['CustomerId', 'Exited']).show(n=5)

+----------+------+
|CustomerId|Exited|
+----------+------+
|  15565701|     0|
|  15565706|     1|
|  15565796|     0|
|  15565806|     0|
|  15565878|     0|
+----------+------+
only showing top 5 rows



## The TA will use the following code to score your predictions (F1 score)

In [None]:
# Compute the F1 score, i.e. how far the predictions differ from the true labels.
from sklearn import metrics
import numpy as np

# Pull both columns to the driver as NumPy arrays.
# NOTE: the two arrays are compared position by position, so this relies on
# `df_private` and `model_pre` returning their rows in the same order.
data_arr = np.array(df_private.select('Exited').collect())
data_pre = np.array(model_pre.select(['prediction']).collect())

metrics.f1_score(y_true=data_arr, y_pred=data_pre, average='micro')

0.850875