In [1]:
import findspark
# Initialise findspark with the given Spark path before pyspark is imported
# in the next cell.
findspark.init("/opt/manual/spark/")

In [2]:
from pyspark.sql import SparkSession, functions as F
import pandas as pd

In [3]:
# Create (or reuse) the Spark session for this notebook: runs on YARN with
# Hive support and two shuffle partitions.
spark = (
    SparkSession.builder
    .appName("clustering")
    .master("yarn")
    .config("spark.sql.shuffle.partitions", "2")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Read data: load the churn CSV, taking column names from the header row and
# letting Spark infer the column types.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/user/train/datasets/Churn_Modelling.csv")
)

In [5]:
# Preview the first five rows; limit() keeps the pandas conversion small.
df.limit(5).toPandas()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Schema check: print column names and the types produced by the CSV reader
# (inferSchema is enabled on load).
df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [7]:
# Mark df for caching; it is reused by the row count and the per-column null
# checks in the cells that follow.
df.persist()

DataFrame[RowNumber: int, CustomerId: int, Surname: string, CreditScore: int, Geography: string, Gender: string, Age: int, Tenure: int, Balance: double, NumOfProducts: int, HasCrCard: int, IsActiveMember: int, EstimatedSalary: double, Exited: int]

In [8]:
# Total row count; kept in df_count as the denominator for the null-percentage
# report below.
df_count = df.count()
print(df_count)

10000


In [9]:
# Number of columns in the raw DataFrame.
len(df.columns)

14

In [10]:
# Null check: for every column, count the rows that are NULL or equal to the
# empty string and report only the columns that have at least one such row.
for col_name in df.dtypes:
    column = F.col(col_name[0])
    is_missing = column.isNull() | (column == "")
    null_count = df.where(is_missing).count()

    if null_count <= 0:
        continue

    print(f"{col_name[0]} {col_name[1]} type null values: {null_count} % {null_count/df_count * 100}")

In [10]:
# Grouping by attribute types

# label_col is read by the dtype-split loop and by the label-distribution cell
# further down, but was never defined anywhere in the notebook, so a fresh
# kernel failed with NameError. Those cells use it as a one-element list whose
# first item is the "Exited" column.
label_col = ["Exited"]

# Filled by the dtype-split loop in the next cell.
cat_cols = []
num_cols = []

# Columns kept out of the feature set (identifiers, free text and the label).
discarted_cols = ["RowNumber", "CustomerId", "Surname","Exited"]

In [11]:
# Sort every remaining column into the categorical (string-typed) or numeric
# list, skipping the label and the discarded columns.
excluded_cols = label_col + discarted_cols
for name, dtype in df.dtypes:
    if name in excluded_cols:
        continue
    target_list = cat_cols if dtype == "string" else num_cols
    target_list.append(name)

In [12]:
# Categorical (string-typed) feature columns and how many there are.
print(cat_cols)
print(len(cat_cols))

['Geography', 'Gender']
2


In [13]:
# Numeric (non-string) feature columns and how many there are.
print(num_cols)
print(len(num_cols))

['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
8


In [14]:
# Trim: strip leading/trailing whitespace from every categorical column.
#
# The trims are chained on one accumulating DataFrame. Restarting from `df` on
# every iteration would keep only the last column's trim in df1, and would
# leave df1 undefined when cat_cols is empty.
df1 = df
for col_name in cat_cols:
    df1 = df1.withColumn(col_name, F.trim(F.col(col_name)))

In [15]:
# Row count per value of the label column.
label_name = label_col[0]
label_counts = df1.select(label_name).groupBy(label_name).count()
label_counts.show()

+------+-----+
|Exited|count|
+------+-----+
|     1| 2037|
|     0| 7963|
+------+-----+



In [16]:
# Examine categoricals: value counts for each categorical column.
#
# show() prints the table itself and returns None, so it is not wrapped in
# print(), which would emit a stray "None" line after every table.
for col_name in cat_cols:
    df1.select(col_name).groupBy(col_name).count().show()

+---------+-----+
|Geography|count|
+---------+-----+
|   France| 5014|
|  Germany| 2509|
|    Spain| 2477|
+---------+-----+

None
+------+-----+
|Gender|count|
+------+-----+
|Female| 4543|
|  Male| 5457|
+------+-----+

None


In [18]:
# Verify columns: every column of df1 should be accounted for exactly once
# across the categorical, numeric and discarded lists.

len(df1.columns) == (len(cat_cols) + len(num_cols) + len(discarted_cols))

True

In [19]:
# Stage order: StringIndexer > OneHotEncoder > VectorAssembler > Scaler >
# Estimator > Transformer > Evaluator

# Select the columns to one-hot encode: categorical columns with more than two
# distinct values. The remaining categorical columns are only string-indexed.
to_be_oheded = [
    col_name
    for col_name in cat_cols
    if df1.select(col_name).distinct().count() > 2
]
        

In [20]:
# Categorical columns selected for one-hot encoding (more than two distinct values).
print(to_be_oheded)

['Geography']


In [21]:
# StringIndexer

from pyspark.ml.feature import StringIndexer

In [22]:
# Build one StringIndexer per categorical column and record the column names
# that the later pipeline stages need:
#   my_dict                      "<col>_indexedobj" -> its StringIndexer
#   string_indexer_obj           the indexers, in cat_cols order
#   string_indexer_output_names  "<col>_indexed" for every categorical column
#   ohe_input_names / ohe_output_names
#                                "<col>_indexed" / "<col>_ohe" for the columns
#                                listed in to_be_oheded
my_dict = {}
string_indexer_obj = []
string_indexer_output_names = []
ohe_input_names = []
ohe_output_names = []

for col_name in cat_cols:
    indexed_name = col_name + "_indexed"

    # handleInvalid="skip" makes the fitted indexer drop rows with invalid values.
    indexer = StringIndexer(
        inputCol=col_name,
        outputCol=indexed_name,
        handleInvalid="skip",
    )

    my_dict[col_name + "_indexedobj"] = indexer
    string_indexer_obj.append(indexer)
    string_indexer_output_names.append(indexed_name)

    if col_name not in to_be_oheded:
        continue

    ohe_input_names.append(indexed_name)
    ohe_output_names.append(col_name + "_ohe")

In [23]:
# Inspect the indexer stages and the column names collected for the
# one-hot-encoding and assembling steps.
print(string_indexer_obj)
print(string_indexer_output_names)
print(ohe_input_names)
print(ohe_output_names)

[StringIndexer_2cfa2fc4b460, StringIndexer_21a9e4bddf12]
['Geography_indexed', 'Gender_indexed']
['Geography_indexed']
['Geography_ohe']


In [24]:
#onehotencodied
from pyspark.ml.feature import OneHotEncoder

In [25]:
# Indexed columns that are NOT one-hot encoded; they go into the feature vector
# as plain numeric indices.
#
# Built with an order-preserving filter instead of a set difference: iteration
# order of a set of strings is arbitrary and can change between interpreter
# runs, which would shuffle the positions of these columns in the assembled
# feature vector as soon as more than one column qualifies.
string_indexer_ohe_excluded = [
    name for name in string_indexer_output_names if name not in ohe_input_names
]
print(string_indexer_ohe_excluded)

['Gender_indexed']


In [26]:
# One-hot encode the indexed columns selected above.
encoder = OneHotEncoder(inputCols=ohe_input_names, outputCols=ohe_output_names)

In [27]:
#vector assembler
from pyspark.ml.feature import VectorAssembler

In [28]:
# Feature vector layout: numeric columns, then indexed-only categoricals, then
# the one-hot encoded categoricals.
assembler_inputs = num_cols + string_indexer_ohe_excluded + ohe_output_names
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="unscaled_features")

In [29]:
#Scaler
from pyspark.ml.feature import StandardScaler

In [30]:
# Scale the assembled vector into the "features" column.
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features")

In [31]:
#pipeline
from pyspark.ml import Pipeline

In [32]:
# Preprocessing pipeline: all string indexers first, then encoder, assembler
# and scaler.
pipeline_stages = [*string_indexer_obj, encoder, assembler, scaler]
pipeline_obj = Pipeline(stages=pipeline_stages)

In [34]:
# Fit every pipeline stage (indexers, encoder, assembler, scaler) on df1.
pipeline_model = pipeline_obj.fit(df1)

In [36]:
# Apply the fitted stages to df1; the result carries the indexed, one-hot,
# "unscaled_features" and "features" columns next to the original ones.
pipeline_df = pipeline_model.transform(df1)

In [37]:
# Preview the first five preprocessed rows.
pipeline_df.limit(5).toPandas()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_indexed,Gender_indexed,Geography_ohe,unscaled_features,features
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,0.0,1.0,"(1.0, 0.0)","[619.0, 42.0, 2.0, 0.0, 1.0, 1.0, 1.0, 101348....","[6.404333924389993, 4.0046505619078925, 0.6915..."
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,2.0,1.0,"(0.0, 0.0)","[608.0, 41.0, 1.0, 83807.86, 1.0, 0.0, 1.0, 11...","[6.2905250824379895, 3.9093017390053237, 0.345..."
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,0.0,1.0,"(1.0, 0.0)","[502.0, 42.0, 8.0, 159660.8, 3.0, 1.0, 0.0, 11...","[5.193821696355051, 4.0046505619078925, 2.7660..."
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0.0,1.0,"(1.0, 0.0)","[699.0, 39.0, 1.0, 0.0, 2.0, 0.0, 0.0, 93826.6...","[7.232034593131834, 3.718604093200186, 0.34576..."
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,2.0,1.0,"(0.0, 0.0)","[850.0, 43.0, 2.0, 125510.82, 1.0, 1.0, 1.0, 7...","[8.794319605382057, 4.099999384810461, 0.69152..."


In [38]:
#Model training, test and evaluate


from pyspark.ml.clustering import KMeans

In [44]:
def compute_kmeans_model(df, k, seed=142, features_col="features"):
    """Fit a KMeans clustering model with ``k`` clusters on ``df``.

    Args:
        df: DataFrame that contains the vector column named by ``features_col``.
        k: Number of clusters to fit.
        seed: Seed passed to KMeans so repeated fits are reproducible. Defaults
            to 142, the value that used to be hard-coded here.
        features_col: Name of the input vector column. Defaults to "features",
            the column KMeans read implicitly before this parameter existed.

    Returns:
        The fitted KMeans model.
    """
    kmeans_obj = KMeans(featuresCol=features_col, k=k, seed=seed)

    return kmeans_obj.fit(df)

In [40]:
#model evaluate
from pyspark.ml.evaluation import ClusteringEvaluator

In [41]:
# Evaluator used to score each clustering below; constructed with its default
# settings.
evaluator = ClusteringEvaluator()

In [47]:
# Sweep k from 2 to 10: fit a model for each k, assign clusters and print the
# evaluator score so the candidates can be compared.
candidate_ks = range(2, 11)
for k in candidate_ks:
    kmeans_model = compute_kmeans_model(pipeline_df, k)
    transformed_df = kmeans_model.transform(pipeline_df)
    score = evaluator.evaluate(transformed_df)
    print(f"k: {k}, score: {score} ")

k: 2, score: 0.24590625242752776 
k: 3, score: 0.22645984369837002 
k: 4, score: 0.21109416946633186 
k: 5, score: 0.2016827064122783 
k: 6, score: 0.18932939099895102 
k: 7, score: 0.1852897706405449 
k: 8, score: 0.1758095043959162 
k: 9, score: 0.1855896305798431 
k: 10, score: 0.167440153785295 


In [48]:
# Refit with k=2, the k with the highest score in the sweep output above.
kmeans_model = compute_kmeans_model(pipeline_df,2)

In [50]:
# Prediction: assign every row to a cluster (adds the "prediction" column).
transformed_df = kmeans_model.transform(pipeline_df)

In [51]:
# Preview the first ten rows together with their cluster assignment.
transformed_df.limit(10).toPandas()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_indexed,Gender_indexed,Geography_ohe,unscaled_features,features,prediction
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,0.0,1.0,"(1.0, 0.0)","[619.0, 42.0, 2.0, 0.0, 1.0, 1.0, 1.0, 101348....","[6.404333924389993, 4.0046505619078925, 0.6915...",1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,2.0,1.0,"(0.0, 0.0)","[608.0, 41.0, 1.0, 83807.86, 1.0, 0.0, 1.0, 11...","[6.2905250824379895, 3.9093017390053237, 0.345...",0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,0.0,1.0,"(1.0, 0.0)","[502.0, 42.0, 8.0, 159660.8, 3.0, 1.0, 0.0, 11...","[5.193821696355051, 4.0046505619078925, 2.7660...",1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0.0,1.0,"(1.0, 0.0)","[699.0, 39.0, 1.0, 0.0, 2.0, 0.0, 0.0, 93826.6...","[7.232034593131834, 3.718604093200186, 0.34576...",1
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,2.0,1.0,"(0.0, 0.0)","[850.0, 43.0, 2.0, 125510.82, 1.0, 1.0, 1.0, 7...","[8.794319605382057, 4.099999384810461, 0.69152...",0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1,2.0,0.0,"(0.0, 0.0)","[645.0, 44.0, 8.0, 113755.78, 2.0, 1.0, 0.0, 1...","[6.673336641731091, 4.19534820771303, 2.766084...",0
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0,0.0,0.0,"(1.0, 0.0)","[822.0, 50.0, 7.0, 0.0, 2.0, 1.0, 1.0, 10062.8...","[8.504624371322414, 4.767441145128443, 2.42032...",1
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1,1.0,1.0,"(0.0, 1.0)","[376.0, 29.0, 4.0, 115046.74, 4.0, 1.0, 0.0, 1...","[3.8901931430866514, 2.765115864174497, 1.3830...",0
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0,0.0,0.0,"(1.0, 0.0)","[501.0, 44.0, 4.0, 142051.07, 2.0, 0.0, 1.0, 7...","[5.183475437995778, 4.19534820771303, 1.383042...",1
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0,0.0,0.0,"(1.0, 0.0)","[684.0, 27.0, 2.0, 134603.88, 1.0, 1.0, 1.0, 7...","[7.076840717742739, 2.5744182183693596, 0.6915...",1


In [52]:
# Stop the Spark session now that the analysis is finished.
spark.stop()