# Spark ML для задачи оттока

Источник данных: [Kaggle](https://www.kaggle.com/sakshigoyal7/credit-card-customers)

Описания полей:

Название | Описание
:--------|:--------
CLIENTNUM | Client number. Unique identifier for the customer holding the account
Attrition_Flag | Internal event (customer activity) variable
Customer_Age | Demographic variable - Customer's Age in Years
Gender | Demographic variable - M=Male, F=Female
Dependent_count | Demographic variable - Number of dependents
Education_Level | Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.)
Marital_Status | Demographic variable - Married, Single, Divorced, Unknown
Income_Category | Demographic variable - Annual Income Category of the account holder (< 40K, 40K - 60K, 60K - 80K, 80K-120K, > 120K, Unknown)
Card_Category | Product Variable - Type of Card (Blue, Silver, Gold, Platinum)
Months_on_book | Period of relationship with bank

In [1]:
import $ivy.`org.apache.spark:spark-sql_2.12:3.0.1`
import $ivy.`org.apache.spark:spark-mllib_2.12:3.0.1`
import $ivy.`sh.almond:almond-spark_2.12:0.10.9`

[32mimport [39m[36m$ivy.$                                      
[39m
[32mimport [39m[36m$ivy.$                                        
[39m
[32mimport [39m[36m$ivy.$                                   [39m

In [2]:
import org.apache.log4j.{Level, Logger}
Logger.getLogger("org").setLevel(Level.OFF)

[32mimport [39m[36morg.apache.log4j.{Level, Logger}
[39m

In [3]:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._

val spark = {
      SparkSession.builder()
        .master("local[*]")
        .getOrCreate()
    }

import spark.implicits._

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties


[32mimport [39m[36morg.apache.spark.sql._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._

[39m
[36mspark[39m: [32mSparkSession[39m = org.apache.spark.sql.SparkSession@6811e244
[32mimport [39m[36mspark.implicits._[39m

## Загрузка и знакомство с данными

In [4]:
val raw = spark
        .read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("BankChurners.csv")

val columns: Array[String] = raw.columns
val columnsLen: Int = columns.length
val colsToDrop: Array[String] = columns.slice(columnsLen - 2, columnsLen) ++ Array("CLIENTNUM")

val df = raw.drop(colsToDrop: _*)

df.show(5, truncate = false)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|Attrition_Flag   |Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+--------

[36mraw[39m: [32mDataFrame[39m = [CLIENTNUM: int, Attrition_Flag: string ... 21 more fields]
[36mcolumns[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"CLIENTNUM"[39m,
  [32m"Attrition_Flag"[39m,
  [32m"Customer_Age"[39m,
  [32m"Gender"[39m,
  [32m"Dependent_count"[39m,
  [32m"Education_Level"[39m,
  [32m"Marital_Status"[39m,
  [32m"Income_Category"[39m,
  [32m"Card_Category"[39m,
  [32m"Months_on_book"[39m,
  [32m"Total_Relationship_Count"[39m,
  [32m"Months_Inactive_12_mon"[39m,
  [32m"Contacts_Count_12_mon"[39m,
  [32m"Credit_Limit"[39m,
  [32m"Total_Revolving_Bal"[39m,
  [32m"Avg_Open_To_Buy"[39m,
  [32m"Total_Amt_Chng_Q4_Q1"[39m,
  [32m"Total_Trans_Amt"[39m,
  [32m"Total_Trans_Ct"[39m,
  [32m"Total_Ct_Chng_Q4_Q1"[39m,
  [32m"Avg_Utilization_Ratio"[39m,
  [32m"Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1"[39m,
  [32m"Naive_Bayes_

### Определяем типы колонок

In [5]:
df.printSchema

root
 |-- Attrition_Flag: string (nullable = true)
 |-- Customer_Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Dependent_count: integer (nullable = true)
 |-- Education_Level: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Income_Category: string (nullable = true)
 |-- Card_Category: string (nullable = true)
 |-- Months_on_book: integer (nullable = true)
 |-- Total_Relationship_Count: integer (nullable = true)
 |-- Months_Inactive_12_mon: integer (nullable = true)
 |-- Contacts_Count_12_mon: integer (nullable = true)
 |-- Credit_Limit: double (nullable = true)
 |-- Total_Revolving_Bal: integer (nullable = true)
 |-- Avg_Open_To_Buy: double (nullable = true)
 |-- Total_Amt_Chng_Q4_Q1: double (nullable = true)
 |-- Total_Trans_Amt: integer (nullable = true)
 |-- Total_Trans_Ct: integer (nullable = true)
 |-- Total_Ct_Chng_Q4_Q1: double (nullable = true)
 |-- Avg_Utilization_Ratio: double (nullable = true)



In [6]:
df.dtypes.groupBy(_._2).mapValues(_.length)

[36mres5[39m: [32mMap[39m[[32mString[39m, [32mInt[39m] = [33mMap[39m(
  [32m"DoubleType"[39m -> [32m5[39m,
  [32m"StringType"[39m -> [32m6[39m,
  [32m"IntegerType"[39m -> [32m9[39m
)

### Проверяем числовые колонки

In [7]:
val numericColumns = df.dtypes.filter(!_._2.equals("StringType")).map(d => d._1)
df.select(numericColumns.map(col): _*).summary().show

+-------+-----------------+------------------+------------------+------------------------+----------------------+---------------------+-----------------+-------------------+-----------------+--------------------+-----------------+-----------------+-------------------+---------------------+
|summary|     Customer_Age|   Dependent_count|    Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|     Credit_Limit|Total_Revolving_Bal|  Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|  Total_Trans_Amt|   Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+-------+-----------------+------------------+------------------+------------------------+----------------------+---------------------+-----------------+-------------------+-----------------+--------------------+-----------------+-----------------+-------------------+---------------------+
|  count|            10127|             10127|             10127|                   10127|                 10127|              

[36mnumericColumns[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"Customer_Age"[39m,
  [32m"Dependent_count"[39m,
  [32m"Months_on_book"[39m,
  [32m"Total_Relationship_Count"[39m,
  [32m"Months_Inactive_12_mon"[39m,
  [32m"Contacts_Count_12_mon"[39m,
  [32m"Credit_Limit"[39m,
  [32m"Total_Revolving_Bal"[39m,
  [32m"Avg_Open_To_Buy"[39m,
  [32m"Total_Amt_Chng_Q4_Q1"[39m,
  [32m"Total_Trans_Amt"[39m,
  [32m"Total_Trans_Ct"[39m,
  [32m"Total_Ct_Chng_Q4_Q1"[39m,
  [32m"Avg_Utilization_Ratio"[39m
)

In [8]:
df.groupBy($"Customer_Age").count().show(100)

+------------+-----+
|Customer_Age|count|
+------------+-----+
|          31|   91|
|          65|  101|
|          53|  387|
|          34|  146|
|          28|   29|
|          26|   78|
|          27|   32|
|          44|  500|
|          47|  479|
|          52|  376|
|          40|  361|
|          57|  223|
|          54|  307|
|          48|  472|
|          64|   43|
|          41|  379|
|          43|  473|
|          37|  260|
|          61|   93|
|          35|  184|
|          59|  157|
|          55|  279|
|          39|  333|
|          49|  495|
|          51|  398|
|          63|   65|
|          50|  452|
|          45|  486|
|          38|  303|
|          73|    1|
|          70|    1|
|          62|   93|
|          29|   56|
|          32|  106|
|          60|  127|
|          56|  262|
|          58|  157|
|          33|  127|
|          68|    2|
|          42|  426|
|          30|   70|
|          66|    2|
|          46|  490|
|          67|    4|
|          36

## Целевая колонка

In [9]:
val dft = df.withColumn("target", when($"Attrition_Flag" === "Existing Customer", 0).otherwise(1))

dft.show(5, truncate = false)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+
|Attrition_Flag   |Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|target|
+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+---------

[36mdft[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 19 more fields]

### Несбалансированное распределение данных

In [10]:
dft.groupBy("target").count.show

+------+-----+
|target|count|
+------+-----+
|     1| 1627|
|     0| 8500|
+------+-----+



### Oversampling

In [11]:
val df1 = dft.filter($"target" === 1)
val df0 = dft.filter($"target" === 0)

val df1count = df1.count
val df0count = df0.count

df0count / df1count

[36mdf1[39m: [32mDataset[39m[[32mRow[39m] = [Attrition_Flag: string, Customer_Age: int ... 19 more fields]
[36mdf0[39m: [32mDataset[39m[[32mRow[39m] = [Attrition_Flag: string, Customer_Age: int ... 19 more fields]
[36mdf1count[39m: [32mLong[39m = [32m1627L[39m
[36mdf0count[39m: [32mLong[39m = [32m8500L[39m
[36mres10_4[39m: [32mLong[39m = [32m5L[39m

In [12]:
val df1Over = df1
        .withColumn("dummy", explode(lit((1 to (df0count / df1count).toInt).toArray)))
        .drop("dummy")

df1Over.show(10, truncate = false)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+
|Attrition_Flag   |Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|target|
+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+---------

[36mdf1Over[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 19 more fields]

In [13]:
val data = df0.unionAll(df1Over)
data.groupBy("target").count.show

+------+-----+
|target|count|
+------+-----+
|     1| 8135|
|     0| 8500|
+------+-----+



[36mdata[39m: [32mDataset[39m[[32mRow[39m] = [Attrition_Flag: string, Customer_Age: int ... 19 more fields]

## Работа с признаками

### Проверяем корреляции числовых признаков

In [14]:
val pairs = numericColumns
        .flatMap(f1 => numericColumns.map(f2 => (f1, f2)))
        .filter { p => !p._1.equals(p._2) }
        .map { p => if (p._1 < p._2) (p._1, p._2) else (p._2, p._1) }
        .distinct

val corr = pairs
        .map { p => (p._1, p._2, data.stat.corr(p._1, p._2)) }
        .filter(_._3 > 0.6)

[36mpairs[39m: [32mArray[39m[([32mString[39m, [32mString[39m)] = [33mArray[39m(
  ([32m"Customer_Age"[39m, [32m"Dependent_count"[39m),
  ([32m"Customer_Age"[39m, [32m"Months_on_book"[39m),
  ([32m"Customer_Age"[39m, [32m"Total_Relationship_Count"[39m),
  ([32m"Customer_Age"[39m, [32m"Months_Inactive_12_mon"[39m),
  ([32m"Contacts_Count_12_mon"[39m, [32m"Customer_Age"[39m),
  ([32m"Credit_Limit"[39m, [32m"Customer_Age"[39m),
  ([32m"Customer_Age"[39m, [32m"Total_Revolving_Bal"[39m),
  ([32m"Avg_Open_To_Buy"[39m, [32m"Customer_Age"[39m),
  ([32m"Customer_Age"[39m, [32m"Total_Amt_Chng_Q4_Q1"[39m),
  ([32m"Customer_Age"[39m, [32m"Total_Trans_Amt"[39m),
  ([32m"Customer_Age"[39m, [32m"Total_Trans_Ct"[39m),
  ([32m"Customer_Age"[39m, [32m"Total_Ct_Chng_Q4_Q1"[39m),
  ([32m"Avg_Utilization_Ratio"[39m, [32m"Customer_Age"[39m),
  ([32m"Dependent_count"[39m, [32m"Months_on_book"[39m),
  ([32m"Dependent_count"[39m, [32m"Total_Re

In [15]:
corr.sortBy(_._3).reverse.foreach { c => println(s"${c._1}\t\t${c._2}\t\t${c._3}") }

Avg_Open_To_Buy		Credit_Limit		0.9952040726156253
Total_Trans_Amt		Total_Trans_Ct		0.8053901681243808
Customer_Age		Months_on_book		0.7805047706891142
Avg_Utilization_Ratio		Total_Revolving_Bal		0.6946855441968229


In [16]:
val numericColumnsFinal = numericColumns.diff(corr.map(_._2))

[36mnumericColumnsFinal[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"Customer_Age"[39m,
  [32m"Dependent_count"[39m,
  [32m"Total_Relationship_Count"[39m,
  [32m"Months_Inactive_12_mon"[39m,
  [32m"Contacts_Count_12_mon"[39m,
  [32m"Avg_Open_To_Buy"[39m,
  [32m"Total_Amt_Chng_Q4_Q1"[39m,
  [32m"Total_Trans_Amt"[39m,
  [32m"Total_Ct_Chng_Q4_Q1"[39m,
  [32m"Avg_Utilization_Ratio"[39m
)

### Категориальные признаки

#### Индексируем строковые колонки

In [17]:
import org.apache.spark.ml.feature.StringIndexer

val stringColumns = data
        .dtypes
        .filter(_._2.equals("StringType"))
        .map(_._1)
        .filter(!_.equals("Attrition_Flag"))

val stringColumnsIndexed = stringColumns.map(_ + "_Indexed")

val indexer = new StringIndexer()
        .setInputCols(stringColumns)
        .setOutputCols(stringColumnsIndexed)

val indexed = indexer.fit(data).transform(data)
indexed.show(5)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+----------------------+-----------------------+--------------+-----------------------+---------------------+
|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|target|Marital_Status_Indexed|Income_Category_Indexed|Gender_Indexed|Education_Level_Indexed|Card_Category_Indexed|
+-----------------+------------+------+---------------+---------------+---------

[32mimport [39m[36morg.apache.spark.ml.feature.StringIndexer

[39m
[36mstringColumns[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"Gender"[39m,
  [32m"Education_Level"[39m,
  [32m"Marital_Status"[39m,
  [32m"Income_Category"[39m,
  [32m"Card_Category"[39m
)
[36mstringColumnsIndexed[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"Gender_Indexed"[39m,
  [32m"Education_Level_Indexed"[39m,
  [32m"Marital_Status_Indexed"[39m,
  [32m"Income_Category_Indexed"[39m,
  [32m"Card_Category_Indexed"[39m
)
[36mindexer[39m: [32mStringIndexer[39m = strIdx_c2e309055038
[36mindexed[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 24 more fields]

#### Кодируем категориальные признаки

In [18]:
import org.apache.spark.ml.feature.OneHotEncoder

val catColumns = stringColumnsIndexed.map(_ + "_Coded")
    
val encoder = new OneHotEncoder()
        .setInputCols(stringColumnsIndexed)
        .setOutputCols(catColumns)

val encoded = encoder.fit(indexed).transform(indexed)
encoded.show(5)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+----------------------+-----------------------+--------------+-----------------------+---------------------+-----------------------------+----------------------------+--------------------+-----------------------------+---------------------------+
|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|target|Marital_Status_Indexed|Income_Category_Indexed|Gend

[32mimport [39m[36morg.apache.spark.ml.feature.OneHotEncoder

[39m
[36mcatColumns[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"Gender_Indexed_Coded"[39m,
  [32m"Education_Level_Indexed_Coded"[39m,
  [32m"Marital_Status_Indexed_Coded"[39m,
  [32m"Income_Category_Indexed_Coded"[39m,
  [32m"Card_Category_Indexed_Coded"[39m
)
[36mencoder[39m: [32mOneHotEncoder[39m = oneHotEncoder_3083310f4ee1
[36mencoded[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 29 more fields]

### Собираем признаки в вектор

In [19]:
import org.apache.spark.ml.feature.VectorAssembler

val featureColumns = numericColumnsFinal ++ catColumns

val assembler = new VectorAssembler()
  .setInputCols(featureColumns)
  .setOutputCol("features")

val assembled = assembler.transform(encoded)
assembled.show(5, truncate = false)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+----------------------+-----------------------+--------------+-----------------------+---------------------+-----------------------------+----------------------------+--------------------+-----------------------------+---------------------------+--------------------------------------------------------------------------------------------------------------------+
|Attrition_Flag   |Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Am

[32mimport [39m[36morg.apache.spark.ml.feature.VectorAssembler

[39m
[36mfeatureColumns[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"Customer_Age"[39m,
  [32m"Dependent_count"[39m,
  [32m"Total_Relationship_Count"[39m,
  [32m"Months_Inactive_12_mon"[39m,
  [32m"Contacts_Count_12_mon"[39m,
  [32m"Avg_Open_To_Buy"[39m,
  [32m"Total_Amt_Chng_Q4_Q1"[39m,
  [32m"Total_Trans_Amt"[39m,
  [32m"Total_Ct_Chng_Q4_Q1"[39m,
  [32m"Avg_Utilization_Ratio"[39m,
  [32m"Gender_Indexed_Coded"[39m,
  [32m"Education_Level_Indexed_Coded"[39m,
  [32m"Marital_Status_Indexed_Coded"[39m,
  [32m"Income_Category_Indexed_Coded"[39m,
  [32m"Card_Category_Indexed_Coded"[39m
)
[36massembler[39m: [32mVectorAssembler[39m = VectorAssembler: uid=vecAssembler_323bd81ae46a, handleInvalid=error, numInputCols=15
[36massembled[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 30 more fields]

### Нормализация

In [20]:
import org.apache.spark.ml.feature.MinMaxScaler

val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")

val scaled = scaler.fit(assembled).transform(assembled)
scaled.show(5, truncate = false)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+----------------------+-----------------------+--------------+-----------------------+---------------------+-----------------------------+----------------------------+--------------------+-----------------------------+---------------------------+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Attrition_Flag   |Customer_Age|Gender|Dependent_count|E




[32mimport [39m[36morg.apache.spark.ml.feature.MinMaxScaler

[39m
[36mscaler[39m: [32mMinMaxScaler[39m = minMaxScal_0583594a8218
[36mscaled[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 31 more fields]

## Feature Selection (отбор признаков)

In [21]:
import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setNumTopFeatures(10)
  .setFeaturesCol("scaledFeatures")
  .setLabelCol("target")
  .setOutputCol("selectedFeatures")

val dataF = selector.fit(scaled).transform(scaled)
dataF.show(5, truncate = false)

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+----------------------+-----------------------+--------------+-----------------------+---------------------+-----------------------------+----------------------------+--------------------+-----------------------------+---------------------------+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------

+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+------+----------------------+-----------------------+--------------+-----------------------+---------------------+-----------------------------+----------------------------+--------------------+-----------------------------+---------------------------+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------

[32mimport [39m[36morg.apache.spark.ml.feature.ChiSqSelector

[39m
[36mselector[39m: [32mChiSqSelector[39m = chiSqSelector_cd73554525dd
[36mdataF[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 32 more fields]

## Моделирование

### Обучающая и тестовая выборки

In [22]:
val tt = dataF.randomSplit(Array(0.7, 0.3))
val training = tt(0)
val test = tt(1)

println(s"training\t${training.count}\ntest\t${test.count}")

training	11732
test	4903


[36mtt[39m: [32mArray[39m[[32mDataset[39m[[32mRow[39m]] = [33mArray[39m(
  [Attrition_Flag: string, Customer_Age: int ... 32 more fields],
  [Attrition_Flag: string, Customer_Age: int ... 32 more fields]
)
[36mtraining[39m: [32mDataset[39m[[32mRow[39m] = [Attrition_Flag: string, Customer_Age: int ... 32 more fields]
[36mtest[39m: [32mDataset[39m[[32mRow[39m] = [Attrition_Flag: string, Customer_Age: int ... 32 more fields]

### Логистическая регрессия

In [23]:
import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()
        .setMaxIter(1000)
        .setRegParam(0.2)
        .setElasticNetParam(0.8)
        .setFeaturesCol("selectedFeatures")
        .setLabelCol("target")

val lrModel = lr.fit(training)

println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

21/01/30 13:14:39 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/01/30 13:14:39 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
21/01/30 13:14:40 INFO OWLQN: Step Size: 0,1463
21/01/30 13:14:40 INFO OWLQN: Val and Grad Norm: 0,692577 (rel: 0,000210) 0,0242979
21/01/30 13:14:40 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:40 INFO OWLQN: Val and Grad Norm: 0,692508 (rel: 9,95e-05) 0,0304871
21/01/30 13:14:40 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:40 INFO OWLQN: Val and Grad Norm: 0,692448 (rel: 8,64e-05) 0,0153372
21/01/30 13:14:40 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:40 INFO OWLQN: Val and Grad Norm: 0,692418 (rel: 4,40e-05) 0,0176589
21/01/30 13:14:40 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:40 INFO OWLQN: Val and Grad Norm: 0,692396 (rel: 3,13e-05) 0,0104927
21/01/30 13:14:40 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:40 INFO OWLQN: Val and Grad Norm: 0,692377 (rel: 2,74e-05) 0,0118180
21

Coefficients: (10,[7],[-1.7920856148299178]) Intercept: 0.25501100216440425


[32mimport [39m[36morg.apache.spark.ml.classification.LogisticRegression

[39m
[36mlr[39m: [32mLogisticRegression[39m = logreg_eaf3838d71b1
[36mlrModel[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mclassification[39m.[32mLogisticRegressionModel[39m = LogisticRegressionModel: uid=logreg_eaf3838d71b1, numClasses=2, numFeatures=10

### Training Summary

In [24]:
val trainingSummary = lrModel.binarySummary
println(s"accuracy: ${trainingSummary.accuracy}")
println(s"areaUnderROC: ${trainingSummary.areaUnderROC}")

accuracy: 0.7006478008864644
areaUnderROC: 0.7480135302596236


[36mtrainingSummary[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mclassification[39m.[32mBinaryLogisticRegressionTrainingSummary[39m = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@88e8e07

### Проверяем модель на тестовой выборке

In [25]:
val predicted = lrModel.transform(test)
predicted.select("target", "rawPrediction", "probability", "prediction").show(10, truncate = false)

+------+------------------------------------------+----------------------------------------+----------+
|target|rawPrediction                             |probability                             |prediction|
+------+------------------------------------------+----------------------------------------+----------+
|0     |[0.05139028093117942,-0.05139028093117942]|[0.5128447434850257,0.48715525651497427]|0.0       |
|0     |[-0.08371579665427481,0.08371579665427481]|[0.47908326536604856,0.5209167346339514]|1.0       |
|0     |[0.07117367086333526,-0.07117367086333526]|[0.5177859101885983,0.48221408981140157]|0.0       |
|0     |[-0.10446422999775526,0.10446422999775526]|[0.47390766650047045,0.5260923334995296]|1.0       |
|0     |[-0.03208597414840475,0.03208597414840475]|[0.4919791945758656,0.5080208054241345] |1.0       |
|0     |[-0.0571771028428463,0.0571771028428463]  |[0.48570961727889,0.5142903827211099]   |1.0       |
|0     |[0.13100636236546503,-0.13100636236546503]|[0.5327048287

[36mpredicted[39m: [32mDataFrame[39m = [Attrition_Flag: string, Customer_Age: int ... 35 more fields]

In [26]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val evaluator = new BinaryClassificationEvaluator().setLabelCol("target")

println(s"areaUnderROC: ${evaluator.evaluate(predicted)}\n")

areaUnderROC: 0.7383373102241036



[32mimport [39m[36morg.apache.spark.ml.evaluation.BinaryClassificationEvaluator

[39m
[36mevaluator[39m: [32mBinaryClassificationEvaluator[39m = BinaryClassificationEvaluator: uid=binEval_8f06c4d8be3e, metricName=areaUnderROC, numBins=1000

#### Confusion Matrix (матрица ошибок)

* True Positive (TP) - label is positive and prediction is also positive
* True Negative (TN) - label is negative and prediction is also negative
* False Positive (FP) - label is negative but prediction is positive
* False Negative (FN) - label is positive but prediction is negative

In [27]:
val tp = predicted.filter(($"target" === 1) and ($"prediction" === 1)).count
val tn = predicted.filter(($"target" === 0) and ($"prediction" === 0)).count
val fp = predicted.filter(($"target" === 0) and ($"prediction" === 1)).count
val fn = predicted.filter(($"target" === 1) and ($"prediction" === 0)).count

println(s"Confusion Matrix:\n$tp\t$fp\n$fn\t$tn\n")

Confusion Matrix:
1166	277
1274	2186



[36mtp[39m: [32mLong[39m = [32m1166L[39m
[36mtn[39m: [32mLong[39m = [32m2186L[39m
[36mfp[39m: [32mLong[39m = [32m277L[39m
[36mfn[39m: [32mLong[39m = [32m1274L[39m

#### Accuracy, Precision, Recall

* Accuracy (доля правильных ответов) = TP + TN / TP + TN + FP + FN
* Precision (точность) = TP / TP + FP
* Recall (полнота) = TP / TP + FN

In [28]:
val accuracy = (tp + tn) / (tp + tn + fp + fn).toDouble
val precision = tp / (tp + fp).toDouble
val recall = tp / (tp + fn).toDouble

println(s"Accuracy = $accuracy")
println(s"Precision = $precision")
println(s"Recall = $recall\n")

Accuracy = 0.6836630634305527
Precision = 0.808038808038808
Recall = 0.4778688524590164



[36maccuracy[39m: [32mDouble[39m = [32m0.6836630634305527[39m
[36mprecision[39m: [32mDouble[39m = [32m0.808038808038808[39m
[36mrecall[39m: [32mDouble[39m = [32m0.4778688524590164[39m

### Настраиваем модель (подбираем гиперпараметры)

In [29]:
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1, 0.5))
  .addGrid(lr.fitIntercept)
  .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
  .build()

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lr)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.7)
  .setParallelism(2)

val model = trainValidationSplit.fit(dataF)

21/01/30 13:14:46 INFO StrongWolfeLineSearch: Line search t: 1.1178981010068636 fval: 0.6319393949326716 rhs: 0.6927715463534929 cdd: 0.0011184221443388625
21/01/30 13:14:46 INFO LBFGS: Step Size: 1,118
21/01/30 13:14:46 INFO LBFGS: Val and Grad Norm: 0,631939 (rel: 0,0878) 0,888493
21/01/30 13:14:46 INFO OWLQN: Step Size: 0,1468
21/01/30 13:14:46 INFO OWLQN: Val and Grad Norm: 0,678266 (rel: 0,0210) 0,304465
21/01/30 13:14:46 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:46 INFO OWLQN: Val and Grad Norm: 0,570451 (rel: 0,159) 0,356071
21/01/30 13:14:46 INFO LBFGS: Step Size: 1,000
21/01/30 13:14:46 INFO LBFGS: Val and Grad Norm: 0,550542 (rel: 0,129) 0,376918
21/01/30 13:14:46 INFO LBFGS: Step Size: 1,000
21/01/30 13:14:46 INFO LBFGS: Val and Grad Norm: 0,521130 (rel: 0,0534) 0,0671982
21/01/30 13:14:46 INFO LBFGS: Step Size: 1,000
21/01/30 13:14:46 INFO LBFGS: Val and Grad Norm: 0,517558 (rel: 0,00686) 0,0505441
21/01/30 13:14:46 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:46 INFO OWL

21/01/30 13:14:48 INFO OWLQN: Step Size: 0,1468
21/01/30 13:14:48 INFO OWLQN: Val and Grad Norm: 0,679448 (rel: 0,0193) 0,291989
21/01/30 13:14:48 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:48 INFO OWLQN: Val and Grad Norm: 0,583195 (rel: 0,142) 0,370425
21/01/30 13:14:48 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:48 INFO OWLQN: Val and Grad Norm: 0,522132 (rel: 0,000174) 0,0234388
21/01/30 13:14:48 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:48 INFO OWLQN: Val and Grad Norm: 0,521823 (rel: 0,000590) 0,0330782
21/01/30 13:14:48 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:48 INFO OWLQN: Val and Grad Norm: 0,570759 (rel: 0,0213) 0,241592
21/01/30 13:14:48 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:48 INFO OWLQN: Val and Grad Norm: 0,521666 (rel: 0,000301) 0,0709613
21/01/30 13:14:48 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:48 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:48 INFO OWLQN: Val and Grad Norm: 0,564566 (rel: 0,0108) 0,170597
21/01/30 13:14:48 INFO OWLQN: Val and Grad 

21/01/30 13:14:50 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:50 INFO OWLQN: Val and Grad Norm: 0,517473 (rel: 1,79e-05) 0,0244084
21/01/30 13:14:50 INFO OWLQN: Step Size: 0,1250
21/01/30 13:14:50 INFO OWLQN: Val and Grad Norm: 0,531625 (rel: 0,000265) 0,0419761
21/01/30 13:14:50 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:50 INFO OWLQN: Val and Grad Norm: 0,517419 (rel: 0,000103) 0,0299235
21/01/30 13:14:50 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:50 INFO OWLQN: Val and Grad Norm: 0,531595 (rel: 5,63e-05) 0,0311413
21/01/30 13:14:50 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:50 INFO OWLQN: Val and Grad Norm: 0,517374 (rel: 8,61e-05) 0,0145936
21/01/30 13:14:50 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:50 INFO OWLQN: Val and Grad Norm: 0,531536 (rel: 0,000111) 0,0827145
21/01/30 13:14:50 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:50 INFO OWLQN: Val and Grad Norm: 0,517342 (rel: 6,21e-05) 0,0245317
21/01/30 13:14:50 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:50 INFO OWLQN

21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 4,75e-08) 0,000632037
21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 INFO OWLQN: Val and Grad Norm: 0,529262 (rel: 9,59e-05) 0,0239066
21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 8,68e-08) 0,000427459
21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 INFO OWLQN: Val and Grad Norm: 0,529243 (rel: 3,61e-05) 0,0201443
21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 3,19e-08) 0,000392308
21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 INFO OWLQN: Val and Grad Norm: 0,529203 (rel: 7,48e-05) 0,0205280
21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 2,86e-08) 0,000331144
21/01/30 13:14:52 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:52 IN

21/01/30 13:14:53 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:53 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 1,25e-09) 6,03246e-05
21/01/30 13:14:53 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:53 INFO OWLQN: Val and Grad Norm: 0,528856 (rel: 6,02e-06) 0,00996669
21/01/30 13:14:54 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:54 INFO OWLQN: Val and Grad Norm: 0,528852 (rel: 8,41e-06) 0,0102706
21/01/30 13:14:54 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:54 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 1,34e-09) 8,25658e-05
21/01/30 13:14:54 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:54 INFO OWLQN: Val and Grad Norm: 0,528840 (rel: 2,27e-05) 0,00425547
21/01/30 13:14:54 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:54 INFO OWLQN: Val and Grad Norm: 0,528834 (rel: 1,11e-05) 0,00308100
21/01/30 13:14:54 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:54 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 5,29e-10) 5,52650e-05
21/01/30 13:14:54 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:54 INFO O

21/01/30 13:14:55 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:55 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 2,23e-10) 4,50955e-05
21/01/30 13:14:55 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:55 INFO OWLQN: Val and Grad Norm: 0,528756 (rel: 1,48e-06) 0,00130050
21/01/30 13:14:55 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:55 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 1,24e-10) 4,52123e-05
21/01/30 13:14:55 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:55 INFO OWLQN: Val and Grad Norm: 0,528755 (rel: 7,06e-07) 0,00317690
21/01/30 13:14:55 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:55 INFO OWLQN: Val and Grad Norm: 0,528755 (rel: 1,36e-06) 0,00107137
21/01/30 13:14:55 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:55 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 5,66e-10) 1,59767e-05
21/01/30 13:14:55 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:55 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 5,80e-10) 4,05858e-05
21/01/30 13:14:55 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:55 

21/01/30 13:14:56 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:56 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 2,13e-07) 0,000829552
21/01/30 13:14:56 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:56 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,20e-07) 0,000427734
21/01/30 13:14:56 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:56 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 4,89e-13) 1,42282e-06
21/01/30 13:14:56 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:56 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,34e-07) 0,000389224
21/01/30 13:14:56 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:56 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,77e-07) 0,000246196
21/01/30 13:14:56 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:56 INFO OWLQN: Val and Grad Norm: 0,516563 (rel: 3,31e-13) 8,79405e-07
21/01/30 13:14:56 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:56 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 4,53e-07) 0,000208645
21/01/30 13:14:56 INFO OWLQN: Step Size: 1,000
21/01/30 13:14:56 IN

21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,542563 (rel: 0,00628) 0,243772
21/01/30 13:14:58 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 7,05e-10) 4,88533e-05
21/01/30 13:14:58 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,537679 (rel: 0,00900) 0,118846
21/01/30 13:14:58 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,536808 (rel: 0,00162) 0,176392
21/01/30 13:14:58 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,69e-09) 8,93991e-05
21/01/30 13:14:58 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,533508 (rel: 0,00615) 0,0853047
21/01/30 13:14:58 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 9,57e-10) 4,54335e-05
21/01/30 13:14:58 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:58 INFO OWLQN: Val and Grad Norm: 0,533117 (rel: 0,000732) 0,1

21/01/30 13:14:59 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 7,16e-10) 5,85962e-05
21/01/30 13:14:59 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:59 INFO OWLQN: Val and Grad Norm: 0,522670 (rel: 3,87e-10) 5,36870e-05
21/01/30 13:14:59 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:59 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 3,26e-10) 4,01663e-05
21/01/30 13:14:59 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:59 INFO OWLQN: Val and Grad Norm: 0,522670 (rel: 3,98e-10) 3,99527e-05
21/01/30 13:14:59 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:59 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 3,49e-10) 6,52621e-05
21/01/30 13:14:59 INFO OWLQN: Step Size: 0,5000
21/01/30 13:14:59 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 5,49e-10) 3,54725e-05
21/01/30 13:14:59 INFO OWLQN: Step Size: 0,2500
21/01/30 13:14:59 INFO OWLQN: Val and Grad Norm: 0,522670 (rel: 4,84e-10) 3,45918e-05
21/01/30 13:15:00 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:00 INFO OWLQN: Val and Grad Norm: 0,528747 (rel

21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,06e-10) 1,70860e-05
21/01/30 13:15:01 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 5,45e-11) 3,75262e-05
21/01/30 13:15:01 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,49e-10) 2,69830e-05
21/01/30 13:15:01 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,28e-10) 3,20719e-05
21/01/30 13:15:01 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,17e-10) 2,16805e-05
21/01/30 13:15:01 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 9,28e-11) 2,43543e-05
21/01/30 13:15:01 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,33e-10) 1,34241e-05
21/01/30 13:15:01 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:01 INFO OWLQN: Val and Grad Norm: 0,528747 (rel

21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 9,81e-12) 4,97189e-06
21/01/30 13:15:02 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 4,30e-08) 0,000806425
21/01/30 13:15:02 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 1,20e-07) 0,000315998
21/01/30 13:15:02 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 3,97e-12) 9,46577e-06
21/01/30 13:15:02 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 2,07e-08) 0,000433058
21/01/30 13:15:02 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 2,60e-08) 0,000552497
21/01/30 13:15:02 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 1,30e-11) 3,48334e-06
21/01/30 13:15:02 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:02 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 2,6

21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:03 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 3,91e-11) 2,47197e-05
21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:03 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 6,02e-12) 2,37845e-06
21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:03 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 7,41e-11) 1,14808e-05
21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:03 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 3,90e-13) 3,91237e-06
21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:03 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 1,68e-11) 1,83382e-05
21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:03 INFO OWLQN: Val and Grad Norm: 0,528747 (rel: 3,08e-12) 1,84004e-06
21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:03 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 3,59e-11) 1,08102e-05
21/01/30 13:15:03 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15

21/01/30 13:15:05 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:05 INFO OWLQN: Val and Grad Norm: 0,535205 (rel: 8,77e-14) 4,56043e-07
21/01/30 13:15:05 INFO OWLQN: Converged because gradient converged
21/01/30 13:15:05 INFO StrongWolfeLineSearch: Line search t: 0.9988968227842492 fval: 0.6376332187453239 rhs: 0.6927729261664859 cdd: 0.0010193163396261292
21/01/30 13:15:05 INFO LBFGS: Step Size: 0,9989
21/01/30 13:15:05 INFO LBFGS: Val and Grad Norm: 0,637633 (rel: 0,0796) 0,806212
21/01/30 13:15:05 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:05 INFO LBFGS: Val and Grad Norm: 0,573583 (rel: 0,100) 0,245556
21/01/30 13:15:05 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:05 INFO LBFGS: Val and Grad Norm: 0,564291 (rel: 0,0162) 0,0441601
21/01/30 13:15:05 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:05 INFO LBFGS: Val and Grad Norm: 0,563594 (rel: 0,00123) 0,0385177
21/01/30 13:15:05 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:05 INFO LBFGS: Val and Grad Norm: 0,562224 (rel: 0,00243) 0,032666

21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,677365 (rel: 0,000172) 0,0273037
21/01/30 13:15:06 INFO OWLQN: Step Size: 0,2500
21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,634420 (rel: 4,07e-05) 0,0273758
21/01/30 13:15:06 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,677300 (rel: 9,55e-05) 0,0212168
21/01/30 13:15:06 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,677247 (rel: 7,84e-05) 0,0416836
21/01/30 13:15:06 INFO OWLQN: Step Size: 0,2500
21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,634173 (rel: 0,000390) 0,0316551
21/01/30 13:15:06 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,677136 (rel: 0,000164) 0,0309018
21/01/30 13:15:06 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,677002 (rel: 0,000197) 0,0210913
21/01/30 13:15:06 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:06 INFO OWLQN: Val and Grad Norm: 0,676907 (rel: 0,000140) 0,0183

21/01/30 13:15:08 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:08 INFO OWLQN: Val and Grad Norm: 0,673465 (rel: 1,97e-05) 0,00478036
21/01/30 13:15:08 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:08 INFO OWLQN: Val and Grad Norm: 0,632469 (rel: 0,000107) 0,0134247
21/01/30 13:15:08 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:08 INFO OWLQN: Val and Grad Norm: 0,673460 (rel: 6,73e-06) 0,00636883
21/01/30 13:15:08 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:08 INFO OWLQN: Val and Grad Norm: 0,632460 (rel: 1,53e-05) 0,0181639
21/01/30 13:15:08 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:08 INFO OWLQN: Val and Grad Norm: 0,673453 (rel: 1,02e-05) 0,00295020
21/01/30 13:15:08 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:08 INFO OWLQN: Val and Grad Norm: 0,673451 (rel: 2,98e-06) 0,00722851
21/01/30 13:15:08 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:08 INFO OWLQN: Val and Grad Norm: 0,632408 (rel: 8,17e-05) 0,00827689
21/01/30 13:15:08 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:08 INFO OW

21/01/30 13:15:09 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:09 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 1,70e-08) 0,000184502
21/01/30 13:15:09 INFO OWLQN: Step Size: 0,2500
21/01/30 13:15:09 INFO OWLQN: Val and Grad Norm: 0,632340 (rel: 2,32e-08) 0,000429096
21/01/30 13:15:09 INFO OWLQN: Step Size: 0,2500
21/01/30 13:15:09 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 6,80e-09) 9,05748e-05
21/01/30 13:15:09 INFO OWLQN: Step Size: 0,2500
21/01/30 13:15:09 INFO OWLQN: Val and Grad Norm: 0,632340 (rel: 1,52e-08) 0,000329998
21/01/30 13:15:09 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:09 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 7,51e-09) 0,000260978
21/01/30 13:15:09 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:09 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 4,32e-09) 0,000207345
21/01/30 13:15:09 INFO OWLQN: Step Size: 0,2500
21/01/30 13:15:09 INFO OWLQN: Val and Grad Norm: 0,632340 (rel: 2,37e-08) 0,000385187
21/01/30 13:15:09 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:0

21/01/30 13:15:10 INFO OWLQN: Val and Grad Norm: 0,632340 (rel: 8,93e-10) 2,83339e-05
21/01/30 13:15:10 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:10 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 4,00e-12) 2,65847e-06
21/01/30 13:15:10 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:10 INFO OWLQN: Val and Grad Norm: 0,632340 (rel: 4,05e-10) 7,40535e-05
21/01/30 13:15:10 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:10 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 3,58e-12) 5,87186e-06
21/01/30 13:15:10 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:10 INFO OWLQN: Val and Grad Norm: 0,632340 (rel: 8,38e-10) 2,54618e-05
21/01/30 13:15:10 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:10 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 5,82e-12) 2,11516e-06
21/01/30 13:15:11 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:11 INFO OWLQN: Val and Grad Norm: 0,632340 (rel: 1,17e-10) 1,95643e-05
21/01/30 13:15:11 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:11 INFO OWLQN: Val and Grad Norm: 0,673427 (rel: 4,

21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFGS: Val and Grad Norm: 0,562803 (rel: 0,00318) 0,0318279
21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFGS: Val and Grad Norm: 0,561014 (rel: 0,00318) 0,00687583
21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFGS: Val and Grad Norm: 0,560923 (rel: 0,000161) 0,000814142
21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFGS: Val and Grad Norm: 0,560923 (rel: 9,87e-07) 0,000456640
21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFGS: Val and Grad Norm: 0,560923 (rel: 3,84e-07) 7,20345e-05
21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFGS: Val and Grad Norm: 0,560923 (rel: 1,55e-08) 3,01901e-05
21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFGS: Val and Grad Norm: 0,560923 (rel: 3,91e-09) 3,56014e-06
21/01/30 13:15:12 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:12 INFO LBFG

21/01/30 13:15:13 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:13 INFO OWLQN: Val and Grad Norm: 0,676542 (rel: 1,71e-07) 0,000611829
21/01/30 13:15:13 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:13 INFO OWLQN: Val and Grad Norm: 0,636078 (rel: 3,83e-08) 0,000226578
21/01/30 13:15:13 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:13 INFO OWLQN: Val and Grad Norm: 0,676542 (rel: 1,93e-07) 0,000749336
21/01/30 13:15:13 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:13 INFO OWLQN: Val and Grad Norm: 0,636078 (rel: 1,94e-08) 0,000171183
21/01/30 13:15:13 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:13 INFO OWLQN: Val and Grad Norm: 0,676542 (rel: 2,60e-07) 0,000543948
21/01/30 13:15:13 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:13 INFO OWLQN: Val and Grad Norm: 0,636078 (rel: 4,18e-08) 0,000108471
21/01/30 13:15:13 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:13 INFO OWLQN: Val and Grad Norm: 0,636078 (rel: 2,84e-08) 0,000167274
21/01/30 13:15:13 INFO OWLQN: Step Size: 0,5000
21/01/30 13:15:13 INF

21/01/30 13:15:14 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:14 INFO OWLQN: Val and Grad Norm: 0,676541 (rel: 3,96e-13) 1,08811e-06
21/01/30 13:15:14 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:14 INFO OWLQN: Val and Grad Norm: 0,676541 (rel: 8,50e-14) 7,84402e-07
21/01/30 13:15:14 INFO OWLQN: Step Size: 1,000
21/01/30 13:15:14 INFO OWLQN: Val and Grad Norm: 0,676541 (rel: 6,60e-14) 4,70327e-07
21/01/30 13:15:14 INFO OWLQN: Converged because gradient converged
21/01/30 13:15:15 INFO OWLQN: Converged because gradient converged
21/01/30 13:15:15 INFO StrongWolfeLineSearch: Line search t: 0.6926436411633152 fval: 0.6533712794724852 rhs: 0.6927764771545549 cdd: 4.0594802958929407E-4
21/01/30 13:15:15 INFO LBFGS: Step Size: 0,6926
21/01/30 13:15:15 INFO LBFGS: Val and Grad Norm: 0,653371 (rel: 0,0569) 0,578588
21/01/30 13:15:15 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:15 INFO LBFGS: Val and Grad Norm: 0,628930 (rel: 0,0374) 0,101760
21/01/30 13:15:15 INFO LBFGS: Step Size: 1,000
21/01/3

21/01/30 13:15:17 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:17 INFO LBFGS: Val and Grad Norm: 0,504931 (rel: 2,73e-10) 3,02990e-06
21/01/30 13:15:17 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:17 INFO LBFGS: Val and Grad Norm: 0,504931 (rel: 7,87e-11) 7,08950e-07
21/01/30 13:15:17 INFO StrongWolfeLineSearch: Line search t: 0.3593667548004782 fval: 0.5049306246222138 rhs: 0.5049306246228531 cdd: -1.1237904468019169E-16
21/01/30 13:15:17 INFO LBFGS: Step Size: 0,3594
21/01/30 13:15:17 INFO LBFGS: Val and Grad Norm: 0,504931 (rel: 1,27e-12) 2,04092e-06
21/01/30 13:15:17 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:17 INFO LBFGS: Val and Grad Norm: 0,504931 (rel: 1,19e-12) 8,86215e-08
21/01/30 13:15:17 INFO LBFGS: Converged because gradient converged


[32mimport [39m[36morg.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

[39m
[36mparamGrid[39m: [32mArray[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mparam[39m.[32mParamMap[39m] = [33mArray[39m(
  {
	logreg_eaf3838d71b1-elasticNetParam: 0.0,
	logreg_eaf3838d71b1-fitIntercept: true,
	logreg_eaf3838d71b1-regParam: 0.01
},
  {
	logreg_eaf3838d71b1-elasticNetParam: 0.5,
	logreg_eaf3838d71b1-fitIntercept: true,
	logreg_eaf3838d71b1-regParam: 0.01
},
  {
	logreg_eaf3838d71b1-elasticNetParam: 1.0,
	logreg_eaf3838d71b1-fitIntercept: true,
	logreg_eaf3838d71b1-regParam: 0.01
},
  {
	logreg_eaf3838d71b1-elasticNetParam: 0.0,
	logreg_eaf3838d71b1-fitIntercept: false,
	logreg_eaf3838d71b1-regParam: 0.01
},
  {
	logreg_eaf3838d71b1-elasticNetParam: 0.5,
	logreg_eaf3838d71b1-fitIntercept: false,
	logreg_eaf3838d71b1-regParam: 0.01
},
  {
	logreg_eaf3838d71b1-elasticNetParam: 1.0,
	logreg_eaf3838d71b1-fitIntercept: false,
	logreg_eaf3838d71b1-regP

In [30]:
model.bestModel.extractParamMap()

[36mres29[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mparam[39m.[32mParamMap[39m = {
	logreg_eaf3838d71b1-aggregationDepth: 2,
	logreg_eaf3838d71b1-elasticNetParam: 0.0,
	logreg_eaf3838d71b1-family: auto,
	logreg_eaf3838d71b1-featuresCol: selectedFeatures,
	logreg_eaf3838d71b1-fitIntercept: true,
	logreg_eaf3838d71b1-labelCol: target,
	logreg_eaf3838d71b1-maxIter: 1000,
	logreg_eaf3838d71b1-predictionCol: prediction,
	logreg_eaf3838d71b1-probabilityCol: probability,
	logreg_eaf3838d71b1-rawPredictionCol: rawPrediction,
	logreg_eaf3838d71b1-regParam: 0.01,
	logreg_eaf3838d71b1-standardization: true,
	logreg_eaf3838d71b1-threshold: 0.5,
	logreg_eaf3838d71b1-tol: 1.0E-6
}

In [31]:
val bestML = new LogisticRegression()
        .setMaxIter(1000)
        .setRegParam(0.01)
        .setElasticNetParam(0.0)
        .setFeaturesCol("selectedFeatures")
        .setLabelCol("target")

[36mbestML[39m: [32mLogisticRegression[39m = logreg_0648cd3d9e7f

## Собираем всё вместе (Pipeline)

1. Отобрали числовые признаки: numericColumnsFinal
2. Проиндексировали строковые признаки: indexer
3. Закодировали категориальные признки: encoder
4. Собрали признаки в вектор: assembler
5. Нормализовали признаки: scaler
6. Провели отбор признаков: selector
7. Рассчитали модель: bestML

In [32]:
import org.apache.spark.ml.Pipeline

val pipeline = new Pipeline().setStages(Array(indexer, encoder, assembler, scaler, selector, bestML))

[32mimport [39m[36morg.apache.spark.ml.Pipeline

[39m
[36mpipeline[39m: [32mPipeline[39m = pipeline_8e25609a8360

In [33]:
val ttData = data.randomSplit(Array(0.7, 0.3))
val trainingData = ttData(0)
val testData = ttData(1)

val pipelineModel = pipeline.fit(trainingData)

21/01/30 13:15:19 INFO StrongWolfeLineSearch: Line search t: 1.0839137451838616 fval: 0.6351667053803528 rhs: 0.6928285232187219 cdd: 0.0011142331008877008
21/01/30 13:15:19 INFO LBFGS: Step Size: 1,084
21/01/30 13:15:19 INFO LBFGS: Val and Grad Norm: 0,635167 (rel: 0,0832) 0,880028
21/01/30 13:15:19 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:19 INFO LBFGS: Val and Grad Norm: 0,556800 (rel: 0,123) 0,397917
21/01/30 13:15:19 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:19 INFO LBFGS: Val and Grad Norm: 0,524839 (rel: 0,0574) 0,0680475
21/01/30 13:15:19 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:19 INFO LBFGS: Val and Grad Norm: 0,521263 (rel: 0,00681) 0,0503016
21/01/30 13:15:19 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:19 INFO LBFGS: Val and Grad Norm: 0,517784 (rel: 0,00667) 0,0409918
21/01/30 13:15:19 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:19 INFO LBFGS: Val and Grad Norm: 0,512974 (rel: 0,00929) 0,00959991
21/01/30 13:15:19 INFO LBFGS: Step Size: 1,000
21/01/30 13:15:19 INFO

[36mttData[39m: [32mArray[39m[[32mDataset[39m[[32mRow[39m]] = [33mArray[39m(
  [Attrition_Flag: string, Customer_Age: int ... 19 more fields],
  [Attrition_Flag: string, Customer_Age: int ... 19 more fields]
)
[36mtrainingData[39m: [32mDataset[39m[[32mRow[39m] = [Attrition_Flag: string, Customer_Age: int ... 19 more fields]
[36mtestData[39m: [32mDataset[39m[[32mRow[39m] = [Attrition_Flag: string, Customer_Age: int ... 19 more fields]
[36mpipelineModel[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mPipelineModel[39m = pipeline_8e25609a8360

## Сохраняем модель

In [34]:
pipelineModel.write.overwrite.save("pipelineModel")