## Create session

In [1]:
import scala.collection.mutable.ArrayBuffer
import sys.process._ // for using console commands

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SQLContext, DataFrame, Row}

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.linalg.{Vector}
import org.apache.spark.ml.feature.{VectorAssembler}
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// SparkSession is referenced below but is not among the notebook's explicit imports;
// importing it here keeps the cell runnable in kernels that do not pre-import it.
import org.apache.spark.sql.SparkSession

// Resource and SQL settings for this lab job.
// NOTE(review): per the Spark configuration docs, spark.driver.memory set through
// SparkConf has no effect in client mode because the driver JVM is already running —
// confirm the deploy mode or pass it via the kernel/spark-submit options instead.
val conf = new SparkConf()
               .setAppName("artem.spitsin_lab05")
               .set("spark.driver.cores", "2")
               .set("spark.driver.memory", "1g")
               .set("spark.executor.instances", "5")
               .set("spark.executor.cores", "6")
               .set("spark.executor.memory", "4g")
               .set("spark.executor.memoryOverhead", "2g")
               .set("spark.sql.autoBroadcastJoinThreshold", "-1")

// Session used by the rest of the notebook (an already running session is reused
// by getOrCreate()).
val ss = SparkSession
         .builder()
         .appName("artem.spitsin_lab05")
         .config(conf)
         .enableHiveSupport()
         .getOrCreate()

// The context is never reassigned, so an immutable binding is sufficient.
val sc = ss.sparkContext
// The public SQLContext constructor is deprecated since Spark 2.0; reuse the
// instance that belongs to the session instead of building a new wrapper.
val sqlContext = ss.sqlContext

conf = org.apache.spark.SparkConf@21273bf5
ss = org.apache.spark.sql.SparkSession@3c320d04
sc = org.apache.spark.SparkContext@31c2aa74
sqlContext = org.apache.spark.sql.SQLContext@59cb8...




org.apache.spark.sql.SQLContext@59cb8...

## Functions

In [8]:
/** Reads a CSV file with a header row into a DataFrame, letting Spark infer column types.
  *
  * The read goes through `ss`, the session this notebook configures explicitly,
  * rather than through a kernel-provided global session.
  *
  * @param path location of the CSV file
  * @return the loaded data without the `_c0` column
  */
def load_data(path: String): DataFrame = {
    ss.read
    .format("csv")
    .option("encoding", "UTF-8")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path)
    // presumably `_c0` is an unnamed index column written by the exporter — verify against the files
    .drop("_c0")
}

/** Returns `data` without its string-typed columns.
  *
  * @param data input DataFrame
  * @return a DataFrame containing every column of `data` whose type name is not "StringType"
  */
def drop_str_cols(data: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // dtypes yields (column name, type name) pairs; keep only the names of string columns.
    val stringColumns = data.dtypes.collect { case (name, "StringType") => name }

    data.drop(stringColumns: _*)
}

// UDF that converts an ML vector column into an array of doubles, so that
// individual components can be picked with getItem.
val vector2array = udf { (vec: Vector) => vec.toArray }

vector2array = UserDefinedFunction(<function1>,ArrayType(DoubleType,false),Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


load_data: (path: String)org.apache.spark.sql.DataFrame
drop_str_cols: (data: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


UserDefinedFunction(<function1>,ArrayType(DoubleType,false),Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

## Constants

In [9]:
// Name of the label column.
val NAME_TARGET = "TARGET"
// Columns that are present in the data but are never used as model features.
val SERVICE_COLS: List[String] = "ID" :: NAME_TARGET :: Nil

NAME_TARGET = TARGET
SERVICE_COLS = List(ID, TARGET)


List(ID, TARGET)

## Loading data

In [10]:
// Load both datasets and discard their string-typed columns straight away.
// Both bindings stay mutable because later cells reassign them.
var train_data = drop_str_cols(load_data(path = "/labs/slaba05/lab05_train.csv"))
var test_data = drop_str_cols(load_data(path = "/labs/slaba05/lab05_test.csv"))

Waiting for a Spark session to start...

train_data = [ID: int, CR_PROD_CNT_IL: int ... 101 more fields]
test_data = [ID: int, CR_PROD_CNT_IL: int ... 100 more fields]
train_data = [ID: int, CR_PROD_CNT_IL: int ... 101 more fields]
test_data = [ID: int, CR_PROD_CNT_IL: int ... 100 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 100 more fields]

## Fill NaN values

In [11]:
// Training rows without a label are discarded first; remaining gaps are then
// replaced with the sentinel value -1e5.
train_data = train_data
    .where("%s is not null".format(NAME_TARGET))
    .na.fill(-1e5)

// The test set gets the same sentinel (no label filtering is applied to it).
test_data = test_data.na.fill(-1e5)

train_data = [ID: int, CR_PROD_CNT_IL: int ... 101 more fields]
train_data = [ID: int, CR_PROD_CNT_IL: int ... 101 more fields]
test_data = [ID: int, CR_PROD_CNT_IL: int ... 100 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 100 more fields]

## Feature selection and preparation for the model

In [12]:
// Every training column except the service columns is used as a model feature.
var select_features = ArrayBuffer(
    train_data.columns.filterNot(name => SERVICE_COLS.contains(name)): _*
)

select_features = ArrayBuffer(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, AMOUNT_RUB_NAS_PRC, TRANS_COUNT_SUP_PRC, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, CNT_ACCEPTS_MTP, CR_PROD_CNT_TOVR, CR_PROD_CNT_PIL, SUM_TRAN_CLO_TENDENCY1M, TURNOVER_CC, TRANS_COUNT_ATM_PRC, AMOUNT_RUB_ATM_PRC, TUR...


ArrayBuffer(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, AMOUNT_RUB_NAS_PRC, TRANS_COUNT_SUP_PRC, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, CNT_ACCEPTS_MTP, CR_PROD_CNT_TOVR, CR_PROD_CNT_PIL, SUM_TRAN_CLO_TENDENCY1M, TURNOVER_CC, TRANS_COUNT_ATM_PRC, AMOUNT_RUB_ATM_PRC, TUR...

In [13]:
// Packs the selected feature columns into a single vector column named "features".
val assembler_values = new VectorAssembler()
    .setOutputCol("features")
    .setInputCols(select_features.toArray)

// Both assembled datasets are marked for caching because they are reused below.
train_data = assembler_values.transform(train_data).cache()
test_data = assembler_values.transform(test_data).cache()

// Print the row counts: training set first, then test set.
Seq(train_data, test_data).foreach(dataset => println(dataset.count()))

320763
44399


assembler_values = vecAssembler_eb3daa771a60
train_data = [ID: int, CR_PROD_CNT_IL: int ... 102 more fields]
test_data = [ID: int, CR_PROD_CNT_IL: int ... 101 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 101 more fields]

## Training model

In [14]:
// Gradient-boosted trees classifier: 70 boosting iterations, trees of depth at most 5,
// fitted on the assembled training data.
val booster = {
    val estimator = new GBTClassifier()
        .setLabelCol(NAME_TARGET)
        .setFeaturesCol("features")
        .setMaxIter(70)
        .setMaxDepth(5)

    estimator.fit(train_data)
}

booster = GBTClassificationModel (uid=gbtc_f7f524e1baad) with 70 trees


GBTClassificationModel (uid=gbtc_f7f524e1baad) with 70 trees

In [15]:
// Score the same data the model was fitted on; any metric computed from this
// is a training-set metric, not a hold-out estimate.
val train_predictions: DataFrame = booster.transform(train_data)

train_predictions = [ID: int, CR_PROD_CNT_IL: int ... 105 more fields]


[ID: int, CR_PROD_CNT_IL: int ... 105 more fields]

## Evaluation

In [16]:
// Area under the ROC curve, computed from the "probability" column against the label.
val evaluator = new BinaryClassificationEvaluator()
    .setMetricName("areaUnderROC")
    .setRawPredictionCol("probability")
    .setLabelCol(NAME_TARGET)

// Evaluated on the training predictions.
evaluator.evaluate(train_predictions)

evaluator = binEval_3e8c51102242


0.845954265858909

## Submitting

In [17]:
// Submission frame: the record id plus element 1 of the probability vector.
val test_predictions = {
    val scoreAtIndexOne = vector2array(col("probability")).getItem(1)

    booster
        .transform(test_data)
        .select(col("ID").alias("id"), scoreAtIndexOne.alias("target"))
}

// Write a single-partition, tab-separated CSV with a header, replacing any previous output.
test_predictions
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("sep", "\t")
    .csv("lab05.csv")

// Copy the written output from HDFS into the local working directory.
"""hdfs dfs -get lab05.csv""".!!

test_predictions = [id: int, target: double]


res47: String = ""


[id: int, target: double]

## Stop session

In [18]:
// Drop cached data, then shut the session down. The finally block guarantees that
// the session is stopped even if clearing the cache fails.
try {
    ss.catalog.clearCache()
} finally {
    ss.stop()
}