In [4]:
import org.apache.spark.sql.functions._

In [1]:
// Load the raw visit log and flatten it to one row per (user, visit).
// Resulting columns, per the schema printed in the next cell:
// uid, gender_age, timestamp, url.
val df = spark
    .read
    // No "header" option here: it is a CSV reader option and is not used by the JSON source.
    .json("/labs/laba07/laba07.json")
    // explode() emits one row per element of `visits`, in a struct column named "col" by default.
    .select('*, explode(col("visits")))
    // Keep the user attributes and expand the visit struct into top-level columns.
    .select('uid, 'gender_age, col("col.*"))
    .na.drop(List("uid"))
    // Reduce each URL to its lower-cased host via the SQL function parse_url(url, 'HOST').
    .withColumn("url", lower(callUDF("parse_url", col("url"), lit("HOST"))))
    // Strip only a leading literal "www.". The previous pattern "www." was unanchored and
    // its dot matched any character, so it could also eat "www" + one char inside a host.
    .withColumn("url", regexp_replace(col("url"), "^www\\.", ""))
    // Discard visits whose url ended up null after the host extraction.
    .na.drop(List("url"))

Waiting for a Spark session to start...

df = [uid: string, gender_age: string ... 2 more fields]


[uid: string, gender_age: string ... 2 more fields]

In [2]:
// Print the column names and types of the flattened visit DataFrame.
df.printSchema()

root
 |-- uid: string (nullable = true)
 |-- gender_age: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- url: string (nullable = true)



In [3]:
// Build the training set: one row per (uid, gender_age) with every visited
// host collected into the array column "domains" (duplicates are kept).
val train_df = df
    .groupBy(col("uid"), col("gender_age"))
    .agg(
        collect_list(col("url")).alias("domains")
    )

train_df = [uid: string, gender_age: string ... 1 more field]


[uid: string, gender_age: string ... 1 more field]

In [4]:
// Preview the first rows of the aggregated training set.
train_df.show()

+--------------------+----------+--------------------+
|                 uid|gender_age|             domains|
+--------------------+----------+--------------------+
|032f99d3-18c1-40e...|   F:25-34|[http, cabinet.sa...|
|0355d721-ad22-473...|   F:25-34|[mail.qip.ru, mai...|
|03f6cef2-5208-43f...|   M:35-44|[myvideopl.com, 4...|
|03fb8ac3-2b5c-4da...|   F:25-34|[cenamashin.ru, o...|
|0520b6a6-4397-42f...|   M:25-34|[simpsonsvideos.r...|
|0552bcaf-c256-474...|   M:35-44|[urbangroup.ru, l...|
|09e2b2ed-eea5-457...|   F:35-44|[wh-lady.ru, post...|
|0a5a08ce-2645-483...|   F:25-34|     [hullabaloo.ru]|
|0a87adc3-b78d-4cd...|   M:25-34|[bigcinema.tv, bi...|
|0b130223-5e9c-4f1...|   F:25-34|[baskino.com, avi...|
|0b2cc555-2e1c-415...|   M:25-34|[avito.ru, avito....|
|0e0ae02c-eaec-42e...|   F:25-34|[bankreferatov.ru...|
|0e314d01-3ca5-4eb...|   F:25-34|[an.yandex.ru, ma...|
|0f1b8aaa-afc5-4c0...|   M:25-34|[tvcok.ru, tvcok....|
|0fb2fe52-6fc2-429...|   F:35-44|[longdress.ru, un...|
|1163e860-

In [5]:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{CountVectorizer, StringIndexer, IndexToString}
import org.apache.spark.ml.{Pipeline, PipelineModel}

In [6]:
// Turns the "domains" array into a term-count vector in "features".
val cv = new CountVectorizer()
  .setOutputCol("features")
  .setInputCol("domains")

// Encodes the string class "gender_age" as a numeric "label".
val indexer = new StringIndexer()
  .setOutputCol("label")
  .setInputCol("gender_age")

// Fit the indexer once up front purely to obtain the label ordering,
// which IndexToString needs to map predictions back to class names.
val labels = indexer.fit(train_df).labels

// Classifier; it reads its input/label columns from its defaults.
val lr = new LogisticRegression()
  .setRegParam(0.001)
  .setMaxIter(10)

// Converts the numeric "prediction" back to the original class string in "res".
val revIndexer = new IndexToString()
  .setLabels(labels)
  .setInputCol("prediction")
  .setOutputCol("res")

// Stage order matters: vectorize -> index labels -> classify -> decode prediction.
val pipeline = new Pipeline()
  .setStages(Array(cv, indexer, lr, revIndexer))

cv = cntVec_429b475fd416
indexer = strIdx_c31e32dcf797
labels = Array(M:25-34, F:25-34, M:35-44, F:35-44, F:18-24, F:45-54, M:45-54, M:18-24, F:>=55, M:>=55)
lr = logreg_cc4bbfdc8fbd
revIndexer = idxToStr_43305836d4d9
pipeline = pipeline_4150d16472e5


pipeline_4150d16472e5

In [7]:
// Re-check the schema of the flattened visit DataFrame before fitting.
df.printSchema()

root
 |-- uid: string (nullable = true)
 |-- gender_age: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- url: string (nullable = true)



In [8]:
// Fit every pipeline stage on the aggregated training set.
val model: PipelineModel = pipeline.fit(train_df)

model = pipeline_4150d16472e5


pipeline_4150d16472e5

In [9]:
// Persist the fitted pipeline to "lab07_model", replacing any existing copy at that path.
model
    .write
    .overwrite()
    .save("lab07_model")