In [None]:
val USER = "***"
val HDFS_DIR = s"/user/$USER/users-items/20200429"
val OUT_DIR = s"/user/$USER/features"

In [184]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame

In [1]:
spark.conf.set("spark.sql.session.timeZone", "UTC")

# get weblogs.json data

In [186]:
val input = spark.read
                .option("header",true)
                .json("/labs/laba03/weblogs.json").toDF
                .select('uid, explode(col("visits")))
                .select('uid, col("col.*"))
                .toDF

input = [uid: string, timestamp: bigint ... 1 more field]


[uid: string, timestamp: bigint ... 1 more field]

In [3]:
val webLogs = input
    .withColumn("timestamp", to_utc_timestamp(from_unixtime('timestamp / 1000), "UTC"))
    .na.drop(List("uid"))
    .withColumn("url", lower(callUDF("parse_url", col("url"), lit("HOST"))))
    .withColumn("url", regexp_replace(col("url"), "www.", ""))
    .withColumn("url", regexp_replace(col("url"), "[.]", "-"))
    .na.drop(List("url"))

webLogs = [uid: string, timestamp: timestamp ... 1 more field]


[uid: string, timestamp: timestamp ... 1 more field]

In [4]:
webLogs.printSchema

root
 |-- uid: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- url: string (nullable = true)



# domain_features

In [189]:
val top_domains = webLogs
    .groupBy('url)
    .count
    .na.drop(List("url"))
    .orderBy('count.desc)
    .limit(1000)
    .orderBy('url.asc)
    .select('url)
    .rdd.map(r => r(0)).collect

top_domains = Array(-kasparov-ru, 0629-com-ua, 1001eda-com, 1001golos-ru, 1001goroskop-ru, 1001tur-ru, 101-ru, 11x11-ru, 123greetings-com, 123magazin-ru, 1prime-ru, 2012god-ru, 2015godkozy-com, 24au-ru, 24open-ru, 2ip-ru, 360-ru, 3d-galleru-ru, 3dnews-ru, 4tololo-ru, 5lb-ru, 62-ua, 7ba-ru, 7d-ru, 7days-ru, 9111-ru, 999-md, a-napishem-com, a-piter-ru, abc-people-com, account-forex4you-org, accuweather-com, actuallno-com, ad-adriver-ru, adidas-ru, adme-ru, adultmanga-ru, adultmult-tv, afisha-1ru-tv, aftershock-su, aif-ru, aif-ua, akusherstvo-ru, alfabank-ru, allday2-com, alleng-ru, allrecipes-ru, alpindustria-ru, amalgama-lab-com, amdm-ru, amic-ru, amoory-com, amway-ru, anekdot-ru, anekdotov-net, anistar-ru, anna-news-info, anysex-com, api-oktools-ru, app-facebook-com, apps-fa...


Array(-kasparov-ru, 0629-com-ua, 1001eda-com, 1001golos-ru, 1001goroskop-ru, 1001tur-ru, 101-ru, 11x11-ru, 123greetings-com, 123magazin-ru, 1prime-ru, 2012god-ru, 2015godkozy-com, 24au-ru, 24open-ru, 2ip-ru, 360-ru, 3d-galleru-ru, 3dnews-ru, 4tololo-ru, 5lb-ru, 62-ua, 7ba-ru, 7d-ru, 7days-ru, 9111-ru, 999-md, a-napishem-com, a-piter-ru, abc-people-com, account-forex4you-org, accuweather-com, actuallno-com, ad-adriver-ru, adidas-ru, adme-ru, adultmanga-ru, adultmult-tv, afisha-1ru-tv, aftershock-su, aif-ru, aif-ua, akusherstvo-ru, alfabank-ru, allday2-com, alleng-ru, allrecipes-ru, alpindustria-ru, amalgama-lab-com, amdm-ru, amic-ru, amoory-com, amway-ru, anekdot-ru, anekdotov-net, anistar-ru, anna-news-info, anysex-com, api-oktools-ru, app-facebook-com, apps-fa...

In [190]:
val topWebLogs = webLogs.filter('url.isInCollection(top_domains))

topWebLogs = [uid: string, timestamp: bigint ... 1 more field]


[uid: string, timestamp: bigint ... 1 more field]

In [191]:
val topWebMatrix = topWebLogs
    .groupBy("uid", "url")
    .count
    .groupBy("uid")
    .pivot("url")
    .sum("count")
    .na.fill(0)

topWebMatrix = [uid: string, -kasparov-ru: bigint ... 999 more fields]


[uid: string, -kasparov-ru: bigint ... 999 more fields]

In [192]:
val col_arr = topWebMatrix.columns.filter(_ != "uid")
val topDomainFeatures = topWebMatrix.select('uid, array(col_arr.map(col):_*).as("domain_features"))

col_arr = Array(-kasparov-ru, 0629-com-ua, 1001eda-com, 1001golos-ru, 1001goroskop-ru, 1001tur-ru, 101-ru, 11x11-ru, 123greetings-com, 123magazin-ru, 1prime-ru, 2012god-ru, 2015godkozy-com, 24au-ru, 24open-ru, 2ip-ru, 360-ru, 3d-galleru-ru, 3dnews-ru, 4tololo-ru, 5lb-ru, 62-ua, 7ba-ru, 7d-ru, 7days-ru, 9111-ru, 999-md, a-napishem-com, a-piter-ru, abc-people-com, account-forex4you-org, accuweather-com, actuallno-com, ad-adriver-ru, adidas-ru, adme-ru, adultmanga-ru, adultmult-tv, afisha-1ru-tv, aftershock-su, aif-ru, aif-ua, akusherstvo-ru, alfabank-ru, allday2-com, alleng-ru, allrecipes-ru, alpindustria-ru, amalgama-lab-com, amdm-ru, amic-ru, amoory-com, amway-ru, anekdot-ru, anekdotov-net, anistar-ru, anna-news-info, anysex-com, api-oktools-ru, app-facebook-com, apps-fac...


Array(-kasparov-ru, 0629-com-ua, 1001eda-com, 1001golos-ru, 1001goroskop-ru, 1001tur-ru, 101-ru, 11x11-ru, 123greetings-com, 123magazin-ru, 1prime-ru, 2012god-ru, 2015godkozy-com, 24au-ru, 24open-ru, 2ip-ru, 360-ru, 3d-galleru-ru, 3dnews-ru, 4tololo-ru, 5lb-ru, 62-ua, 7ba-ru, 7d-ru, 7days-ru, 9111-ru, 999-md, a-napishem-com, a-piter-ru, abc-people-com, account-forex4you-org, accuweather-com, actuallno-com, ad-adriver-ru, adidas-ru, adme-ru, adultmanga-ru, adultmult-tv, afisha-1ru-tv, aftershock-su, aif-ru, aif-ua, akusherstvo-ru, alfabank-ru, allday2-com, alleng-ru, allrecipes-ru, alpindustria-ru, amalgama-lab-com, amdm-ru, amic-ru, amoory-com, amway-ru, anekdot-ru, anekdotov-net, anistar-ru, anna-news-info, anysex-com, api-oktools-ru, app-facebook-com, apps-fac...

In [193]:
topDomainFeatures.printSchema

root
 |-- uid: string (nullable = true)
 |-- domain_features: array (nullable = false)
 |    |-- element: long (containsNull = false)



In [194]:
val uid_top_visitors = topDomainFeatures.select('uid).rdd.map(r => r(0)).collect

uid_top_visitors = Array(e7267fb3-c6e7-4a7b-8810-536b07f7d092, 1ba0d2cb-e4b8-4a1b-b4b3-41607fbf70ca, 29b41418-ceff-44ab-be66-150acb8cf571, b966b7ec-c6a2-4f87-8762-9a3a526bed00, 77108b3a-10fd-48f5-a392-0f50fcd5f290, bbcea5e3-bab2-47a0-85ed-b0c5799212b8, cd90fa40-b226-43b2-bf24-e79a5145f4ed, 946c3fbd-9100-4afb-857c-d7060471989c, 853148d9-6fff-4c42-bda3-582f68b52b13, fc5f88c4-20a1-4f2a-b2ef-4850b394ea6b, 310dfbe9-cac6-4d79-a984-0bc940b9581e, 9fcc20f8-b187-40a0-bd19-39f37e19079e, 7bb2f2b1-c97a-4a3a-8442-d89c8e71b1a2, 552f8145-f5ea-4b91-a88a-e0d284c44428, 31e43b0b-2371-405d-b5a5-587091be1737, 27f0a457-c86b-4d97-9ba9-d9bfd0e22c42, 1bc25004-16a1-4543-a3e6-ee6f2c2a4592, c1bf0d2b-6e76-4633-b058-e86b9d02a0c2, d1b25e00-e63e-4e6d-9b76-cc8cb6cbec77, 9d753f38-a074-4eec-9ef1-eae117787862, ...


Array(e7267fb3-c6e7-4a7b-8810-536b07f7d092, 1ba0d2cb-e4b8-4a1b-b4b3-41607fbf70ca, 29b41418-ceff-44ab-be66-150acb8cf571, b966b7ec-c6a2-4f87-8762-9a3a526bed00, 77108b3a-10fd-48f5-a392-0f50fcd5f290, bbcea5e3-bab2-47a0-85ed-b0c5799212b8, cd90fa40-b226-43b2-bf24-e79a5145f4ed, 946c3fbd-9100-4afb-857c-d7060471989c, 853148d9-6fff-4c42-bda3-582f68b52b13, fc5f88c4-20a1-4f2a-b2ef-4850b394ea6b, 310dfbe9-cac6-4d79-a984-0bc940b9581e, 9fcc20f8-b187-40a0-bd19-39f37e19079e, 7bb2f2b1-c97a-4a3a-8442-d89c8e71b1a2, 552f8145-f5ea-4b91-a88a-e0d284c44428, 31e43b0b-2371-405d-b5a5-587091be1737, 27f0a457-c86b-4d97-9ba9-d9bfd0e22c42, 1bc25004-16a1-4543-a3e6-ee6f2c2a4592, c1bf0d2b-6e76-4633-b058-e86b9d02a0c2, d1b25e00-e63e-4e6d-9b76-cc8cb6cbec77, 9d753f38-a074-4eec-9ef1-eae117787862, ...

In [195]:
val inferriorDomainFeatures = webLogs
    .select('uid)
    .filter(!col("uid").isInCollection(uid_top_visitors))
    .dropDuplicates("uid")
    .withColumn("domain_features", array(lit(0).cast(LongType)))

inferriorDomainFeatures = [uid: string, domain_features: array<bigint>]


[uid: string, domain_features: array<bigint>]

In [196]:
val domainFeatures = topDomainFeatures.union(inferriorDomainFeatures).withColumnRenamed("uid", "uid1")

domainFeatures = [uid1: string, domain_features: array<bigint>]


[uid1: string, domain_features: array<bigint>]

# web_day

In [9]:
val daysWebMatrix = webLogs
    .withColumn("day_of_week", concat(lit("web_day_"), lower(date_format(col("timestamp"), "E"))))
    .drop("timestamp")
    .groupBy("uid", "day_of_week")
    .count
    .groupBy("uid")
    .pivot("day_of_week")
    .sum("count")
    .na.fill(0)
    .withColumnRenamed("uid", "uid_days")

daysWebMatrix = [uid_days: string, web_day_fri: bigint ... 6 more fields]


[uid_days: string, web_day_fri: bigint ... 6 more fields]

In [10]:
daysWebMatrix.printSchema

root
 |-- uid_days: string (nullable = true)
 |-- web_day_fri: long (nullable = false)
 |-- web_day_mon: long (nullable = false)
 |-- web_day_sat: long (nullable = false)
 |-- web_day_sun: long (nullable = false)
 |-- web_day_thu: long (nullable = false)
 |-- web_day_tue: long (nullable = false)
 |-- web_day_wed: long (nullable = false)



# web_hour

In [11]:
val hoursWebMatrix = webLogs
    .withColumn("hour", concat(lit("web_hour_"), date_format(col("timestamp"), "k")))
    .drop("timestamp")
    .groupBy("uid", "hour")
    .count
    .groupBy("uid")
    .pivot("hour")
    .sum("count")
    .na.fill(0)
    .withColumnRenamed("uid", "uid_hours")

hoursWebMatrix = [uid_hours: string, web_hour_1: bigint ... 23 more fields]


[uid_hours: string, web_hour_1: bigint ... 23 more fields]

In [12]:
hoursWebMatrix.printSchema

root
 |-- uid_hours: string (nullable = true)
 |-- web_hour_1: long (nullable = false)
 |-- web_hour_10: long (nullable = false)
 |-- web_hour_11: long (nullable = false)
 |-- web_hour_12: long (nullable = false)
 |-- web_hour_13: long (nullable = false)
 |-- web_hour_14: long (nullable = false)
 |-- web_hour_15: long (nullable = false)
 |-- web_hour_16: long (nullable = false)
 |-- web_hour_17: long (nullable = false)
 |-- web_hour_18: long (nullable = false)
 |-- web_hour_19: long (nullable = false)
 |-- web_hour_2: long (nullable = false)
 |-- web_hour_20: long (nullable = false)
 |-- web_hour_21: long (nullable = false)
 |-- web_hour_22: long (nullable = false)
 |-- web_hour_23: long (nullable = false)
 |-- web_hour_24: long (nullable = false)
 |-- web_hour_3: long (nullable = false)
 |-- web_hour_4: long (nullable = false)
 |-- web_hour_5: long (nullable = false)
 |-- web_hour_6: long (nullable = false)
 |-- web_hour_7: long (nullable = false)
 |-- web_hour_8: long (nullable = fal

# web_fraction

In [30]:
val fractWebHours = webLogs
    .withColumn("hour", date_format(col("timestamp"), "k"))
    .drop("timestamp")
    .groupBy("uid")
    .agg(
        (sum(when('hour >= 9 && 'hour < 18, 1).otherwise(0)) / sum(when('hour >= 0 && 'hour <= 23, 1).otherwise(0)))
            .as("web_fraction_work_hours"),
        (sum(when('hour >= 18 && 'hour <= 23, 1).otherwise(0)) / sum(when('hour >= 0 && 'hour <= 23, 1).otherwise(0)))
            .as("web_fraction_evening_hours")
    )
    .na.fill(0)
    .withColumnRenamed("uid", "uid_fract")

fractWebHours = [uid_fract: string, web_fraction_work_hours: double ... 1 more field]


[uid_fract: string, web_fraction_work_hours: double ... 1 more field]

In [31]:
fractWebHours.printSchema

root
 |-- uid_fract: string (nullable = true)
 |-- web_fraction_work_hours: double (nullable = false)
 |-- web_fraction_evening_hours: double (nullable = false)



# get users-items matrix

In [15]:
val usersItems = spark.read.parquet(HDFS_DIR)

usersItems = [uid: string, buy_cameras_0: bigint ... 639 more fields]


[uid: string, buy_cameras_0: bigint ... 639 more fields]

# union all data

In [32]:
val webDF = domainFeatures
    .join(daysWebMatrix, domainFeatures("uid1") === daysWebMatrix("uid_days"), "inner").drop("uid_days")
    .join(hoursWebMatrix, domainFeatures("uid1") === hoursWebMatrix("uid_hours"), "inner").drop("uid_hours")
    .join(fractWebHours, domainFeatures("uid1") === fractWebHours("uid_fract"), "inner").drop("uid_fract")

webDF = [uid1: string, domain_features: array<bigint> ... 33 more fields]


[uid1: string, domain_features: array<bigint> ... 33 more fields]

In [33]:
webDF.printSchema

root
 |-- uid1: string (nullable = true)
 |-- domain_features: array (nullable = false)
 |    |-- element: long (containsNull = false)
 |-- web_day_fri: long (nullable = false)
 |-- web_day_mon: long (nullable = false)
 |-- web_day_sat: long (nullable = false)
 |-- web_day_sun: long (nullable = false)
 |-- web_day_thu: long (nullable = false)
 |-- web_day_tue: long (nullable = false)
 |-- web_day_wed: long (nullable = false)
 |-- web_hour_1: long (nullable = false)
 |-- web_hour_10: long (nullable = false)
 |-- web_hour_11: long (nullable = false)
 |-- web_hour_12: long (nullable = false)
 |-- web_hour_13: long (nullable = false)
 |-- web_hour_14: long (nullable = false)
 |-- web_hour_15: long (nullable = false)
 |-- web_hour_16: long (nullable = false)
 |-- web_hour_17: long (nullable = false)
 |-- web_hour_18: long (nullable = false)
 |-- web_hour_19: long (nullable = false)
 |-- web_hour_2: long (nullable = false)
 |-- web_hour_20: long (nullable = false)
 |-- web_hour_21: long (nul

In [34]:
val resDF = usersItems.join(webDF, usersItems("uid") === webDF("uid1"), "full").drop("uid1")

resDF = [uid: string, buy_cameras_0: bigint ... 673 more fields]


[uid: string, buy_cameras_0: bigint ... 673 more fields]

In [35]:
resDF
    .write
    .format("parquet")
    .mode("overwrite")
    .save(OUT_DIR)