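# Generate Skewed Data

Builds a deliberately skewed dataset from the raw CSVs: the 20,000 customers with the most distinct transactions are selected, every one of them with at least 7,000 transactions is collapsed into a single customer ID, and the resulting customer and transaction tables are written out as Parquet.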

## Import Modules

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one, so a cell with several bare expressions displays each result.
InteractiveShell.ast_node_interactivity = "all"

import warnings
# Silence all Python warnings for this session. One catch-all "ignore" filter
# is sufficient: it matches every category, message and module.
warnings.filterwarnings('ignore')

In [3]:
# Driver settings applied to the session builder before the session is created.
driver_settings = {
    "spark.driver.memory": "8g",
    "spark.driver.cores": "4",
}

session_builder = SparkSession.builder
for setting, value in driver_settings.items():
    session_builder = session_builder.config(setting, value)

# Create the session, or reuse one that already exists.
spark = session_builder.getOrCreate()

# Restrict Spark's log output to errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Setting Up Transactions Data

In [4]:
# Relative paths to the two raw CSV inputs; both live in the same directory.
raw_data_dir = "../../data/data_skew"
customers_file = raw_data_dir + "/raw_customers.csv"
txns_file = raw_data_dir + "/raw_transactions.csv"

In [6]:
# Load the raw transactions CSV, taking column names from its header row.
# No schema is supplied, so every column is read as a string.
df_raw_txns = spark.read.csv(path=txns_file, header=True)

In [7]:
# Quick look at the raw transactions: schema first, then three untruncated rows.
df_raw_txns.printSchema()
df_raw_txns.show(n=3, truncate=False)

root
 |-- CUST_ID: string (nullable = true)
 |-- START_DATE: string (nullable = true)
 |-- END_DATE: string (nullable = true)
 |-- TRANS_ID: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- YEAR: string (nullable = true)
 |-- MONTH: string (nullable = true)
 |-- DAY: string (nullable = true)
 |-- EXP_TYPE: string (nullable = true)
 |-- AMOUNT: string (nullable = true)

+----------+----------+--------+---------------+----------+----+-----+---+------------+------+
|CUST_ID   |START_DATE|END_DATE|TRANS_ID       |DATE      |YEAR|MONTH|DAY|EXP_TYPE    |AMOUNT|
+----------+----------+--------+---------------+----------+----+-----+---+------------+------+
|CI6XLYUMQK|2015-05-01|null    |T8I9ZB5A6X90UG8|2015-09-11|2015|9    |11 |Motor/Travel|20.27 |
|CI6XLYUMQK|2015-05-01|null    |TZ4JSLS7SC7FO9H|2017-02-08|2017|2    |8  |Motor/Travel|12.85 |
|CI6XLYUMQK|2015-05-01|null    |TTUKRDDJ6B6F42H|2015-08-01|2015|8    |1  |Housing     |383.8 |
+----------+----------+--------+---------

In [8]:
# Raw (upper-case) column name -> name used in the rest of the notebook.
txn_column_renames = {
    "CUST_ID": "cust_id",
    "START_DATE": "start_date",
    "END_DATE": "end_date",
    "TRANS_ID": "txn_id",
    "DATE": "date",
    "YEAR": "year",
    "MONTH": "month",
    "DAY": "day",
    "EXP_TYPE": "expense_type",
    "AMOUNT": "amt",
}

# Apply the renames one at a time; the data itself is left untouched.
df_txns = df_raw_txns
for raw_name, clean_name in txn_column_renames.items():
    df_txns = df_txns.withColumnRenamed(raw_name, clean_name)

In [9]:
# Confirm the renamed columns: schema first, then three untruncated rows.
df_txns.printSchema()
df_txns.show(n=3, truncate=False)

root
 |-- cust_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- expense_type: string (nullable = true)
 |-- amt: string (nullable = true)

+----------+----------+--------+---------------+----------+----+-----+---+------------+-----+
|cust_id   |start_date|end_date|txn_id         |date      |year|month|day|expense_type|amt  |
+----------+----------+--------+---------------+----------+----+-----+---+------------+-----+
|CI6XLYUMQK|2015-05-01|null    |T8I9ZB5A6X90UG8|2015-09-11|2015|9    |11 |Motor/Travel|20.27|
|CI6XLYUMQK|2015-05-01|null    |TZ4JSLS7SC7FO9H|2017-02-08|2017|2    |8  |Motor/Travel|12.85|
|CI6XLYUMQK|2015-05-01|null    |TTUKRDDJ6B6F42H|2015-08-01|2015|8    |1  |Housing     |383.8|
+----------+----------+--------+---------------+

# Setting Up Customer Data

In [10]:
# Load the raw customer profiles CSV, taking column names from its header row.
# No schema is supplied, so every column is read as a string.
df_customer_det = spark.read.csv(path=customers_file, header=True)

In [11]:
# Quick look at the customer profiles: schema first, then three untruncated rows.
df_customer_det.printSchema()
df_customer_det.show(n=3, truncate=False)

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- zip: string (nullable = true)

+--------------+---+------+---------+-----+
|name          |age|gender|birthday |zip  |
+--------------+---+------+---------+-----+
|Carolyn Mathis|63 |Male  |3/7/1975 |49241|
|Anthony Lamb  |30 |Female|6/25/1987|37320|
|Eliza Bryan   |20 |Male  |9/13/1985|12568|
+--------------+---+------+---------+-----+
only showing top 3 rows



In [12]:
# Select the 20,000 customers with the most distinct transactions and give
# each a sequential row_id (starting at 1, in cust_id order). The row_id is
# used later to pair each customer with one customer profile.
df_top20k_customers = (
    df_txns
    .groupBy("cust_id")
    .agg(F.countDistinct("txn_id").alias("distinct_txns"))
    # Several customers share the same transaction count, so sorting on the
    # count alone leaves the rows kept by limit() undefined at the cut-off.
    # cust_id as a secondary key makes the selection repeatable across runs.
    .orderBy(F.desc("distinct_txns"), F.asc("cust_id"))
    .limit(20000)
    # No partitionBy: the window is evaluated over the whole (small) result.
    .withColumn("row_id", F.row_number().over(Window.orderBy("cust_id")))
)

In [13]:
# Number the customer profiles so each can be paired with one top customer.
# `name` alone may not be unique, and rows that tie on the sort key get an
# arbitrary order, so the remaining profile columns are used as tie-breakers
# to keep the row_id assignment stable between runs.
profile_order = Window.orderBy("name", "birthday", "zip", "age", "gender")
df_customer_det = df_customer_det.withColumn("row_id", F.row_number().over(profile_order))

# Inner join on row_id attaches one profile to each selected customer; the
# helper column is dropped once the pairing is done.
df_customer_identity = df_top20k_customers.join(df_customer_det, "row_id").drop("row_id")

In [14]:
# Inspect the combined customer table: schema, then five untruncated rows.
df_customer_identity.printSchema()
df_customer_identity.show(n=5, truncate=False)

# The number of distinct cust_id values and the total row count are both
# displayed so they can be compared.
df_customer_identity.select("cust_id").distinct().count()
df_customer_identity.count()

root
 |-- cust_id: string (nullable = true)
 |-- distinct_txns: long (nullable = false)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- zip: string (nullable = true)



                                                                                

+----------+-------------+-------------+---+------+----------+-----+
|cust_id   |distinct_txns|name         |age|gender|birthday  |zip  |
+----------+-------------+-------------+---+------+----------+-----+
|C000BK8N2S|6949         |Aaron Abbott |34 |Female|7/13/1991 |97823|
|C005K7U9RE|6540         |Aaron Austin |37 |Female|12/16/2004|30332|
|C006CT8BVO|6356         |Aaron Barnes |29 |Female|3/11/1977 |23451|
|C007YEYTX9|7445         |Aaron Barrett|31 |Male  |7/9/1998  |46613|
|C00B971T1J|7532         |Aaron Becker |54 |Male  |11/24/1979|40284|
+----------+-------------+-------------+---+------+----------+-----+
only showing top 5 rows



                                                                                

20000

                                                                                

20000

# Write Customer Data

In [15]:
# Write the customer table as Parquet, replacing any existing output.
customers_writer = df_customer_identity.write.mode("overwrite")
customers_writer.parquet("../../data/data_skew/customers.parquet")

                                                                                

# Write Skewed Transaction Data

In [16]:
# Count the selected customers that have at least 7000 distinct transactions.
heavy_customers = df_top20k_customers.filter(F.col("distinct_txns") >= 7000)
heavy_customers.distinct().count()

                                                                                

5533

In [17]:
# Customers with at least 7000 distinct transactions are all relabelled with
# one fixed customer ID; every other customer keeps its own ID.
is_heavy_customer = F.col("distinct_txns") >= 7000
skewed_cust_id = (
    F.when(is_heavy_customer, F.lit("C0YDPQWPBJ"))
    .otherwise(F.col("cust_id"))
)

# The inner join keeps only transactions belonging to the selected customers
# and brings in distinct_txns, which the relabelling rule reads.
df_transactions = (
    df_txns
    .join(df_top20k_customers, on="cust_id", how="inner")
    .withColumn("cust_id", skewed_cust_id)
)

In [18]:
# Mark the DataFrame for caching *before* running the first action on it.
# cache() is lazy, so the aggregation below is what fills the cache, and the
# later counts and the Parquet write can reuse it instead of redoing the join.
# cache() returns the same DataFrame; assigning it keeps its repr out of the
# cell output.
df_transactions = df_transactions.cache()

# Show the five customer IDs with the most transactions after relabelling.
df_transactions.groupBy("cust_id").count().orderBy(F.desc("count")).show(5, False)

                                                                                

+----------+--------+
|cust_id   |count   |
+----------+--------+
|C0YDPQWPBJ|43551962|
|CXD6UZEGKS|6999    |
|CP2GC38KPG|6999    |
|CQZK7HS7HL|6999    |
|C1MZ9FNHAN|6999    |
+----------+--------+
only showing top 5 rows



DataFrame[cust_id: string, start_date: string, end_date: string, txn_id: string, date: string, year: string, month: string, day: string, expense_type: string, amt: string, distinct_txns: bigint, row_id: int]

In [19]:
# Print the schema of the joined table. It still carries distinct_txns and
# row_id from df_top20k_customers.
df_transactions.printSchema()

root
 |-- cust_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- expense_type: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- distinct_txns: long (nullable = false)
 |-- row_id: integer (nullable = true)



In [20]:
# Sanity counts on the skewed table; each value is echoed because the notebook
# displays every top-level expression.
# Number of distinct customer IDs after relabelling.
df_transactions.select("cust_id").distinct().count()
# Number of distinct transaction IDs, for comparison with the total row count below.
df_transactions.select("txn_id").distinct().count()
# Total number of transaction rows.
df_transactions.count()

                                                                                

14468

                                                                                

127125002



127125002

In [21]:
# Remove the helper columns brought in by the join, then write the skewed
# transactions as Parquet, replacing any existing output.
transactions_out = df_transactions.drop("distinct_txns", "row_id")
transactions_out.write.mode("overwrite").parquet("../../data/data_skew/transactions.parquet")

                                                                                

In [22]:
# Read the written Parquet output back in so it can be checked.
transactions_parquet_path = "../../data/data_skew/transactions.parquet"
df_transactions_test = spark.read.parquet(transactions_parquet_path)

In [23]:
# Distinct transaction count per customer in the re-loaded data.
txns_per_customer = (
    df_transactions_test
    .groupBy("cust_id")
    .agg(F.countDistinct("txn_id").alias("ct"))
)

# Show the 20 largest customers, untruncated, to check the skew.
txns_per_customer.orderBy(F.desc("ct")).show(n=20, truncate=False)



+----------+--------+
|cust_id   |ct      |
+----------+--------+
|C0YDPQWPBJ|43551962|
|CXD6UZEGKS|6999    |
|CBHTP5LBPL|6999    |
|CGZ47PFMP4|6999    |
|CQZK7HS7HL|6999    |
|CP2GC38KPG|6999    |
|C1MZ9FNHAN|6999    |
|C20YJKSHJO|6998    |
|CMPRPWHMXL|6998    |
|CDNM9WD0Y5|6998    |
|CJ54IW55MC|6998    |
|C2TVQYBG6G|6998    |
|C5A5O3NG3I|6998    |
|CN3CK5U6KT|6997    |
|CLCAMLZ379|6997    |
|CP0GJFQTVZ|6997    |
|CRGDYXMCR9|6997    |
|CGYD0EQ59P|6997    |
|CJSSKSUHZT|6997    |
|CVXA38F9W9|6996    |
+----------+--------+
only showing top 20 rows





In [24]:
# Stop the Spark session now that the notebook has finished.
spark.stop()