In [2]:
import math
from datetime import datetime
from pathlib import Path

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

home = Path.home()
# Wall-clock reference; the elapsed time is printed after the parquet write.
start = datetime.now()

# Local-mode session on all available cores, configured one setting at a time.
_builder = SparkSession.builder.appName('demo1').master("local[*]")
_builder = _builder.config("spark.driver.memory", "8g")
_builder = _builder.config("spark.sql.shuffle.partitions", 128)
_builder = _builder.config("spark.default.parallelism", 16)
spark = _builder.getOrCreate()

In [3]:
# Local working directory, used further down for the written/read-back parquet data.
assessment_dir = "./"
# Root that holds the four input tables. Set this to `assessment_dir` to read
# local copies instead of the S3 bucket.
input_root = "s3a://aws-test-benny/assessment"


def _load_table(name):
    """Read ``<input_root>/<name>/`` as parquet, cache it and expose it to SQL.

    Args:
        name: Directory name under ``input_root``; also used as the temp view name.

    Returns:
        The cached DataFrame.
    """
    table_df = spark.read.parquet(f"{input_root}/{name}/").cache()
    table_df.createOrReplaceTempView(name)
    return table_df


tgu_df = _load_table("target_user_sample")
cid_df = _load_table("cid_mapping")
basic_df = _load_table("basic")
exp_job_df = _load_table("exp_job")

In [4]:
# For each input table print a heading, the first row, the schema and the row
# count, followed by a separator line.
for heading, frame in (
    ("target_user_sample:", tgu_df),
    ("cid_mapping:", cid_df),
    ("basic:", basic_df),
    ("exp_job:", exp_job_df),
):
    print(heading)
    frame.show(1)
    frame.printSchema()
    print(frame.count())
    print("===================")

target_user_sample:
+---------------+-------------+-------------------+------------+------------+-------------+------------+------------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-------------+-------------+-------------+------------+-------------+------------+------------+-------------+-------------+------------+------------+------------+------------+-----------+-------------+------------+-----------+-----------+------------+------------+-----------+------------+------------+-------------+-------------+-------------+------------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-------------+------------+------------+------------+------------+-------------+------------+------------+-------------+-------------+------------+------------+------------+------------+-------------+------------+-------------+-------------+-------------+-------------+-

In [4]:
# Columns printed when rows are inspected by hand: the identifying keys,
# sm / sy, finish_date and two of the z_* columns.
display_fields = (
    "tgu_id",
    "idno",
    "cid",
    "invoice",
    "industry_id",
    "position_id",
    "sm",
    "sy",
    "finish_date",
    "z_D01",
    "z_A01",
)

In [5]:
# Highest column number per prefix letter: z_A01..z_A07, z_I01..z_I08, etc.
dms_ids_c = {"A": 7, "B": 7, "C": 7, "I": 8, "L": 8, "R": 9, "S": 7}
# Number of z_D columns (z_D01..z_D28); these are appended after the others.
dms_p_count = 28

z_fields = []
for prefix, max_num in dms_ids_c.items():
    z_fields.extend(f"z_{prefix}{num:02d}" for num in range(1, max_num + 1))
z_fields.extend(f"z_D{num:02d}" for num in range(1, dms_p_count + 1))
print(f"Length of z fields: {len(z_fields)}")
# Comma-separated form, spliced into the SQL select lists of later cells.
z_fields_str = ", ".join(z_fields)

# Full output column order: key/measure columns first, then every z column.
sample_fields = [
    "tgu_id", "idno", "cid", "invoice", "industry_id", "position_id",
    "sm", "sy", "finish_date",
    *z_fields,
]
print(sample_fields)

Length of z fields: 81
['tgu_id', 'idno', 'cid', 'invoice', 'industry_id', 'position_id', 'sm', 'sy', 'finish_date', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28']


In [6]:
# query = """
#     SELECT idno, count(*) AS count
#     FROM target_user_sample
#     WHERE idno > 0
#     GROUP BY idno
#     ORDER BY idno
# """
# tgu_df = spark.sql(query).cache()
# print(tgu_df.count())
# idnos = tuple(row["idno"] for row in tgu_df.collect())
# print(idnos[:10])

In [7]:
# Select list for exp_job: every column except the join key id_no, with
# invoice cast to INT.
exp_job_fields = [
    f"CAST(exp.{name} AS INT)" if name == "invoice" else f"exp.{name}"
    for name in exp_job_df.columns
    if name != "id_no"
]
exp_str = ", ".join(exp_job_fields)
tgu_z_fields_str = ", ".join([f"tgu.{z_field} AS {z_field}" for z_field in z_fields])

# Join exp_job rows to target_user_sample rows on exp.id_no = tgu.idno and keep
# only rows with a positive ind_cat_no or job_cat_no.
query = f"""
    SELECT
        tgu.tgu_id,
        tgu.idno,
        {exp_str},
        tgu.finish_date,
        {tgu_z_fields_str}
    FROM exp_job AS exp
    INNER JOIN target_user_sample AS tgu ON exp.id_no = tgu.idno
    WHERE tgu.idno IS NOT NULL AND (exp.ind_cat_no > 0 OR exp.job_cat_no > 0)
"""
# Debugging notes kept from earlier runs:
#   alternative filter: WHERE tgu.idno IS NOT NULL AND tgu.idno IN {idnos}
#   ids used for spot checks: tgu.idno = 20000000181924, exp.id_no = 1670005990833,
#   tgu.idno = 1670002970809, tgu.idno = 1673866050855
#   "the same data": idno 1812400542207
exp_tgu_df = spark.sql(query).cache()
print(exp_tgu_df.columns)
print(exp_tgu_df.count())
exp_tgu_df.createOrReplaceTempView("exp_tgu_df")

['tgu_id', 'idno', 'pkey', 'invoice', 'ind_cat_no', 'job_cat_no', 'start_date', 'end_date', 'finish_date', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28']
2467412


In [8]:
# Add basic.update_date to every exp_tgu_df row by inner-joining on
# et.idno = b.id_no. All existing columns are carried over, qualified with the
# `et` alias; update_date is appended as the last column.
exp_tgu_str = ", ".join(f"et.{col}" for col in exp_tgu_df.columns)
# et_z_fields_str = ", ".join(f"et.{z_field} AS {z_field}" for z_field in z_fields)
query = f"""
    SELECT
        {exp_tgu_str},
        b.update_date
    FROM exp_tgu_df AS et
    INNER JOIN basic AS b ON et.idno = b.id_no
"""
exp_tgu_basic_df = spark.sql(query).cache()
# exp_tgu_basic_df.show()
print(exp_tgu_basic_df.columns)
# count() also forces the cached frame to be computed.
print(exp_tgu_basic_df.count())
exp_tgu_basic_df.createOrReplaceTempView("exp_tgu_basic_df")

['tgu_id', 'idno', 'pkey', 'invoice', 'ind_cat_no', 'job_cat_no', 'start_date', 'end_date', 'finish_date', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28', 'update_date']
2467412


In [9]:
# Reduce cid_mapping to one row per invoice: rows are numbered within each
# invoice in ascending cid order and only the first (row_no = 1) is kept.
# NOTE: this rebinds `cid_df`, which until here held the raw cid_mapping frame;
# the result is registered under the separate view name "cid_df".
query = f"""
    SELECT
        cid, invoice, row_no
    FROM (
        SELECT *, ROW_NUMBER() OVER (PARTITION BY invoice ORDER BY cid) AS row_no
        FROM cid_mapping
    ) AS cid
    WHERE row_no = 1
"""
cid_df = spark.sql(query)
cid_df.show(3)
cid_df.createOrReplaceTempView("cid_df")

+-----+-------+------+
|  cid|invoice|row_no|
+-----+-------+------+
|57082| 971770|     1|
|66918|1187025|     1|
|60748|2335002|     1|
+-----+-------+------+
only showing top 3 rows



In [10]:
# Attach a cid to every row by left-joining the one-row-per-invoice cid_df view
# on invoice. The cid becomes 0 when no mapping row matched or when the matched
# mapping row has invoice = 0. The result is registered as "module_cpi_sample".
exp_tgu_basic_str = ", ".join(f"etb.{col} AS {col}" for col in exp_tgu_basic_df.columns)
# etb_z_fields_str = ", ".join(f"etb.{z_field} AS {z_field}" for z_field in z_fields)
# print(etb_z_fields_str)
query = f"""
    SELECT
        {exp_tgu_basic_str},
        CASE WHEN cid.cid IS NULL OR cid.invoice = 0 THEN 0 ELSE cid.cid END AS cid
    FROM exp_tgu_basic_df AS etb
    LEFT JOIN cid_df AS cid ON etb.invoice = cid.invoice
"""
joined_df = spark.sql(query).cache()
# joined_df.where(F.col("invoice") == 0).show()
print(joined_df.columns)
joined_df.createOrReplaceTempView("module_cpi_sample")

['tgu_id', 'idno', 'pkey', 'invoice', 'ind_cat_no', 'job_cat_no', 'start_date', 'end_date', 'finish_date', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28', 'update_date', 'cid']


In [11]:
# Derive the effective end of each experience and its length in months.
#
# Inner query: renames ind_cat_no -> industry_id and job_cat_no -> position_id,
# and sets final_end_date to end_date when start_date is strictly earlier than
# end_date; in every other case (including a NULL comparison) it falls back to
# update_date.
#
# Outer query: sm is the calendar-month difference between final_end_date and
# start_date (12 * year difference + month difference); the day of month is
# not taken into account.
query = f"""
    SELECT
        pkey, tgu_id, idno, cid, invoice, industry_id, position_id, finish_date,
        start_date,
        end_date,
        final_end_date,
        12 * (YEAR(final_end_date) - YEAR(start_date)) + MONTH(final_end_date) - MONTH(start_date) AS sm,
        {z_fields_str}
    FROM (
        SELECT
            pkey, tgu_id, idno, invoice, cid, ind_cat_no AS industry_id, job_cat_no AS position_id, finish_date,
            start_date,
            end_date,
            CASE WHEN TO_UNIX_TIMESTAMP(start_date) < TO_UNIX_TIMESTAMP(end_date)
                THEN end_date ELSE update_date
            END AS final_end_date,
            {z_fields_str}
        FROM module_cpi_sample
    )
"""
end_date_df = spark.sql(query).cache()
# end_date_df.show()
print(end_date_df.columns)
print(end_date_df.count())
end_date_df.createOrReplaceTempView("end_date_df")

['pkey', 'tgu_id', 'idno', 'cid', 'invoice', 'industry_id', 'position_id', 'finish_date', 'start_date', 'end_date', 'final_end_date', 'sm', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28']
2467412


In [12]:
# Cap sm at 750 months; values of 750 or less (including negative ones) pass
# through unchanged. All other columns are copied as they are.
query = f"""
    SELECT
        pkey, tgu_id, idno, invoice, cid, industry_id, position_id, finish_date,
        start_date, end_date, final_end_date,
        CASE WHEN sm > 750 THEN 750 ELSE sm END AS sm,
        {z_fields_str}
    FROM end_date_df
"""
limit_sm_df = spark.sql(query).cache()
# limit_sm_df.show()
print(limit_sm_df.columns)
print(limit_sm_df.count())
limit_sm_df.createOrReplaceTempView("limit_sm_df")

['pkey', 'tgu_id', 'idno', 'invoice', 'cid', 'industry_id', 'position_id', 'finish_date', 'start_date', 'end_date', 'final_end_date', 'sm', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28']
2467412


In [None]:
# Add sy, a log-scaled version of sm:
#   * start_date strictly earlier than end_date:
#       sy = ROUND(2 + LOG2(sm / 3), 10), with sm = 0 replaced by 0.5 so the
#       logarithm is defined (e.g. sm = 12 gives 4.0);
#   * otherwise: the fixed value -9.9999999999.
# The frame is cached but no action is run here, so it is only computed when a
# later cell first uses it.
query = f"""
    SELECT
        pkey, tgu_id, idno, cid, invoice, industry_id, position_id, finish_date,
        start_date, end_date, final_end_date,
        sm,
        CASE WHEN TO_UNIX_TIMESTAMP(start_date) < TO_UNIX_TIMESTAMP(end_date)
            THEN CASE WHEN sm == 0 THEN ROUND(2 + LOG2(0.5/3), 10) ELSE ROUND(2 + LOG2(sm/3), 10) END
            ELSE -9.9999999999
        END AS sy,
        {z_fields_str}
    FROM limit_sm_df
"""
sy_df = spark.sql(query).cache()
# print(sy_df.columns)
# print(sy_df.count())
sy_df.createOrReplaceTempView("sy_df")

In [14]:
# For every pkey keep exactly one row of sy_df: the one whose finish_date lies
# closest to the midpoint between start_date and final_end_date.
#
# The distance alone is not a unique sort key, and ROW_NUMBER() breaks ties
# arbitrarily, so finish_date and tgu_id are added as tie-breakers. This keeps
# the chosen row (and therefore the written parquet output) stable across runs.
# Rows that tie on all three keys can still be picked in either order.
#
# tgu_no is only used for the filter and is not part of the output columns.
query = f"""
    SELECT
        tgu_id, idno, cid, invoice, industry_id, position_id,
        sm, sy,
        finish_date,
        {z_fields_str}
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (
                PARTITION BY pkey
                ORDER BY
                    ABS((TO_UNIX_TIMESTAMP(start_date) + TO_UNIX_TIMESTAMP(final_end_date)) / 2 - TO_UNIX_TIMESTAMP(finish_date)),
                    finish_date,
                    tgu_id
            ) AS tgu_no
        FROM sy_df
    )
    WHERE tgu_no = 1
    ORDER BY tgu_id, idno, cid, invoice, industry_id, position_id, sm, sy
"""
closed_fin_date_tgu_df = spark.sql(query).cache()
# closed_fin_date_tgu_df.show()
print(closed_fin_date_tgu_df.columns)
print(closed_fin_date_tgu_df.count())

['tgu_id', 'idno', 'cid', 'invoice', 'industry_id', 'position_id', 'sm', 'sy', 'finish_date', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28']
2214509


In [15]:
# Persist the result. "overwrite" lets the notebook be re-run from the top:
# without an explicit mode the write fails as soon as the output directory
# exists from a previous run.
closed_fin_date_tgu_df.write.mode("overwrite").parquet(f"{assessment_dir}/sample_data_spark/")
# Elapsed wall-clock time since `start` was taken in the setup cell.
end = datetime.now()
print(f"completed in {end - start}")

completed in 0:03:34.825184


In [None]:
# Every row of sy_df (no per-pkey filtering), sorted so rows with the same
# idno / invoice / cid / industry / position end up next to each other.
# Used for side-by-side inspection against closed_fin_date_tgu_df.
query = """
    SELECT *
    FROM sy_df
    ORDER BY idno, invoice, cid, industry_id, position_id, sm, finish_date
"""
closed_fin_date_tgu_df2 = spark.sql(query).cache()
print(closed_fin_date_tgu_df2.columns)

In [106]:
# Number of rows printed per frame. Both frames hold millions of rows, so they
# are sampled with take() instead of being collected to the driver in full.
preview_rows = 20

# Rows of the unfiltered, sorted sy_df.
fields = display_fields
for row in closed_fin_date_tgu_df2.take(preview_rows):
    print([row[field] for field in fields])
print()


# Rows of the per-pkey result. "tgu_no" is printed only when the frame actually
# has that column: the query that builds closed_fin_date_tgu_df does not select
# it, and indexing a Row by a missing field raises.
fields = [*display_fields]
if "tgu_no" in closed_fin_date_tgu_df.columns:
    fields.append("tgu_no")
for row in closed_fin_date_tgu_df.take(preview_rows):
    print([row[field] for field in fields])


[153554308426626, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2012, 8, 16, 20, 30, 52), 0.7651318809, 0.7863538188]
[153554308426626, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2012, 8, 16, 20, 30, 52), 0.7651318809, 0.7863538188]
[153554308426626, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2012, 8, 16, 20, 30, 52), 0.7651318809, 0.7863538188]
[153596007627406, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2018, 9, 1, 13, 57, 33), 0.7651318809, 0.7952734988]
[153596007627406, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2018, 9, 1, 13, 57, 33), 0.7651318809, 0.7952734988]
[153596007627406, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2018, 9, 1, 13, 57, 33), 0.7651318809, 0.7952734988]
[153554308426626, 1673866050855, 0, 0, 1001001002, 2007002006, 35, 5.5443205162, date

In [17]:
# Load the stored module_cpi_sample parquet data, drop its mdl_id column and
# remove duplicate rows.
# NOTE: this re-registers the temp view name "module_cpi_sample", which the
# join cell above also registers from joined_df. After this cell the name
# refers to the parquet data.
module_cpi_sample_df = (
    spark.read.parquet(f"{assessment_dir}/module_cpi_sample")
    .drop("mdl_id")
    .dropDuplicates()
    .cache()
)
module_cpi_sample_df.createOrReplaceTempView("module_cpi_sample")

In [16]:
# Read back the parquet output written by this notebook and expose it to SQL.
sample_data_spark_df = spark.read.parquet(f"{assessment_dir}/sample_data_spark").cache()
sample_data_spark_df.createOrReplaceTempView("sample_data_spark")

# Sort it so that all rows of one idno are adjacent, which makes the manual
# row-by-row inspection below readable.
query = f"""
    SELECT *
    FROM sample_data_spark
    ORDER BY idno, tgu_id, cid, invoice, industry_id, position_id, sm, sy
"""
sample_data_df = spark.sql(query).cache()

In [18]:
# Restrict the sorted sample to one hard-coded idno for manual inspection.
tmp_df = sample_data_df.where(F.col("idno") == 1673866050855)

In [19]:
# Print the preview columns of every row in the single-idno slice.
for record in tmp_df.collect():
    print([record[name] for name in display_fields])

[153554308426626, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2012, 8, 16, 20, 30, 52), 0.7651318809, 0.7863538188]
[153596007627406, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2018, 9, 1, 13, 57, 33), 0.7651318809, 0.7952734988]
[153596007627406, 1673866050855, 0, 0, 1001001002, 2007001004, 11, 3.8744691179, datetime.datetime(2018, 9, 1, 13, 57, 33), 0.7651318809, 0.7952734988]
[153596007627406, 1673866050855, 0, 0, 1001001002, 2007002006, 35, 5.5443205162, datetime.datetime(2018, 9, 1, 13, 57, 33), 0.7651318809, 0.7952734988]


In [17]:
# Print the preview columns of the first ten rows of the sorted sample.
for record in sample_data_df.take(10):
    print([record[name] for name in display_fields])

[153554728684162, 1670001380813, 0, 0, 1005003003, 2002001008, 28, 5.2223924213, datetime.datetime(2015, 10, 12, 16, 6, 24), -0.09196229, 0.5524553681]
[153554728684162, 1670001380813, 0, 0, 1007001001, 2016002019, 9, 3.5849625007, datetime.datetime(2015, 10, 12, 16, 6, 24), -0.09196229, 0.5524553681]
[153554728684162, 1670001380813, 0, 0, 1007001003, 2006002008, 2, 1.4150374993, datetime.datetime(2015, 10, 12, 16, 6, 24), -0.09196229, 0.5524553681]
[153561583918705, 1670001600861, 0, 0, 1006003001, 2004002003, 12, 4.0, datetime.datetime(2014, 7, 16, 13, 17, 40), 0.3365847955, 1.1520673131]
[153561583918705, 1670001600861, 0, 0, 1006003001, 2013002015, 10, 3.7369655942, datetime.datetime(2014, 7, 16, 13, 17, 40), 0.3365847955, 1.1520673131]
[153561583918705, 1670001600861, 0, 46005176, 1010001004, 2004001005, 7, 3.2223924213, datetime.datetime(2014, 7, 16, 13, 17, 40), 0.3365847955, 1.1520673131]
[153561583918705, 1670001600861, 463, 22555003, 1003002016, 2005002004, 4, 2.4150374993, d

In [30]:
# Use the column order of the written sample so that both frames line up
# column by column.
sample_fields = sample_data_df.columns
unique_keys = ("tgu_id", "idno", "cid", "invoice", "industry_id", "position_id")

# Same columns, read from the module_cpi_sample view and sorted by the same
# keys that closed_fin_date_tgu_df is sorted by.
_select_list = ", ".join(sample_fields)
query = f"""
    SELECT {_select_list}
    FROM module_cpi_sample
    ORDER BY tgu_id, idno, cid, invoice, industry_id, position_id, sm, sy
"""
label_sample_df = spark.sql(query).cache()
print(sample_data_df.columns)
print(label_sample_df.count())

['tgu_id', 'idno', 'cid', 'invoice', 'industry_id', 'position_id', 'sm', 'sy', 'finish_date', 'z_A01', 'z_A02', 'z_A03', 'z_A04', 'z_A05', 'z_A06', 'z_A07', 'z_B01', 'z_B02', 'z_B03', 'z_B04', 'z_B05', 'z_B06', 'z_B07', 'z_C01', 'z_C02', 'z_C03', 'z_C04', 'z_C05', 'z_C06', 'z_C07', 'z_I01', 'z_I02', 'z_I03', 'z_I04', 'z_I05', 'z_I06', 'z_I07', 'z_I08', 'z_L01', 'z_L02', 'z_L03', 'z_L04', 'z_L05', 'z_L06', 'z_L07', 'z_L08', 'z_R01', 'z_R02', 'z_R03', 'z_R04', 'z_R05', 'z_R06', 'z_R07', 'z_R08', 'z_R09', 'z_S01', 'z_S02', 'z_S03', 'z_S04', 'z_S05', 'z_S06', 'z_S07', 'z_D01', 'z_D02', 'z_D03', 'z_D04', 'z_D05', 'z_D06', 'z_D07', 'z_D08', 'z_D09', 'z_D10', 'z_D11', 'z_D12', 'z_D13', 'z_D14', 'z_D15', 'z_D16', 'z_D17', 'z_D18', 'z_D19', 'z_D20', 'z_D21', 'z_D22', 'z_D23', 'z_D24', 'z_D25', 'z_D26', 'z_D27', 'z_D28']
2201652


In [19]:
def check_same(row1, row2):
    """Tell whether two rows agree on the identifying keys and on sm / sy.

    Args:
        row1: First row; any object indexable by field name.
        row2: Second row, indexed the same way.

    Returns:
        True when tgu_id, idno, cid, invoice, industry_id, position_id, sm and
        sy are all equal in both rows, otherwise False. On a match the preview
        columns (``display_fields``) of both rows are printed as a side effect.
    """
    key_fields = ("tgu_id", "idno", "cid", "invoice", "industry_id", "position_id")
    if any(row1[name] != row2[name] for name in key_fields):
        return False

    if row1["sm"] != row2["sm"] or row1["sy"] != row2["sy"]:
        return False

    print("True:")
    print([row1[field] for field in display_fields])
    print([row2[field] for field in display_fields])
    print("------------")
    return True

In [None]:
# Walk the computed result (part_row1) and the module_cpi_sample rows
# (part_row2) in parallel and report the first differing preview field of each
# mismatching pair. Only the first `take_num` rows of each frame are compared.
take_num = 13000
part_row1 = closed_fin_date_tgu_df.take(take_num)
part_row2 = label_sample_df.take(take_num)
# Hard-coded starting offsets into the two lists.
# NOTE(review): presumably the resume point of an earlier manual run (idx2 is
# one ahead of idx1) — confirm, or reset both to 0 for a full comparison.
idx1 = 12028
idx2 = 12029
while idx1 != len(part_row1) and idx2 != len(part_row2):
    # The current part_row2 row repeats the previous one (same keys, sm and sy
    # per check_same): step over it on the part_row2 side only.
    if idx2 != 0 and check_same(part_row2[idx2-1], part_row2[idx2]):
        print(idx1, idx2)
        idx2 += 1
        continue

    for field in display_fields:
        # A differing value counts as a mismatch, except that for "sm" the
        # computed value may be exactly one less than the part_row2 value.
        if (
            part_row1[idx1][field] != part_row2[idx2][field]
            and (field != "sm" or field == "sm" and part_row1[idx1][field] != part_row2[idx2][field] - 1)
        ):
            print("Detect different!!, idx1 -= 1")
            print(f"idx1: {idx1}, idx2: {idx2}, field: {field}, row1: {part_row1[idx1][field]}, row2: {part_row2[idx2][field]}")
            print([part_row1[idx1][field] for field in display_fields])
            print([part_row2[idx2][field] for field in display_fields])
            print("===========")
            # Special cases for specific hard-coded rows: idx1 is decremented
            # here so that, after the increments below, only idx2 has advanced.
            # `and` binds tighter than `or`, so the sy check applies only to
            # idno 1683482920833.
            if (
                part_row2[idx2]["idno"] == 20000000861677
                or part_row2[idx2]["idno"] == 1683482920833 and part_row2[idx2]["sy"] == 4.5025003405
#                 or part_row2[idx2]["idno"] == 1732895263409 and part_row2[idx2]["sy"] == 3.4150374993
            ):
                idx1 -= 1
            # Only the first differing field of a row pair is reported.
            break
    idx1 += 1
    idx2 += 1