In [1]:
import os
import sys

notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
sys.path.insert(0, project_root)

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window

s_raw_root_path = r'F:\DataSamples\DataSets'
s_spark_file_server_root = r'F:\Spark_Data_Test'

spark = SparkSession.builder \
    .appName('Prep Census Data') \
    .master("local[*]") \
    .config("spark.sql.warehouse.dir", s_spark_file_server_root) \
    .enableHiveSupport() \
    .getOrCreate()


In [7]:
# Load Files
df_census_surname = spark.read \
    .options(header='true', delimiter=',', inferSchema=True) \
    .csv(os.path.join(s_raw_root_path, 'Census_Surnames\\Names_2010Census.csv'))

df_census_surname.orderBy('rank', ascending=True).show(10, truncate=False)

+---------------+----+--------+--------+------------+--------+--------+------+-------+---------+-----------+
|name           |rank|count   |prop100k|cum_prop100k|pctwhite|pctblack|pctapi|pctaian|pct2prace|pcthispanic|
+---------------+----+--------+--------+------------+--------+--------+------+-------+---------+-----------+
|ALL OTHER NAMES|0   |29312001|9936.97 |9936.97     |66.65   |8.53    |7.97  |0.86   |2.32     |13.67      |
|SMITH          |1   |2442977 |828.19  |828.19      |70.9    |23.11   |0.5   |0.89   |2.19     |2.4        |
|JOHNSON        |2   |1932812 |655.24  |1483.42     |58.97   |34.63   |0.54  |0.94   |2.56     |2.36       |
|WILLIAMS       |3   |1625252 |550.97  |2034.39     |45.75   |47.68   |0.46  |0.82   |2.81     |2.49       |
|BROWN          |4   |1437026 |487.16  |2521.56     |57.95   |35.6    |0.51  |0.87   |2.55     |2.52       |
|JONES          |5   |1425470 |483.24  |3004.8      |55.19   |38.48   |0.44  |1      |2.61     |2.29       |
|GARCIA         |6 

In [62]:
# make a upper bound for cumulative distribution function
df_census = df_census_surname.filter(col('name') != 'ALL OTHER NAMES').withColumn('unqiue_rank', row_number().over(Window.orderBy('cum_prop100k')))
df_join = df_census.alias('high').join(df_census.alias('low'), on= col('low.unqiue_rank') == col('high.unqiue_rank') - 1, how='outer')


In [68]:
df_join = df_join.select(
    col('high.name'),col('low.name'),
    coalesce(col('high.name'),lit('OTHER')).alias('name'),
    coalesce(col('high.unqiue_rank'),col('low.unqiue_rank')+1).alias('unqiue_rank'),
    ifnull(col('low.cum_prop100k'), lit(0)).alias('cum_prop100k_low'),
    ifnull(col('high.cum_prop100k'), lit(100_000)).alias('cum_prop100k_high'),
)

df_join.orderBy(col('cum_prop100k_low'), ascending=True
).show(10)
df_join.orderBy(col('cum_prop100k_low'), ascending=False
).show(10)

+---------+---------+---------+-----------+----------------+-----------------+
|     name|     name|     name|unqiue_rank|cum_prop100k_low|cum_prop100k_high|
+---------+---------+---------+-----------+----------------+-----------------+
|    SMITH|     NULL|    SMITH|          1|             0.0|           828.19|
|  JOHNSON|    SMITH|  JOHNSON|          2|          828.19|          1483.42|
| WILLIAMS|  JOHNSON| WILLIAMS|          3|         1483.42|          2034.39|
|    BROWN| WILLIAMS|    BROWN|          4|         2034.39|          2521.56|
|    JONES|    BROWN|    JONES|          5|         2521.56|           3004.8|
|   GARCIA|    JONES|   GARCIA|          6|          3004.8|          3400.12|
|   MILLER|   GARCIA|   MILLER|          7|         3400.12|          3793.86|
|    DAVIS|   MILLER|    DAVIS|          8|         3793.86|          4172.31|
|RODRIGUEZ|    DAVIS|RODRIGUEZ|          9|         4172.31|           4543.5|
| MARTINEZ|RODRIGUEZ| MARTINEZ|         10|         

In [75]:
from src.DataCreator.DataGenerators.PyData import PyData

li_name_lkp = PyData.random_ints(1000, 1, 100_000)
df_new_names = spark.createDataFrame(li_name_lkp, StructType([StructField('name_int', IntegerType(), False)]))

df_new_name_w_str = df_new_names.join(df_join, on= (df_new_names.name_int > df_join.cum_prop100k_low and df_new_names.name_int <= df_join.cum_prop100k_high), how='left')

PySparkTypeError: [CANNOT_ACCEPT_OBJECT_IN_TYPE] `StructType` can not accept object `53430` in type `int`.