In [1]:
import os
import sys

# Make the parent of the notebook's working directory importable so that
# project-local packages can be imported from this notebook.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard the insert: without it, every re-run of this cell would push another
# duplicate entry onto sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window

# Locations used throughout the notebook: raw input data sets and the root
# folder that Spark uses as its warehouse / output directory.
# NOTE(review): both are absolute, machine-specific Windows paths.
s_raw_root_path = r'F:\DataSamples\DataSets'
s_spark_file_server_root = r'F:\Spark_Data_Test'

# Local Spark session (master "local[*]") with Hive support; the SQL warehouse
# directory is pointed at the output root defined above.
spark = (
    SparkSession.builder
    .appName('Prep Census Data')
    .master("local[*]")
    .config("spark.sql.warehouse.dir", s_spark_file_server_root)
    .enableHiveSupport()
    .getOrCreate()
)


In [3]:
# Load the census surname CSV (header row present, comma separated, column
# types inferred by Spark).
census_surname_csv = os.path.join(s_raw_root_path, 'Census_Surnames\\Names_2010Census.csv')
df_census_surname = (
    spark.read
    .options(header='true', delimiter=',', inferSchema=True)
    .csv(census_surname_csv)
)

# Preview the first ten rows by ascending rank, without truncating cell values.
df_census_surname.orderBy('rank').show(10, truncate=False)

+---------------+----+--------+--------+------------+--------+--------+------+-------+---------+-----------+
|name           |rank|count   |prop100k|cum_prop100k|pctwhite|pctblack|pctapi|pctaian|pct2prace|pcthispanic|
+---------------+----+--------+--------+------------+--------+--------+------+-------+---------+-----------+
|ALL OTHER NAMES|0   |29312001|9936.97 |9936.97     |66.65   |8.53    |7.97  |0.86   |2.32     |13.67      |
|SMITH          |1   |2442977 |828.19  |828.19      |70.9    |23.11   |0.5   |0.89   |2.19     |2.4        |
|JOHNSON        |2   |1932812 |655.24  |1483.42     |58.97   |34.63   |0.54  |0.94   |2.56     |2.36       |
|WILLIAMS       |3   |1625252 |550.97  |2034.39     |45.75   |47.68   |0.46  |0.82   |2.81     |2.49       |
|BROWN          |4   |1437026 |487.16  |2521.56     |57.95   |35.6    |0.51  |0.87   |2.55     |2.52       |
|JONES          |5   |1425470 |483.24  |3004.8      |55.19   |38.48   |0.44  |1      |2.61     |2.29       |
|GARCIA         |6 

In [39]:
# Largest value in the `rank` column (bare expression so the notebook shows it).
# F.max is spelled out so the Spark aggregate is not confused with the builtin max.
df_census_surname.agg(F.max('rank')).first()[0]

160975

In [30]:
# Build an integer upper bound per name from the cumulative distribution.
#
# Steps:
#   1. Drop the aggregate 'ALL OTHER NAMES' row.
#   2. Assign a gap-free, unique rank ordered by the cumulative proportion.
#      `name` is used as a tie-breaker so the numbering is deterministic; this
#      matters because df_census is evaluated on both sides of the self-join.
#   3. Scale cum_prop100k by 100 and store it as an integer. The product is
#      rounded before the cast: casting a double to int truncates, and a value
#      such as 0.57 * 100 is 56.99999999999999 in floating point, which would
#      otherwise become 56 instead of 57.
#
# NOTE: the column name 'unqiue_rank' (sic) is kept as-is because later cells
# and the written parquet files reference it.
df_census = (
    df_census_surname
    .filter(col('name') != 'ALL OTHER NAMES')
    .withColumn('unqiue_rank', row_number().over(Window.orderBy('cum_prop100k', 'name')))
    .withColumn('cum_prop100k', F.round(col('cum_prop100k') * 100).cast(IntegerType()))
)

# Self-join each row ('high') to its predecessor by rank ('low'). The full outer
# join keeps the first-ranked row, which has no predecessor, and also yields one
# extra row for the last-ranked name, which has no successor on the 'high' side.
df_join = df_census.alias('high').join(
    df_census.alias('low'),
    on=col('low.unqiue_rank') == col('high.unqiue_rank') - 1,
    how='outer',
)


In [31]:
# Flatten the self-join into one row per name with a lower and an upper bound.
# Sides left empty by the outer join are filled with defaults:
#   - missing 'high' side -> name 'OTHER', rank one past the 'low' rank,
#                            upper bound 10_000_000
#   - missing 'low' side  -> lower bound 0
# NOTE(review): this rebinds df_join to the flattened frame, so the 'high.' /
# 'low.' qualified columns are presumably no longer resolvable afterwards --
# re-run the previous cell before re-running this one.
df_join = df_join.select(
    F.coalesce(F.col('high.name'), F.lit('OTHER')).alias('name'),
    F.coalesce(F.col('high.unqiue_rank'), F.col('low.unqiue_rank') + 1).alias('unqiue_rank'),
    F.coalesce(F.col('low.cum_prop100k'), F.lit(0)).alias('cum_prop100k_low'),
    F.coalesce(F.col('high.cum_prop100k'), F.lit(10_000_000)).alias('cum_prop100k_high'),
)

# Inspect both ends of the range: smallest lower bounds, then largest.
df_join.orderBy(F.col('cum_prop100k_low')).show(10)
df_join.orderBy(F.col('cum_prop100k_low').desc()).show(10)

+---------+-----------+----------------+-----------------+
|     name|unqiue_rank|cum_prop100k_low|cum_prop100k_high|
+---------+-----------+----------------+-----------------+
|    SMITH|          1|               0|            82819|
|  JOHNSON|          2|           82819|           148342|
| WILLIAMS|          3|          148342|           203439|
|    BROWN|          4|          203439|           252156|
|    JONES|          5|          252156|           300480|
|   GARCIA|          6|          300480|           340012|
|   MILLER|          7|          340012|           379386|
|    DAVIS|          8|          379386|           417231|
|RODRIGUEZ|          9|          417231|           454350|
| MARTINEZ|         10|          454350|           490289|
+---------+-----------+----------------+-----------------+
only showing top 10 rows

+-----------+-----------+----------------+-----------------+
|       name|unqiue_rank|cum_prop100k_low|cum_prop100k_high|
+-----------+-----------+-

In [36]:
# Write the bounds table as the surname lookup, overwriting any previous copy.
# Columns are renamed to last_name / profile_lower_bound / profile_upper_bound.
surname_bounds_path = os.path.join(s_spark_file_server_root, 'census_surname_bounds.parquet')
(
    df_join
    .withColumnRenamed('name', 'last_name')
    .withColumnRenamed('cum_prop100k_low', 'profile_lower_bound')
    .withColumnRenamed('cum_prop100k_high', 'profile_upper_bound')
    .write
    .mode('overwrite')
    .parquet(surname_bounds_path)
)
# Alternative kept for reference (managed table instead of a parquet path):
# df_join.write.mode('overwrite').saveAsTable('census_surname_bounds')

In [None]:
# Reload test: read the surname bounds back from disk.
# The path is built from the configured output root rather than repeating the
# absolute location, so this check keeps pointing at the file written under
# s_spark_file_server_root if that root ever changes.
# NOTE(review): despite its name, `df_first_names` holds the *surname* bounds;
# the variable name is kept so any existing references keep working.
df_first_names = spark.read.parquet(
    os.path.join(s_spark_file_server_root, 'census_surname_bounds.parquet')
)
df_first_names.show(10, truncate=False)

# Check count matches on reload. Each count is computed once and reused so the
# printed numbers and the assertion are guaranteed to agree.
n_rows_reloaded = df_first_names.count()
n_rows_expected = df_join.count()
print(n_rows_reloaded)
print(n_rows_expected)
assert n_rows_reloaded == n_rows_expected, (
    f'reloaded row count {n_rows_reloaded} != in-memory row count {n_rows_expected}'
)

+---------+-----------+-------------------+-------------------+
|last_name|unqiue_rank|profile_lower_bound|profile_upper_bound|
+---------+-----------+-------------------+-------------------+
|SMITH    |1          |0                  |82819              |
|JOHNSON  |2          |82819              |148342             |
|WILLIAMS |3          |148342             |203439             |
|BROWN    |4          |203439             |252156             |
|JONES    |5          |252156             |300480             |
|GARCIA   |6          |300480             |340012             |
|MILLER   |7          |340012             |379386             |
|DAVIS    |8          |379386             |417231             |
|RODRIGUEZ|9          |417231             |454350             |
|MARTINEZ |10         |454350             |490289             |
+---------+-----------+-------------------+-------------------+
only showing top 10 rows

162254
162254


In [42]:
# Write the same bounds table again, this time with the name column called
# first_name, to the first-name lookup location (overwriting any previous copy).
# NOTE(review): df_join is derived from the census *surname* file, so this
# stores surname data under a first-name label -- confirm this is intended.
firstname_bounds_path = os.path.join(s_spark_file_server_root, 'census_firstname_bounds.parquet')
(
    df_join
    .withColumnRenamed('name', 'first_name')
    .withColumnRenamed('cum_prop100k_low', 'profile_lower_bound')
    .withColumnRenamed('cum_prop100k_high', 'profile_upper_bound')
    .write
    .mode('overwrite')
    .parquet(firstname_bounds_path)
)

## Test Combine

In [24]:
from src.DataCreator.DataGenerators.PyData import PyData

# Generate random lookup integers with the project helper.
# NOTE(review): arguments are presumably (count, min, max) -- confirm against PyData.
li_name_lkp = PyData.random_ints(1000, 1, 10_000_000)
print(type(li_name_lkp))

# Single non-nullable integer column; each generated value becomes a one-field row.
schema = StructType([StructField('name_int', IntegerType(), False)])
name_int_rows = [(name_int,) for name_int in li_name_lkp]
df_new_names = spark.createDataFrame(name_int_rows, schema)
df_new_names.show(10)


<class 'list'>
+--------+
|name_int|
+--------+
| 5713313|
|  849722|
| 3755173|
| 8112786|
| 1929916|
| 3493837|
| 1589411|
| 8082121|
| 6311311|
| 7878448|
+--------+
only showing top 10 rows



In [25]:
# Attach a name to each random integer via a range join:
# a row matches when cum_prop100k_low < name_int <= cum_prop100k_high.
# Left join, so every generated integer is kept even if no range matches.
is_within_bounds = (
    (df_new_names.name_int > df_join.cum_prop100k_low)
    & (df_new_names.name_int <= df_join.cum_prop100k_high)
)
df_new_name_w_str = df_new_names.join(df_join, on=is_within_bounds, how='left')
df_new_name_w_str.show(10)

+--------+-----------+-----------+----------------+-----------------+
|name_int|       name|unqiue_rank|cum_prop100k_low|cum_prop100k_high|
+--------+-----------+-----------+----------------+-----------------+
| 5713313|  MCFARLANE|       3795|         5713182|          5713498|
|  849722|      WHITE|         24|          830208|           852599|
| 3755173|    ENGLISH|        748|         3754501|          3756074|
| 8112786|STANCHFIELD|      40510|         8112777|          8112795|
| 1929916|     BRYANT|        128|         1924103|          1930639|
| 3493837|     DECKER|        597|         3492033|          3493951|
| 1589411|     CHAVEZ|         83|         1587366|          1595871|
| 8082121|      JOBST|      38877|         8082113|          8082132|
| 6311311|       SENN|       6273|         6311249|          6311434|
| 7878448|     ELISON|      29856|         7878437|          7878463|
+--------+-----------+-----------+----------------+-----------------+
only showing top 10 

In [26]:
# Total number of generated rows next to the number of distinct names assigned.
df_new_name_w_str.select(
    F.count('*'),
    F.countDistinct('name'),
).show(1, truncate=False)

+--------+--------------------+
|count(1)|count(DISTINCT name)|
+--------+--------------------+
|1000    |766                 |
+--------+--------------------+



In [27]:
# Frequency of each assigned name, most common first (top ten shown).
name_frequency = df_new_name_w_str.groupBy('name').agg(F.count('*').alias('name_cnt'))
name_frequency.orderBy(F.col('name_cnt').desc()).show(10, truncate=False)

+---------+--------+
|name     |name_cnt|
+---------+--------+
|OTHER    |82      |
|JOHNSON  |10      |
|SMITH    |9       |
|ANDERSON |9       |
|DAVIS    |8       |
|BROWN    |8       |
|JONES    |6       |
|GARCIA   |6       |
|MOORE    |6       |
|HERNANDEZ|6       |
+---------+--------+
only showing top 10 rows

