In [1]:
import os
import sys

notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
sys.path.insert(0, project_root)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import col, lit

s_raw_root_path = r'F:\DataSamples\DataSets'
s_spark_file_server_root = r'F:\Spark_Data_Test'

spark = SparkSession.builder \
    .appName('Prep Census Data') \
    .master("local[*]") \
    .config("spark.sql.warehouse.dir", s_spark_file_server_root) \
    .enableHiveSupport() \
    .getOrCreate()


In [72]:
# Load Files
df_census_surname = spark.read \
    .options(header='true', delimiter=',', inferSchema=True) \
    .csv(os.path.join(s_raw_root_path, 'Census_Surnames\\Names_2010Census.csv'))

df_census_surname.show(10, truncate=False)
df_census_surname.orderBy('rank', ascending=True).show(10, truncate=False)
df_census_surname.orderBy('rank', ascending=False).show(10, truncate=False)

+---------+----+-------+--------+------------+--------+--------+------+-------+---------+-----------+
|name     |rank|count  |prop100k|cum_prop100k|pctwhite|pctblack|pctapi|pctaian|pct2prace|pcthispanic|
+---------+----+-------+--------+------------+--------+--------+------+-------+---------+-----------+
|SMITH    |1   |2442977|828.19  |828.19      |70.9    |23.11   |0.5   |0.89   |2.19     |2.4        |
|JOHNSON  |2   |1932812|655.24  |1483.42     |58.97   |34.63   |0.54  |0.94   |2.56     |2.36       |
|WILLIAMS |3   |1625252|550.97  |2034.39     |45.75   |47.68   |0.46  |0.82   |2.81     |2.49       |
|BROWN    |4   |1437026|487.16  |2521.56     |57.95   |35.6    |0.51  |0.87   |2.55     |2.52       |
|JONES    |5   |1425470|483.24  |3004.8      |55.19   |38.48   |0.44  |1      |2.61     |2.29       |
|GARCIA   |6   |1166120|395.32  |3400.12     |5.38    |0.45    |1.41  |0.47   |0.26     |92.03      |
|MILLER   |7   |1161437|393.74  |3793.86     |84.11   |10.76   |0.54  |0.66   |1.7

## Understanding the data columns
The
**name**
- The column is truely unique

**rank**
- The columns is not unique, because the rank is a dense_rank where the same values (aka ties) recieve the same value.

**cum_prop100k**
- The max value for the column does not go to 100k or 100,000, because the data only contains count(s) >= 100

**pct%**
- These columns have non-numeric values in the data labelled as "(S)", will need to remove a convert.

In [20]:
df_census_surname.printSchema()

root
 |-- name: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- prop100k: double (nullable = true)
 |-- cum_prop100k: double (nullable = true)
 |-- pctwhite: string (nullable = true)
 |-- pctblack: string (nullable = true)
 |-- pctapi: string (nullable = true)
 |-- pctaian: string (nullable = true)
 |-- pct2prace: string (nullable = true)
 |-- pcthispanic: string (nullable = true)



In [None]:
from pprint import pprint
df_stats = df_census_surname.select(min('rank').alias('min_rank')
                         , max('rank').alias('max_rank')
                         ,min('count').alias('min_count')
                        , max('count').alias('max_count')
                        , count('*').alias('total_count')
                        , countDistinct('rank').alias('distinct_rank')
                         )
dict_stats = df_stats.collect()[0].asDict()
pprint(dict_stats)

In [32]:
# confirm name is distinct 
df_cnt = df_census_surname.count()
name_dist_cnt = df_census_surname.select(df_census_surname.name).distinct().count()
print(f"Distinct Name Count: {name_dist_cnt:,} vs DataFrame Count: {df_cnt:,}")

Distinct Name Count: 162,254 vs DataFrame Count: 162,254


In [None]:
from pyspark.sql.functions import *
# Check if rank column matches the row count

max_rank = df_census_surname.select(max(df_census_surname.rank)).collect()[0][0]
print(f"Max Rank: {max_rank:,} vs DataFrame Count: {df_cnt:,}")

Max Rank: 160,975 vs DataFrame Count: 162,254


In [46]:
df_census_surname.select('rank').distinct().count()

10231

In [71]:
from pyspark.sql import Window

df_census_surname.select('*'
                         , row_number().over(Window.orderBy('rank')).alias('rank_diff')
).show(10)

+---------------+----+--------+--------+------------+--------+--------+------+-------+---------+-----------+---------+
|           name|rank|   count|prop100k|cum_prop100k|pctwhite|pctblack|pctapi|pctaian|pct2prace|pcthispanic|rank_diff|
+---------------+----+--------+--------+------------+--------+--------+------+-------+---------+-----------+---------+
|ALL OTHER NAMES|   0|29312001| 9936.97|     9936.97|   66.65|    8.53|  7.97|   0.86|     2.32|      13.67|        1|
|          SMITH|   1| 2442977|  828.19|      828.19|    70.9|   23.11|   0.5|   0.89|     2.19|        2.4|        2|
|        JOHNSON|   2| 1932812|  655.24|     1483.42|   58.97|   34.63|  0.54|   0.94|     2.56|       2.36|        3|
|       WILLIAMS|   3| 1625252|  550.97|     2034.39|   45.75|   47.68|  0.46|   0.82|     2.81|       2.49|        4|
|          BROWN|   4| 1437026|  487.16|     2521.56|   57.95|    35.6|  0.51|   0.87|     2.55|       2.52|        5|
|          JONES|   5| 1425470|  483.24|      30

In [58]:
# Aggregates on rank
df_census_rank = df_census_surname.groupBy('rank').agg(count('rank').alias('rank_cnt'), 
                                                       min('name').alias('min_name'), 
                                                       max('name').alias('max_name'),
                                                       sum(df_census_surname.prop100k).alias('sum_prop100k'),
                                                       )
df_census_rank.select('*').orderBy('rank_cnt', ascending=False).show(10)

# Second Method
df_census_rank = df_census_surname.groupBy('rank').count().alias('rank_cnt')
df_census_rank.select('*').orderBy('count', ascending=False).show(10)

+------+--------+----------+---------+------------------+
|  rank|rank_cnt|  min_name| max_name|      sum_prop100k|
+------+--------+----------+---------+------------------+
|158432|    1280|ABDUSSAMAD|  ZWANZIG|38.400000000000695|
|160975|    1279|   AARSETH|  ZWEIBEL|38.370000000000694|
|159712|    1263|ABDELDAYEM|    ZURKO|37.890000000000676|
|157234|    1198|    ABAYAN|ZWERNEMAN|  35.9400000000006|
|156044|    1190|      ABAI|ZWERDLING| 47.59999999999916|
|152628|    1141|   ABANGAN|   ZYBERT|45.639999999999205|
|153769|    1138|      ABAL|ZWIERLEIN| 45.51999999999921|
|154907|    1137|    ABDELA|      ZOR| 45.47999999999921|
|151532|    1096|   ABOGADO|    ZULLA| 43.83999999999924|
|147253|    1094|   AALDERS|   ZYDZIK|43.759999999999245|
+------+--------+----------+---------+------------------+
only showing top 10 rows

+------+-----+
|  rank|count|
+------+-----+
|158432| 1280|
|160975| 1279|
|159712| 1263|
|157234| 1198|
|156044| 1190|
|152628| 1141|
|153769| 1138|
|154907| 113

In [63]:
# Looking into the stats of a lower rank / frequency name
df_census_surname.select('*').where(df_census_surname.rank == 158432).orderBy('prop100k').show(10)

+---------+------+-----+--------+------------+--------+--------+------+-------+---------+-----------+
|     name|  rank|count|prop100k|cum_prop100k|pctwhite|pctblack|pctapi|pctaian|pct2prace|pcthispanic|
+---------+------+-----+--------+------------+--------+--------+------+-------+---------+-----------+
|    YIRGA|158432|  102|    0.03|     89932.2|   19.61|   77.45|     0|      0|      (S)|        (S)|
|   YEVOLI|158432|  102|    0.03|    89932.23|   97.06|     (S)|     0|      0|        0|        (S)|
|   YOCIUS|158432|  102|    0.03|    89932.27|   99.02|     (S)|     0|      0|        0|        (S)|
|ZAMBERLAN|158432|  102|    0.03|     89932.3|   93.14|       0|   (S)|      0|      (S)|        (S)|
|     YONK|158432|  102|    0.03|    89932.34|    95.1|     (S)|     0|      0|      (S)|        (S)|
|    ZAKON|158432|  102|    0.03|    89932.37|   97.06|     (S)|   (S)|      0|      (S)|          0|
|   ZAMSKY|158432|  102|    0.03|    89932.41|   99.02|       0|     0|      0|   

In [None]:
# Check if a column contains a null value
df_census_surname.filter(col('pcthispanic').isNull()).count()
df_census_surname.filter(df_census_surname.pcthispanic == '(S)').count()

# Check if a column has a non-numeric value
# Coming in Spark 4.0
# isnumeric_cnt = df_census_surname.filter(try_cast('pctapi', 'double').isNotNull()).count()


# rank the data per "First Letter", by prop100K
# sum the count column , then by "First Letter" and "Rank"

51950

# Quality Check all the data rows are consistent.

In [14]:
rdd_ssa_file = spark.sparkContext.textFile(os.path.join(s_raw_root_path, 'SSA_Names\\yob1880.txt'))
print(rdd_ssa_file.take(5))
print(rdd_ssa_file.count())


['Mary,F,7065', 'Anna,F,2604', 'Emma,F,2003', 'Elizabeth,F,1939', 'Minnie,F,1746']
2000


In [None]:
rdd_ssa_files_all = spark.sparkContext.textFile(os.path.join(s_raw_root_path, 'SSA_Names\\*.txt'))
print(rdd_ssa_files_all.take(5))
# 2,117,219
print(rdd_ssa_files_all.count())

['Mary,F,7065', 'Anna,F,2604', 'Emma,F,2003', 'Elizabeth,F,1939', 'Minnie,F,1746']
2117219


In [18]:
from pprint import pprint

i_bad_lines = 0
i_expected_sep_cnt = 2
ls_bad_lines = []

for row in rdd_ssa_files_all.toLocalIterator():
    i_sep_cnt = row.count(',')
    if i_sep_cnt != i_expected_sep_cnt:
        i_bad_lines += 1
        ls_bad_lines.append(row)

print(f'Bad Line Count: {i_bad_lines}')
pprint(ls_bad_lines[0:5])

Bad Line Count: 0
[]
