In [None]:
# AWS cluster configuration for below
# Master node: r5.2x large
# core node : m4.4x large 
# Tasknode node: m5.4x large 

In [2]:
%%configure -f
{"conf":{"spark.driver.maxResultSize":"15G",
         "spark.executor.memory": "30G",
         "spark.executor.cores": "16",
         "spark.driver.memory": "25G",
         "spark.executor.memoryOverhead":"22G",
         "spark.dynamicAllocation.enabled":"true",
         "spark.dynamicAllocation.minExecutors":"20",
         "spark.shuffle.service.enabled":"true",
         "spark.network.timeout":"1000000",
         "spark.sql.shuffle.partitions":"3001",
         "spark.pyspark.virtualenv.enabled": "true",
         "spark.pyspark.python": "python3",
         "spark.pyspark.virtualenv.type": "native",
         "spark.pyspark.virtualenv.bin.path": "/usr/bin/virtualenv"},
"kind": "pyspark"}

UsageError: Cell magic `%%configure` not found.


In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>div.jp-OutputArea-output pre {white-space: pre;}</style>"))

  from IPython.core.display import display, HTML


In [9]:
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import re 
import pyspark.sql.functions as F
from datetime import datetime

In [10]:
## Building a spark session
spark = SparkSession.builder \
.enableHiveSupport()\
.getOrCreate()

### Adult Dataset

In [11]:
### Listing all the columns and passing numerical columns into numerical and categorical columns into categorical
columns = ['age','fnlwgt','workclass','education','education_num','marital_status','occupation','relationship',
    'race','sex','capital_gain','capital_loss','hours_per_week','native_country']

numerical = set(('age','fnlwgt','education_num','capital_gain','capital_loss','hours_per_week'))
categorical = set(('workclass','education','marital_status','occupation','relationship','sex','native_country','race'))


In [12]:
###### Defining the schema for all the variables
schema = StructType([StructField('age', IntegerType(), True),
                     StructField('workclass', StringType(), True),
                     StructField('fnlwgt', LongType(), True),
                     StructField('education', StringType(), True),
                     StructField('education_num', IntegerType(), True),
                     StructField('marital_status', StringType(), True),
                     StructField('occupation', StringType(), True),
                     StructField('relationship', StringType(), True),
                     StructField('race', StringType(), True),
                     StructField('sex', StringType(), True),
                     StructField('capital_gain', LongType(), True),
                     StructField('capital_loss', LongType(), True),
                     StructField('hours_per_week', LongType(), True),
                     StructField('native_country', StringType(), True),
                     ])

### Reading the csv file from s3 bucket ####
input_df = spark.read.option("delimiter", ",").schema(schema)\
            .csv("data_files/adult.csv")\
            .where((F.col('occupation')!='?') & (F.col('native_country')!='?')).limit(100)

In [13]:
###### creating the row number for the data 
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window
w = Window().orderBy(lit('A'))
input_df = input_df.withColumn("id", row_number().over(w))

input_df.registerTempTable("input_df")

In [14]:
input_df.show(10,False)

+---+----------------+------+---------+-------------+---------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+---+
|age|workclass       |fnlwgt|education|education_num|marital_status       |occupation       |relationship |race |sex   |capital_gain|capital_loss|hours_per_week|native_country|id |
+---+----------------+------+---------+-------------+---------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+---+
|39 |State-gov       |77516 |Bachelors|13           |Never-married        |Adm-clerical     |Not-in-family|White|Male  |2174        |0           |40            |United-States |1  |
|50 |Self-emp-not-inc|83311 |Bachelors|13           |Married-civ-spouse   |Exec-managerial  |Husband      |White|Male  |0           |0           |13            |United-States |2  |
|38 |Private         |215646|HS-grad  |9            |Divorced             |Handlers-cleaners|No

In [15]:
input_df.count()

100

In [16]:
# Finding the col span values for numerical features is diff of max and min and for categorical is distinct of column.
def get_span(numcols, catcols, table_name):
    query1=""
    query1+="""SELECT 'x' partition_id,
    tbl.id"""
    
    for col in numcols:
        query1+=""",
        tbl.{0},
        inner_query_{0}.{0}_span
        """.format(col)
        
    for col in catcols:
        query1+=""",
        tbl.{0},
        inner_query_{0}.{0}_span
        """.format(col)
    
    query1+="from {0} tbl".format(table_name)
    
    for col in numcols:
        query1+="""
        inner join
        (SELECT max({0})-min({0}) {0}_span
        FROM {1}) inner_query_{0}
        ON 1=1
        """.format(col,table_name)
        
    for col in catcols:
        query1+="""
        inner join
        (select 
        COUNT(DISTINCT {0})  {0}_span
        FROM {1}) inner_query_{0}
        on 1=1
        """.format(col,table_name)
    return query1

In [17]:
#### calling the get_span function by passing numerical, categorical and input table
span_df = spark.sql(get_span(numerical,categorical,'input_df'))

In [18]:
#### Writing into parquet file #####
span_df.write.mode('overwrite').parquet('output_files/k_anon/fourteen_feat_30k/0')


In [19]:
######## Implementing the udf where categorical features length were divided by 2 and registering the udf 
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
def func1(arr,flag):
    arr_mid = len(arr)//2 
    la = arr[:arr_mid]
    ra = arr[arr_mid:]
    if flag == '1':
        return la
    elif flag == '2':
        return ra
    
spark.udf.register("func_udf",func1,ArrayType(StringType()))

<function __main__.func1(arr, flag)>

In [20]:
#Finding the median value for all numerical columns and for categorical calling the udf and aggregating all the elements
def get_med_query(numcols,catcols,table_name):
    query1=""
    query1+="""SELECT tbl.partition_id,
    """
    for col in numcols:
        query1+="""
        max(tbl.{0}_span) {0}_span,
        max(tbl.{0})-min(tbl.{0}) {0}_span_part,
        max(inner_qr_{0}.{0}_med) {0}_med,
        sum(case when tbl.{0} < inner_qr_{0}.{0}_med then 1 else 0 end) {0}_left_cnt,
        sum(case when tbl.{0} >= inner_qr_{0}.{0}_med then 1 else 0 end) {0}_right_cnt,
        """.format(col)
        
    for col in catcols:
        query1+="""
        max(tbl.{0}_span) {0}_span,
        count(distinct tbl.{0}) {0}_span_part,
        max({0}_inner_q.{0}_arr_l) {0}_arr_l,
        max({0}_inner_q.{0}_arr_r) {0}_arr_r,
        sum(case when array_contains({0}_inner_q.{0}_arr_l,tbl.{0}) then 1 else 0 end) {0}_left_cnt,
        sum(case when array_contains({0}_inner_q.{0}_arr_r,tbl.{0}) then 1 else 0 end) {0}_right_cnt,
        """.format(col)

    query1+="""
    count(*) row_cnt 
    from {0} tbl
    """.format(table_name)
    
    for col in numcols:
        query1+="""
        inner join 
        (select 
        partition_id,
        percentile_approx({0}, 0.5) {0}_med
        from {1}
        group by
        partition_id) inner_qr_{0}
        on tbl.partition_id = inner_qr_{0}.partition_id
        """.format(col,table_name)
        
    for col in catcols:
        query1+="""
        inner join
          (select
          inner_tbl.partition_id,
          func_udf(collect_set(inner_tbl.{0}),'1') {0}_arr_l,
          func_udf(collect_set(inner_tbl.{0}),'2') {0}_arr_r
          from {1} inner_tbl
          group by inner_tbl.partition_id) {0}_inner_q
        on tbl.partition_id = {0}_inner_q.partition_id
        """.format(col,table_name)

    query1+="""
    group by 
    tbl.partition_id
    """
    return query1


In [21]:
##### Finding the span value and checking the k_anonymous criteria
def split_col_query(cols, table_name):
    query1 = ""
    query1 += """
    SELECT main_tbl.partition_id,
    main_tbl.split_col
    from (select
    inner_tbl.partition_id,
    inner_tbl.split_col,
    row_number() over (partition by inner_tbl.partition_id order by inner_tbl.span desc, inner_tbl.split_col) row_num 
    from(
    """
    for col_ind in range(len(cols) - 1):
        query1 += """
        select 
        partition_id, 
        '{0}' split_col, 
        {0}_span_part/{0}_span span 
        from {1}
        where {0}_left_cnt >= 3
        and {0}_right_cnt >= 3
        and {0}_span > 0
        union all
        """.format(cols[col_ind], table_name)
        
    query1 += """
    select 
    partition_id, 
    '{0}' split_col, 
    {0}_span_part/{0}_span span 
    from {1}
    where {0}_left_cnt >= 3
    and {0}_right_cnt >= 3
    and {0}_span > 0
    """.format(cols[-1], table_name)
    query1 += """
    ) inner_tbl
    ) main_tbl
    where main_tbl.row_num = 1
    """
    return query1

In [22]:
''' Implemented another query for numerical the 'l' and 'r' will be appended when ever it is less than or greater 
value and for categorical it check whether the value contains in left list or right list and appended with those values ''' 

def final_query(numcols,catcols,main_tbl,step1_tbl,step2_tbl):
    query1 = ""
    query1 += """
    select
    case 
    when step2_tbl.split_col is null then main_tbl.partition_id
    """
    for col in numcols:
        query1 += """
        when step2_tbl.split_col = '{0}' and main_tbl.{0} < step1_tbl.{0}_med then main_tbl.partition_id||'l'
        when step2_tbl.split_col = '{0}' and main_tbl.{0} >= step1_tbl.{0}_med then main_tbl.partition_id||'r'
        """.format(col)
        
    for col in catcols:
        query1 += """
        when step2_tbl.split_col = '{0}' and array_contains(step1_tbl.{0}_arr_l,{0}) then main_tbl.partition_id||'l'
        when step2_tbl.split_col = '{0}' and array_contains(step1_tbl.{0}_arr_r,{0}) then main_tbl.partition_id||'r'
        """.format(col)
        
    query1 += """
    else 'unknown'
    end partition_id,
    step2_tbl.split_col,
    """
    for col in numcols:
        query1 += """
        step1_tbl.{0}_med,
        main_tbl.{0},
        main_tbl.{0}_span,
        """.format(col)
        
    for col in catcols:
        query1 += """
        step1_tbl.{0}_arr_l,
        step1_tbl.{0}_arr_r,
        main_tbl.{0},
        main_tbl.{0}_span,
        """.format(col)
        
    query1 += """
    main_tbl.id
    from {0} main_tbl
      inner join {1} step1_tbl
      on main_tbl.partition_id = step1_tbl.partition_id
      left outer join {2} step2_tbl
      on step1_tbl.partition_id = step2_tbl.partition_id
    """.format(main_tbl, step1_tbl, step2_tbl)
    return query1

In [23]:
### Iterating the loop and running all the queries by calling those functions.
for loop in range(0,100):
    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")+"|loop: "+str(loop))
    loop_input_df=spark.read.parquet("output_files/k_anon/fourteen_feat_30k/"+str(loop))
    loop_input_df.registerTempTable("loop_input_df")

    med_query = get_med_query(numerical,categorical,'loop_input_df')
    #print(med_query)
    loop_step1_df = spark.sql(med_query)
    #loop_step1_df.show(5,False)
    loop_step1_df.registerTempTable("loop_step1_df")

    split_query = split_col_query(columns,'loop_step1_df')
    #print(split_query)
    loop_step2_df = spark.sql(split_query)
    #loop_step2_df.show(5,False)
    loop_step2_df.registerTempTable("loop_step2_df")

    split_cnt=loop_step2_df.count()
    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")+"|split_cnt: "+str(split_cnt))

    if split_cnt==0:
        break

    final_1 = final_query(numerical,categorical,'loop_input_df', 'loop_step1_df', 'loop_step2_df')
    #print(final_1)
    loop_step3_df = spark.sql(final_1)
    #loop_step3_df.show(5,False)
   
    loop_step3_df.write.parquet("output_files/k_anon/fourteen_feat_30k/"+str(loop+1), mode="overwrite")
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")+"|after write")


2023-09-23 11:26:45.471627|loop: 0


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 41626)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/opt/conda/lib/python3.

Py4JError: An error occurred while calling o102.count

#### Partitioning output

In [None]:
2021-09-28 14:19:07.689102|loop: 0
2021-09-28 14:21:04.419906|split_cnt: 1
2021-09-28 14:23:32.522937|loop: 1
2021-09-28 14:25:32.770516|split_cnt: 2
2021-09-28 14:28:14.736992|loop: 2
2021-09-28 14:30:14.378372|split_cnt: 4
2021-09-28 14:32:59.712398|loop: 3
2021-09-28 14:35:07.356943|split_cnt: 8
2021-09-28 14:37:57.538933|loop: 4
2021-09-28 14:39:56.688526|split_cnt: 16
2021-09-28 14:42:43.300846|loop: 5
2021-09-28 14:44:51.562501|split_cnt: 32
2021-09-28 14:47:40.254535|loop: 6
2021-09-28 14:49:46.864529|split_cnt: 62
2021-09-28 14:52:13.779699|loop: 7
2021-09-28 14:54:06.315054|split_cnt: 116
2021-09-28 14:56:42.971604|loop: 8
2021-09-28 14:58:44.119904|split_cnt: 197
2021-09-28 15:01:21.051039|loop: 9
2021-09-28 15:03:21.422372|split_cnt: 292
2021-09-28 15:05:47.163267|loop: 10
2021-09-28 15:07:57.923257|split_cnt: 424
2021-09-28 15:10:39.911727|loop: 11
2021-09-28 15:12:44.807146|split_cnt: 565
2021-09-28 15:15:25.759416|loop: 12
2021-09-28 15:17:37.300752|split_cnt: 651
2021-09-28 15:20:17.189181|loop: 13
2021-09-28 15:22:32.320687|split_cnt: 713
2021-09-28 15:25:27.629532|loop: 14
2021-09-28 15:27:46.214194|split_cnt: 773
2021-09-28 15:30:42.341847|loop: 15
2021-09-28 15:33:05.059612|split_cnt: 775
2021-09-28 15:36:16.693243|loop: 16
2021-09-28 15:38:39.714015|split_cnt: 704
2021-09-28 15:42:43.711305|loop: 17
2021-09-28 15:45:26.249777|split_cnt: 616
2021-09-28 15:48:48.732298|loop: 18
2021-09-28 15:51:24.338337|split_cnt: 556
2021-09-28 15:54:44.484278|loop: 19
2021-09-28 15:57:15.783963|split_cnt: 408
2021-09-28 16:00:37.878123|loop: 20
2021-09-28 16:03:20.209884|split_cnt: 292
2021-09-28 16:07:24.885998|loop: 21
2021-09-28 16:10:20.526879|split_cnt: 133
2021-09-28 16:14:06.677973|loop: 22
2021-09-28 16:16:58.706909|split_cnt: 63
2021-09-28 16:20:27.534253|loop: 23
2021-09-28 16:23:12.840285|split_cnt: 12
2021-09-28 16:26:50.206500|loop: 24
2021-09-28 16:29:32.562561|split_cnt: 0
2021-09-28 16:29:32.562686|after write

In [None]:
### Summary: 2 hours 10 minutes for all 14 cols

#### 14 cols partition file

In [None]:
#### writing down the last loop number in the parquet file ####
partition_fourteen_cols = spark.read.parquet('output_files/k_anon/fourteen_feat_30k//24')


#### Final k_anonymised file logic

In [None]:
''' Grouping by partition id and finding the mean for numerical columns and
                    finding the aggregation of all the categorical elements '''
from pyspark.sql import functions as F
k_anon_agg_cols = partition_fourteen_cols.groupBy("partition_id")\
            .agg(F.mean('age').alias('age_k_anon'),F.mean('fnlwgt').alias('fnlwgt_k_anon'),\
            F.mean('education_num').alias('education_num_k_anon'),F.mean('capital_gain').alias('capital_gain_k_anon'),\
            F.mean('capital_loss').alias('capital_loss_k_anon'),F.mean('hours_per_week').alias('hours_per_week_k_anon'),\
            F.collect_set('workclass').alias('workclass_k_anon'),F.collect_set('education').alias('education_k_anon'),\
            F.collect_set('marital_status').alias('marital_status_k_anon'),F.collect_set('occupation').alias('occupation_k_anon'),\
            F.collect_set('relationship').alias('relationship_k_anon'),F.collect_set('sex').alias('sex_k_anon'),\
            F.collect_set('native_country').alias('native_country_k_anon'),F.collect_set('race').alias('race_k_anon'))\
.withColumn('age_k_anon',F.round(F.col('age_k_anon'),2)).withColumn('fnlwgt_k_anon',F.round(F.col('fnlwgt_k_anon'),2))\
.withColumn('education_num_k_anon',F.round(F.col('education_num_k_anon'),2)).withColumn('capital_gain_k_anon',F.round(F.col('capital_gain_k_anon'),2))\
.withColumn('capital_loss_k_anon',F.round(F.col('capital_loss_k_anon'),2)).withColumn('hours_per_week_k_anon',F.round(F.col('hours_per_week_k_anon'),2))

# Joining with partition id by joining the above aggregated columns and partition dataframe ##
k_anon_fourteen_cols = k_anon_agg_cols.join(partition_fourteen_cols,['partition_id'],how='inner').orderBy('partition_id')\
    .select('partition_id','age','fnlwgt','workclass','education','education_num','marital_status','occupation','relationship',\
            'race','sex','capital_gain','capital_loss','hours_per_week','native_country','age_k_anon','fnlwgt_k_anon',\
            'workclass_k_anon','education_k_anon','education_num_k_anon','marital_status_k_anon','occupation_k_anon',\
            'relationship_k_anon','race_k_anon','sex_k_anon','capital_gain_k_anon','capital_loss_k_anon',\
            'hours_per_week_k_anon','native_country_k_anon')


In [None]:
#k_anon_fourteen_cols.write.mode('overwrite').parquet("s3://oneid-datascience-us-east-1/Adam/data/ayyappa/k_anon/k_anonymised_14_cols")

k_anon_cols_14 = spark.read.parquet("output_files/k_anon/k_anonymised_14_cols")

#### Final output

In [None]:
+----------------+---+------+----------------+------------+-------------+------------------+-----------------+------------+------------------+----+------------+------------+--------------+--------------+----------+-------------+-------------------------------+-----------------------+--------------------+---------------------+--------------------------------------------------+-------------------+--------------------+----------+-------------------+-------------------+---------------------+----------------------------+
|partition_id    |age|fnlwgt|workclass       |education   |education_num|marital_status    |occupation       |relationship|race              |sex |capital_gain|capital_loss|hours_per_week|native_country|age_k_anon|fnlwgt_k_anon|workclass_k_anon               |education_k_anon       |education_num_k_anon|marital_status_k_anon|occupation_k_anon                                 |relationship_k_anon|race_k_anon         |sex_k_anon|capital_gain_k_anon|capital_loss_k_anon|hours_per_week_k_anon|native_country_k_anon       |
+----------------+---+------+----------------+------------+-------------+------------------+-----------------+------------+------------------+----+------------+------------+--------------+--------------+----------+-------------+-------------------------------+-----------------------+--------------------+---------------------+--------------------------------------------------+-------------------+--------------------+----------+-------------------+-------------------+---------------------+----------------------------+
|xrrrlrlrrllllrrl|47 |340982|Private         |Some-college|10           |Married-civ-spouse|Machine-op-inspct|Husband     |Asian-Pac-Islander|Male|3103        |0           |40            |Philippines   |47.67     |172878.33    |[Private]                      |[Some-college]         |10.0                |[Married-civ-spouse] |[Machine-op-inspct, Craft-repair, Exec-managerial]|[Husband]          |[Asian-Pac-Islander]|[Male]    |1034.33            |0.0                |40.0                 |[Philippines, United-States]|
|xrrrlrlrrllllrrl|47 |95680 |Private         |Some-college|10           |Married-civ-spouse|Exec-managerial  |Husband     |Asian-Pac-Islander|Male|0           |0           |40            |United-States |47.67     |172878.33    |[Private]                      |[Some-college]         |10.0                |[Married-civ-spouse] |[Machine-op-inspct, Craft-repair, Exec-managerial]|[Husband]          |[Asian-Pac-Islander]|[Male]    |1034.33            |0.0                |40.0                 |[Philippines, United-States]|
|xrrrlrlrrllllrrl|49 |81973 |Private         |Some-college|10           |Married-civ-spouse|Craft-repair     |Husband     |Asian-Pac-Islander|Male|0           |0           |40            |United-States |47.67     |172878.33    |[Private]                      |[Some-college]         |10.0                |[Married-civ-spouse] |[Machine-op-inspct, Craft-repair, Exec-managerial]|[Husband]          |[Asian-Pac-Islander]|[Male]    |1034.33            |0.0                |40.0                 |[Philippines, United-States]|
|xrrrlrlrrllllrrr|59 |81973 |Private         |Masters     |14           |Married-civ-spouse|Exec-managerial  |Husband     |Asian-Pac-Islander|Male|0           |0           |40            |United-States |55.0      |120224.25    |[Private]                      |[Masters, Prof-school] |14.25               |[Married-civ-spouse] |[Craft-repair, Exec-managerial]                   |[Husband]          |[Asian-Pac-Islander]|[Male]    |1824.5             |475.5              |41.25                |[Philippines, United-States]|
|xrrrlrlrrllllrrr|54 |139850|Private         |Masters     |14           |Married-civ-spouse|Exec-managerial  |Husband     |Asian-Pac-Islander|Male|0           |0           |45            |United-States |55.0      |120224.25    |[Private]                      |[Masters, Prof-school] |14.25               |[Married-civ-spouse] |[Craft-repair, Exec-managerial]                   |[Husband]          |[Asian-Pac-Islander]|[Male]    |1824.5             |475.5              |41.25                |[Philippines, United-States]|
|xrrrlrlrrllllrrr|50 |160724|Private         |Masters     |14           |Married-civ-spouse|Exec-managerial  |Husband     |Asian-Pac-Islander|Male|7298        |0           |40            |Philippines   |55.0      |120224.25    |[Private]                      |[Masters, Prof-school] |14.25               |[Married-civ-spouse] |[Craft-repair, Exec-managerial]                   |[Husband]          |[Asian-Pac-Islander]|[Male]    |1824.5             |475.5              |41.25                |[Philippines, United-States]|
|xrrrlrlrrllllrrr|57 |98350 |Private         |Prof-school |15           |Married-civ-spouse|Craft-repair     |Husband     |Asian-Pac-Islander|Male|0           |1902        |40            |Philippines   |55.0      |120224.25    |[Private]                      |[Masters, Prof-school] |14.25               |[Married-civ-spouse] |[Craft-repair, Exec-managerial]                   |[Husband]          |[Asian-Pac-Islander]|[Male]    |1824.5             |475.5              |41.25                |[Philippines, United-States]|
|xrrrlrlrrlllr   |48 |82098 |Self-emp-not-inc|Some-college|10           |Married-civ-spouse|Exec-managerial  |Husband     |Asian-Pac-Islander|Male|0           |0           |65            |United-States |51.67     |77992.67     |[Self-emp-not-inc, Federal-gov]|[Some-college, Masters]|12.67               |[Married-civ-spouse] |[Exec-managerial]                                 |[Husband]          |[Asian-Pac-Islander]|[Male]    |0.0                |0.0                |55.0                 |[Philippines, United-States]|
|xrrrlrlrrlllr   |55 |88876 |Federal-gov     |Masters     |14           |Married-civ-spouse|Exec-managerial  |Husband     |Asian-Pac-Islander|Male|0           |0           |60            |United-States |51.67     |77992.67     |[Self-emp-not-inc, Federal-gov]|[Some-college, Masters]|12.67               |[Married-civ-spouse] |[Exec-managerial]                                 |[Husband]          |[Asian-Pac-Islander]|[Male]    |0.0                |0.0                |55.0                 |[Philippines, United-States]|
|xrrrlrlrrlllr   |52 |63004 |Self-emp-not-inc|Masters     |14           |Married-civ-spouse|Exec-managerial  |Husband     |Asian-Pac-Islander|Male|0           |0           |40            |Philippines   |51.67     |77992.67     |[Self-emp-not-inc, Federal-gov]|[Some-college, Masters]|12.67               |[Married-civ-spouse] |[Exec-managerial]                                 |[Husband]          |[Asian-Pac-Islander]|[Male]    |0.0                |0.0                |55.0                 |[Philippines, United-States]|
+----------------+---+------+----------------+------------+-------------+------------------+-----------------+------------+------------------+----+------------+------------+--------------+--------------+----------+-------------+-------------------------------+-----------------------+--------------------+---------------------+--------------------------------------------------+-------------------+--------------------+----------+-------------------+-------------------+---------------------+----------------------------+
only showing top 10 rows