In [82]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import numpy as np

In [46]:
# Build the SparkSession used by every later cell; getOrCreate() hands back
# the session that is already active in this kernel when there is one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)


In [142]:
# Load datasets/adult.data into a DataFrame with one column per entry of
# schemaString, then cast the numeric columns to integers.
sc = spark.sparkContext

# Column names, in the order the values appear on each comma-separated line.
schemaString = ["age", "workclass", "fnlwgt",
                "education", "education-num", "marital-status",
                "occupation", "relationship", "race", "sex",
                "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

# Columns cast from string to integer after loading; all others stay strings.
integerColumns = ["age", "fnlwgt", "education-num",
                  "capital-gain", "capital-loss", "hours-per-week"]

lines = sc.textFile("datasets/adult.data")
# Split every line on commas, keep only the lines that yield exactly one value
# per column (anything else, such as a blank line, is dropped), and strip the
# surrounding whitespace from each value.
parts = (
    lines.map(lambda line: line.split(","))
    .filter(lambda values: len(values) == len(schemaString))
    .map(lambda values: [value.strip() for value in values])
)

# Every field is first read as a nullable string; typing happens below.
fields = [StructField(field_name, StringType(), True) for field_name in schemaString]
schema = StructType(fields)

df = spark.createDataFrame(parts, schema)

# One cast per numeric column, driven by the list above rather than by
# repeating the same statement for each column.
for column_name in integerColumns:
    df = df.withColumn(column_name, col(column_name).cast(IntegerType()))

In [143]:
# Print the DataFrame dimensions as a (rows, columns) tuple, then preview
# the first ten rows.
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))
df.show(n=10)

(32561, 15)
+---+----------------+------+---------+-------------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|age|       workclass|fnlwgt|education|education-num|      marital-status|       occupation| relationship| race|   sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+----------------+------+---------+-------------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| 39|       State-gov| 77516|Bachelors|           13|       Never-married|     Adm-clerical|Not-in-family|White|  Male|        2174|           0|            40| United-States| <=50K|
| 50|Self-emp-not-inc| 83311|Bachelors|           13|  Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|           0|           0|            13| United-States| <=50K|
| 38|         Private|215646|  HS-grad|            9|            Divorced

(a)

In [144]:
# (a) Fraction of rows with sex == "Male" within each marital status.

# Total number of rows per marital status.
df_marital = df.groupBy("marital-status").count()

# Number of rows per (marital status, sex), restricted to men.
df_marital_sex_male = (
    df.groupBy("marital-status", "sex")
    .count()
    .where(col("sex") == "Male")
)

# Join the male counts to the totals and divide. The ratio column is named
# with alias() instead of renaming Spark's generated "(count / count)" label:
# withColumnRenamed does nothing when that label is absent, so a change in the
# generated name would go unnoticed. The join is an inner join, so a marital
# status with no male rows does not appear in the result.
df_ratio = df_marital_sex_male.join(df_marital, "marital-status").select(
    df_marital_sex_male["marital-status"],
    (df_marital_sex_male["count"] / df_marital["count"]).alias("ratio"),
)

In [145]:
# Display the male-to-total ratio computed for each marital status.
df_ratio.show()

+--------------------+-------------------+
|      marital-status|              ratio|
+--------------------+-------------------+
|           Separated|0.38439024390243903|
|       Never-married| 0.5537770289244595|
|Married-spouse-ab...| 0.5095693779904307|
|            Divorced|0.39860454647760524|
|             Widowed| 0.1691842900302115|
|   Married-AF-spouse|  0.391304347826087|
|  Married-civ-spouse| 0.8893563034188035|
+--------------------+-------------------+



(b)

In [159]:
# (b) Average hours-per-week, by native country, over the rows where sex is
# "Female" and income is ">50K", listed from the highest average down.
(
    df.where((col("sex") == "Female") & (col("income") == ">50K"))
    .groupBy("native-country")
    # alias() names the aggregate directly, so the result does not depend on
    # renaming Spark's generated "avg(hours-per-week)" column afterwards.
    .agg(avg("hours-per-week").alias("average hours_per_week"))
    .sort("average hours_per_week", ascending=False)
    .show()
)

+------------------+----------------------+
|    native-country|average hours_per_week|
+------------------+----------------------+
|            Greece|                  65.0|
|          Honduras|                  60.0|
|             South|    56.666666666666664|
|          Thailand|                  50.0|
|Dominican-Republic|                  47.0|
|            Canada|    46.111111111111114|
|            France|                  45.0|
|            Poland|                  45.0|
|          Portugal|                  43.0|
|             Italy|                  42.0|
|     United-States|    40.492537313432834|
|       Philippines|    40.083333333333336|
|           Hungary|                  40.0|
|       El-Salvador|                  40.0|
|              Iran|                  40.0|
|              Hong|                  40.0|
|           Ireland|                  40.0|
|        Yugoslavia|                  40.0|
|          Scotland|                  40.0|
|              Laos|            

(d)

In [160]:
# (d) Highest and lowest education-num within each income group.
# max and min here are the Spark column aggregates made available by the
# star import of pyspark.sql.functions, not Python's builtins.
(
    df.groupBy("income")
    .agg(
        max("education-num").alias("highest_education"),
        min("education-num").alias("lowest_education"),
    )
    .show()
)

+------+-----------------+----------------+
|income|highest_education|lowest_education|
+------+-----------------+----------------+
| <=50K|               16|               1|
|  >50K|               16|               2|
+------+-----------------+----------------+

