In [1]:
import pyspark

## Using Spark, inspect the Bank Marketing dataset

- Load df

In [22]:
# create dataframe
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder.appName("bank_df").getOrCreate()

25/01/31 23:14:01 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [23]:
df = spark.read.csv("bank.csv", inferSchema=True, header=True)

- Print 10 random rows from the dataset

In [24]:
df.sample(0.5).limit(10).show()

+---+----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|       job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
| 56|    admin.| married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|
| 41|technician| married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|
| 56|management| married| tertiary|     no|    830|    yes| yes|unknown|  6|  may|    1201|       1|   -1|       0| unknown|    yes|
| 60|   retired|divorced|secondary|     no|    545|    yes|  no|unknown|  6|  may|    1030|       1|   -1|       0| unknown|    yes|
| 28|  services|  single|secondary|     no|   5090|    yes|  no|unkno

- What are the relative proportions of no and yes for deposit feature? (inspect also other qualitative variables)

In [28]:
from pyspark.sql import functions as F

In [38]:
df.groupBy("deposit").count().withColumn("fraction", F.round(F.col("count") / F.lit(df.count()),2)).show()

+-------+-----+--------+
|deposit|count|fraction|
+-------+-----+--------+
|     no| 5873|    0.53|
|    yes| 5289|    0.47|
+-------+-----+--------+



In [39]:
df.groupBy("marital").count().withColumn("fraction", F.round(F.col("count") / F.lit(df.count()),2)).show()

+--------+-----+--------+
| marital|count|fraction|
+--------+-----+--------+
|divorced| 1293|    0.12|
| married| 6351|    0.57|
|  single| 3518|    0.32|
+--------+-----+--------+



In [40]:
df.groupBy("education").count().withColumn("fraction", F.round(F.col("count") / F.lit(df.count()),2)).show()

+---------+-----+--------+
|education|count|fraction|
+---------+-----+--------+
|  unknown|  497|    0.04|
| tertiary| 3689|    0.33|
|secondary| 5476|    0.49|
|  primary| 1500|    0.13|
+---------+-----+--------+



- Get descriptive statistics for numerical variables

In [45]:
df.select(F.round(F.mean("age"),3), F.median("age"), F.skewness("age")).show()

+------------------+-----------+------------------+
|round(avg(age), 3)|median(age)|     skewness(age)|
+------------------+-----------+------------------+
|            41.232|       39.0|0.8626636888266418|
+------------------+-----------+------------------+



In [46]:
df.select(F.round(F.stddev("balance"),3), F.sum("balance"), F.count("balance")).show()

+-------------------------+------------+--------------+
|round(stddev(balance), 3)|sum(balance)|count(balance)|
+-------------------------+------------+--------------+
|                 3225.413|    17061547|         11162|
+-------------------------+------------+--------------+



In [47]:
df.approxQuantile("balance", [0.25, 0.5, 0.75], 0.01)

[113.0, 525.0, 1631.0]

- Use relevant visualizations to inspect variables and relations between them

- Who is the client with the biggest balance?

In [50]:
df.orderBy("balance", ascending=False).limit(1).show()

+---+-------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+
|age|    job|marital|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+-------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+
| 84|retired|married|secondary|     no|  81204|     no|  no|telephone| 28|  dec|     679|       1|  313|       2|   other|    yes|
+---+-------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+



- What is the proportion of clients who's balance is more than twice the average?

In [55]:
avg_balance = df.select(F.avg("balance")).collect()[0][0]

In [57]:
df.filter(df["balance"] > 2*avg_balance).count() / df.count()

0.13931195126321447

- Do people with higher education have a better chance to deposit?

## OULAD Data 7 queries

## Use PySpark syntax to find pairs of coprimes up to some constant n. 

In [5]:
# create dataframe
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder.appName("primes").getOrCreate()

25/01/31 23:11:03 WARN Utils: Your hostname, alehak-ThinkPad-S5-S540 resolves to a loopback address: 127.0.1.1; using 10.0.0.46 instead (on interface wlp4s0)
25/01/31 23:11:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/31 23:11:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [16]:
data = [(i,) for i in range(1,100)]

In [17]:
columns = ["n",]

In [18]:
df = spark.createDataFrame(data, columns)

In [19]:
df.show(3)

+---+
|  n|
+---+
|  1|
|  2|
|  3|
+---+
only showing top 3 rows

