In [1]:
import pyspark

## Using Spark, inspect the Bank Marketing dataset

- Load df

In [103]:
# create dataframe
from pyspark.sql import SparkSession

# initialize Spark
spark = SparkSession.builder.appName("bank_df").getOrCreate()

In [23]:
df = spark.read.csv("bank.csv", inferSchema=True, header=True)

- Print 10 random rows from the dataset

In [24]:
df.sample(0.5).limit(10).show()

+---+----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|       job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
| 56|    admin.| married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|
| 41|technician| married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|
| 56|management| married| tertiary|     no|    830|    yes| yes|unknown|  6|  may|    1201|       1|   -1|       0| unknown|    yes|
| 60|   retired|divorced|secondary|     no|    545|    yes|  no|unknown|  6|  may|    1030|       1|   -1|       0| unknown|    yes|
| 28|  services|  single|secondary|     no|   5090|    yes|  no|unkno

- What are the relative proportions of no and yes for deposit feature? (inspect also other qualitative variables)

In [28]:
from pyspark.sql import functions as F

In [38]:
df.groupBy("deposit").count().withColumn("fraction", F.round(F.col("count") / F.lit(df.count()),2)).show()

+-------+-----+--------+
|deposit|count|fraction|
+-------+-----+--------+
|     no| 5873|    0.53|
|    yes| 5289|    0.47|
+-------+-----+--------+



In [39]:
df.groupBy("marital").count().withColumn("fraction", F.round(F.col("count") / F.lit(df.count()),2)).show()

+--------+-----+--------+
| marital|count|fraction|
+--------+-----+--------+
|divorced| 1293|    0.12|
| married| 6351|    0.57|
|  single| 3518|    0.32|
+--------+-----+--------+



In [40]:
df.groupBy("education").count().withColumn("fraction", F.round(F.col("count") / F.lit(df.count()),2)).show()

+---------+-----+--------+
|education|count|fraction|
+---------+-----+--------+
|  unknown|  497|    0.04|
| tertiary| 3689|    0.33|
|secondary| 5476|    0.49|
|  primary| 1500|    0.13|
+---------+-----+--------+



- Get descriptive statistics for numerical variables

In [45]:
df.select(F.round(F.mean("age"),3), F.median("age"), F.skewness("age")).show()

+------------------+-----------+------------------+
|round(avg(age), 3)|median(age)|     skewness(age)|
+------------------+-----------+------------------+
|            41.232|       39.0|0.8626636888266418|
+------------------+-----------+------------------+



In [46]:
df.select(F.round(F.stddev("balance"),3), F.sum("balance"), F.count("balance")).show()

+-------------------------+------------+--------------+
|round(stddev(balance), 3)|sum(balance)|count(balance)|
+-------------------------+------------+--------------+
|                 3225.413|    17061547|         11162|
+-------------------------+------------+--------------+



In [47]:
df.approxQuantile("balance", [0.25, 0.5, 0.75], 0.01)

[113.0, 525.0, 1631.0]

- Use relevant visualizations to inspect variables and relations between them

- Who is the client with the biggest balance?

In [50]:
df.orderBy("balance", ascending=False).limit(1).show()

+---+-------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+
|age|    job|marital|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+-------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+
| 84|retired|married|secondary|     no|  81204|     no|  no|telephone| 28|  dec|     679|       1|  313|       2|   other|    yes|
+---+-------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+



- What is the proportion of clients who's balance is more than twice the average?

In [55]:
avg_balance = df.select(F.avg("balance")).collect()[0][0]

In [57]:
df.filter(df["balance"] > 2*avg_balance).count() / df.count()

0.13931195126321447

- Do people with higher education have a better chance to deposit?

In [67]:
df.select("education").distinct().show()

+---------+
|education|
+---------+
|  unknown|
| tertiary|
|secondary|
|  primary|
+---------+



In [70]:
df_with_ed = df.withColumn("higher_education", df["education"] == "tertiary")

In [96]:
# Step 1: Compute total count per higher_education group
total_counts = df_with_ed.groupBy("higher_education").agg(F.count("*").alias("total_count"))

# Step 2: Compute count per higher_education and deposit group
df_with_ed_counts = df_with_ed.groupBy(["higher_education", "deposit"]).agg(F.count("*").alias("deposit_count"))

# Step 3: Join the counts and calculate the percentage
df_with_percentages = df_with_ed_counts.join(total_counts, "higher_education") \
    .withColumn("percentage", F.round((F.col("deposit_count") / F.col("total_count")) * 100, 2))

# Show the result
df_with_percentages.show()

+----------------+-------+-------------+-----------+----------+
|higher_education|deposit|deposit_count|total_count|percentage|
+----------------+-------+-------------+-----------+----------+
|            true|     no|         1693|       3689|     45.89|
|            true|    yes|         1996|       3689|     54.11|
|           false|    yes|         3293|       7473|     44.07|
|           false|     no|         4180|       7473|     55.93|
+----------------+-------+-------------+-----------+----------+



- What are the best predictors for deposit?

In [98]:
# calcluate correlations

In [105]:
df_transformed = df.withColumn(
    "deposit_numeric", F.when(F.col("deposit") == "yes", 1).otherwise(0)
)

In [118]:
for col, col_type in df_transformed.dtypes:
    if col_type != "string" and col not in ["deposit", "deposit_numeric"]:
        col_corr = df_transformed.corr(col, "deposit_numeric")
        print(f"Correlation between {col} and deposit: {col_corr}")

Correlation between age and deposit: 0.034900927890285476
Correlation between balance and deposit: 0.08112858919103522
Correlation between day and deposit: -0.0563258392226927
Correlation between duration and deposit: 0.4519193608425855
Correlation between campaign and deposit: -0.1280808461248679
Correlation between pdays and deposit: 0.15159251635244733
Correlation between previous and deposit: 0.13986711820078132


In [120]:
# best predictor is duration

In [99]:
# apply decision tree and see feature importance

In [127]:
df_numeric = df_transformed.drop("deposit")

In [129]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Step 1: Identify categorical columns
categorical_cols = [col for col, dtype in df_numeric.dtypes if dtype == "string" and col != "deposit_numeric"]

# Step 2: Convert categorical columns to numeric using StringIndexer
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index") for col in categorical_cols]

# Step 3: Use VectorAssembler to combine all feature columns into a single vector column
feature_cols = [col + "_index" for col in categorical_cols]  # Include transformed categorical columns
feature_cols += [col for col, dtype in df_numeric.dtypes if dtype != "string" and col != "deposit_numeric"]  # Include non-categorical columns

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Step 4: Initialize DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="deposit_numeric", featuresCol="features")

# Step 5: Create the Pipeline
pipeline = Pipeline(stages=indexers + [assembler, dt])

# Step 6: Train the model
model = pipeline.fit(df_numeric)

# Step 7: Get the trained model
dt_model = model.stages[-1]

# Step 8: Get feature importance
feature_importances = dt_model.featureImportances

# Step 9: Print feature importance
for idx, col in enumerate(feature_cols):
    print(f"Feature: {col}, Importance: {feature_importances[idx]}")


                                                                                

Feature: job_index, Importance: 0.0
Feature: marital_index, Importance: 0.0
Feature: education_index, Importance: 0.0
Feature: default_index, Importance: 0.0
Feature: housing_index, Importance: 0.054546207933520365
Feature: loan_index, Importance: 0.0
Feature: contact_index, Importance: 0.1241385295303864
Feature: month_index, Importance: 0.08424206520946219
Feature: poutcome_index, Importance: 0.06541763300402254
Feature: age, Importance: 0.0
Feature: balance, Importance: 0.0
Feature: day, Importance: 0.011037971547373246
Feature: duration, Importance: 0.6080498499210819
Feature: campaign, Importance: 0.0
Feature: pdays, Importance: 0.05256774285415343
Feature: previous, Importance: 0.0


## OULAD Data 7 queries

## Use PySpark syntax to find pairs of coprimes up to some constant n. 

In [131]:
# create dataframe
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder.appName("primes").getOrCreate()

25/02/01 00:03:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [145]:
data = [(i,) for i in range(2,100)]

In [146]:
columns = ["n",]

In [147]:
df = spark.createDataFrame(data, columns)

In [148]:
df_joined = df.alias("df1").join(
    df.alias("df2"),
    (F.col("df1.n") > F.col("df2.n")) & (F.col("df1.n") % F.col("df2.n") == 0),
    how="inner"
)