# Set Up PySpark

In [None]:
!pip install pyspark
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark
# Runs before the first `pyspark` import in the next cell; presumably needed so
# that import resolves in this runtime — confirm it is still required.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Load Data

In [None]:
# Create (or reuse) the notebook's Spark session, running locally with the
# "local[*]" master under the application name "Merge_Data".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Merge_Data")
    .getOrCreate()
)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths read and written by the
# cells below all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Current-application data; the first CSV line is treated as the header row.
df_curr = spark.read.csv(
    '/content/drive/MyDrive/Data/processed_current_application.csv',
    header=True,
)

In [None]:
# Previous-application data; the first CSV line is treated as the header row.
df_prev = spark.read.csv(
    '/content/drive/MyDrive/Data/processed_previous_application.csv',
    header=True,
)

In [None]:
# Report "<row count> <column count>" for each input frame, current first.
for frame in (df_curr, df_prev):
    print(frame.count(), len(frame.columns))

244280 72
338857 33


# Merge the Two Datasets

In [None]:
# Inner join: keep only rows whose SK_ID_CURR value appears in both frames.
df_merged = df_curr.join(df_prev, 'SK_ID_CURR', 'inner')

In [None]:
# Shape of the joined frame, printed as "<rows> <columns>".
merged_rows, merged_cols = df_merged.count(), len(df_merged.columns)
print(merged_rows, merged_cols)

232196 104


# Clean the Data

In [None]:
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import collect_set
from pyspark.sql.functions import col, count, when

In [None]:
def clean_data(df, null_threshold=25):
    """Drop sparse columns, then drop every row that still contains a null.

    Args:
        df: Spark DataFrame to clean.
        null_threshold: Maximum percentage (0-100) of null values a column may
            contain and still be kept; columns strictly above it are dropped.

    Returns:
        A new DataFrame without the dropped columns and without any row that
        has a null in one of the remaining columns.
    """
    total_rows = df.count()

    if total_rows == 0:
        # With no rows there is nothing to measure, and dividing by zero rows
        # would either fail or produce null percentages that cannot be
        # compared against the threshold.
        cols_to_drop = []
    else:
        # Single aggregation pass: when() without otherwise() yields null for
        # non-matching rows and count() skips nulls, so each output column
        # holds the number of null values in the column of the same name.
        null_counts = df.select([
            count(when(col(c).isNull(), c)).alias(c)
            for c in df.columns
        ]).collect()[0].asDict()

        # Decide which columns to drop - threshold
        cols_to_drop = [
            col_name
            for col_name, n_null in null_counts.items()
            if (n_null / total_rows) * 100 > null_threshold
        ]
    print(f"Columns to drop:  {cols_to_drop}")

    # Drop columns with nulls over threshold. Dropping columns cannot change
    # the number of rows, so the count computed above is reused instead of
    # launching another Spark job.
    data_cleaned = df.drop(*cols_to_drop)
    print(f"Shape after dropping cols: ({total_rows}, {len(data_cleaned.columns)})")

    # Drop rows with any nulls
    data_cleaned = data_cleaned.dropna()
    print(f"Shape after dropping rows: ({data_cleaned.count()}, {len(data_cleaned.columns)})")

    return data_cleaned

In [None]:
# Clean with the default 25% null threshold. Note that this rebinds df_merged,
# so re-running the cell operates on the already-cleaned frame.
df_merged = clean_data(df_merged)

Columns to drop:  []
Shape after dropping cols: (232196, 104)
Shape after dropping rows: (232196, 104)


# Save the Merged Data

In [None]:
import os
import shutil
from pathlib import Path
# Locations on the mounted drive: the base folder, the data sub-folder inside
# it, and the scratch folder that receives Spark's raw CSV output.
current_dir = '/content/drive/MyDrive'
data_relative_path = 'Data'
output_dir = str(Path(current_dir) / data_relative_path / "tmp_output")

def save_csv(df, final_csv_path, tmp_dir=None):
    """Write ``df`` as one CSV file located at ``final_csv_path``.

    The frame is coalesced to a single partition and written to a scratch
    directory; the single ``part-*.csv`` file found there is then moved to the
    requested path and the scratch directory is removed.

    Args:
        df: Spark DataFrame to save.
        final_csv_path: Destination path of the resulting CSV file.
        tmp_dir: Scratch directory for the raw Spark output. Defaults to the
            module-level ``output_dir``. Its contents are overwritten and the
            directory is deleted afterwards.

    Returns:
        The message ``"File saved to: <final_csv_path>"``.

    Raises:
        FileNotFoundError: If no ``part-*.csv`` file was produced.
    """
    scratch_dir = output_dir if tmp_dir is None else tmp_dir

    try:
        # write df into a temporary folder
        df.coalesce(1).write.option("header", "true").mode("overwrite").csv(scratch_dir)

        # find the generated part file; fail with a clear error instead of a
        # bare StopIteration when nothing was written
        part_file = next(Path(scratch_dir).glob("part-*.csv"), None)
        if part_file is None:
            raise FileNotFoundError(f"No part-*.csv file found in {scratch_dir}")

        # move and rename
        shutil.move(str(part_file), final_csv_path)
    finally:
        # delete the temporary folder even when the write or the move failed
        shutil.rmtree(scratch_dir, ignore_errors=True)

    return f"File saved to: {final_csv_path}"

# Save the merged frame as a single CSV inside the data folder; the call is
# the cell's last expression so its confirmation message is displayed.
final_csv_path = str(Path(current_dir) / data_relative_path / "merged_application.csv")
save_csv(df_merged, final_csv_path)

'File saved to: /content/drive/MyDrive/Data/merged_application.csv'

In [None]:
# printSchema() writes the schema tree itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df_merged.printSchema()

root
 |-- SK_ID_CURR: string (nullable = true)
 |-- TARGET: string (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: string (nullable = true)
 |-- AMT_INCOME_TOTAL: string (nullable = true)
 |-- AMT_CREDIT: string (nullable = true)
 |-- AMT_ANNUITY: string (nullable = true)
 |-- AMT_GOODS_PRICE: string (nullable = true)
 |-- NAME_TYPE_SUITE: string (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- REGION_POPULATION_RELATIVE: string (nullable = true)
 |-- DAYS_BIRTH: string (nullable = true)
 |-- DAYS_EMPLOYED: string (nullable = true)
 |-- DAYS_REGISTRATION: string (nullable = true)
 |-- DAYS_ID_PUBLISH: string (nullable = true)
 |-- FLAG_MOBIL: stri