# Set Up PySpark

In [None]:
!pip install pyspark
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark
# Locate the installed Spark/PySpark and make it importable from this kernel
# before the pyspark imports in the next cell.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Load Data

In [None]:
# Local Spark session that uses every available core ("local[*]").
# getOrCreate() hands back the already-running session when this cell is re-run.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Process_curr_Data")
    .getOrCreate()
)

In [None]:
# Colab-only: mount Google Drive at /content/drive so the raw archive under
# MyDrive/Data can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!/content/drive/MyDrive/Data/application_data.csv.zip

/bin/bash: line 1: /content/drive/MyDrive/Data/application_data.csv.zip: Permission denied


In [None]:
!unzip -q "/content/drive/MyDrive/Data/application_data.csv.zip" -d "/content/current_application.csv"

In [None]:
# Read the extracted CSV: first line is the header, column types are inferred from the data.
df_curr = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/current_application.csv/application_data.csv")
)

# Clean Data: drop columns whose null percentage exceeds a threshold, then drop rows that contain nulls

In [None]:
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import collect_set
from pyspark.sql.functions import col, count, when

In [None]:
def clean_data(df, null_threshold=25):
    """Drop sparse columns, then drop every row that still contains a null.

    Args:
        df: Spark DataFrame to clean.
        null_threshold: Percentage (0-100). Columns whose share of null
            values is strictly greater than this are removed.

    Returns:
        A new DataFrame without the sparse columns and without any row that
        has a null in one of the remaining columns. ``df`` itself is not
        modified.

    Side effects:
        Prints the dropped column names and the (rows, columns) shape after
        each of the two steps.
    """
    total_rows = df.count()

    if total_rows:
        # Percentage of nulls per column, computed in a single pass over the data
        null_perc = df.select([
            ((count(when(col(c).isNull(), c)) / total_rows) * 100).alias(c)
            for c in df.columns
        ])
        null_perc_row = null_perc.first().asDict()

        # Decide which columns to drop - threshold
        cols_to_drop = [
            col_name
            for col_name, perc in null_perc_row.items()
            if perc > null_threshold
        ]
    else:
        # Empty frame: there is nothing to measure, and dividing by zero rows
        # would give null percentages (or an arithmetic error) instead of numbers.
        cols_to_drop = []
    print(f"Columns to drop:  {cols_to_drop}")

    # Drop columns with nulls over threshold
    data_cleaned = df.drop(*cols_to_drop)
    # Dropping columns cannot change the number of rows, so reuse the count
    # taken above instead of triggering another full scan.
    print(f"Shape after dropping cols: ({total_rows}, {len(data_cleaned.columns)})")

    # Drop rows with any nulls
    data_cleaned = data_cleaned.dropna()
    print(f"Shape after dropping rows: ({data_cleaned.count()}, {len(data_cleaned.columns)})")

    return data_cleaned

In [None]:
# Apply the cleaning step with the 25% null threshold.
# df_curr is rebound to the cleaned frame here; re-run the load cell to get the raw data back.
df_curr = clean_data(df_curr, null_threshold=25)

Columns to drop:  ['OWN_CAR_AGE', 'OCCUPATION_TYPE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALL

# Save Cleaned Data

In [None]:
import os
import shutil
from pathlib import Path

In [None]:
# Locations on the mounted Google Drive: the Drive root and the data folder inside it
current_dir, data_relative_path = '/content/drive/MyDrive', 'Data'

# Scratch folder that receives Spark's part files before the single CSV is moved out
output_dir = os.path.join(
    current_dir,
    data_relative_path,
    "tmp_output",
)

def save_csv(df, final_csv_path, tmp_dir=None):
    """Write ``df`` as one CSV file (with header) at ``final_csv_path``.

    The frame is coalesced to a single partition and written into a scratch
    folder; the single ``part-*.csv`` file produced there is then moved to
    ``final_csv_path`` and the scratch folder is removed.

    Args:
        df: DataFrame to save.
        final_csv_path: Destination path of the resulting CSV file. Its
            parent folder is created when missing.
        tmp_dir: Scratch folder for the intermediate output. Defaults to the
            notebook-level ``output_dir``. Any existing content is
            overwritten and the folder is deleted afterwards.

    Returns:
        A confirmation message containing ``final_csv_path``.

    Raises:
        FileNotFoundError: If the write produced no ``part-*.csv`` file.
    """
    scratch_dir = output_dir if tmp_dir is None else tmp_dir

    try:
        # write df into a temporary folder
        df.coalesce(1).write.option("header", "true").mode("overwrite").csv(scratch_dir)

        # find the generated part file
        part_file = next(Path(scratch_dir).glob("part-*.csv"), None)
        if part_file is None:
            raise FileNotFoundError(f"No part-*.csv file was produced in {scratch_dir}")

        # move and rename
        Path(final_csv_path).parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(part_file), final_csv_path)
    finally:
        # delete the temporary folder, also when the write or move failed;
        # the is_dir() guard keeps a missing folder from masking the real error
        if Path(scratch_dir).is_dir():
            shutil.rmtree(scratch_dir)

    return f"File saved to: {final_csv_path}"

# Save the cleaned applications next to the raw data on Drive
final_csv_path = os.path.join(
    current_dir,
    data_relative_path,
    "processed_current_application.csv",
)
save_csv(df=df_curr, final_csv_path=final_csv_path)

'File saved to: /content/drive/MyDrive/Data/processed_current_application.csv'