# Set Up PySpark

In [None]:
!pip install pyspark
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
# Let findspark locate the Spark installation before any pyspark import is attempted.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [None]:
# Local Spark session that uses every available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Encoding_Outliers_FeatureSelection")
    .getOrCreate()
)

# Load Data

In [None]:
# Colab only: mount Google Drive at /content/drive so the CSV files below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the merged application table; the first line is the header.
# No schema is inferred, so every column is loaded as a string.
raw_csv_path = '/content/drive/MyDrive/Data/merged_application.csv'
df = spark.read.csv(raw_csv_path, header=True)

In [None]:
# Shape check: number of rows, then number of columns.
n_rows, n_cols = df.count(), len(df.columns)
print(n_rows, n_cols)

232196 104


# Select the important features (listed in the curr_features.sh file) after the checking and EDA phase

In [None]:
# printSchema() writes the schema itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df.printSchema()

root
 |-- SK_ID_CURR: string (nullable = true)
 |-- TARGET: string (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: string (nullable = true)
 |-- AMT_INCOME_TOTAL: string (nullable = true)
 |-- AMT_CREDIT: string (nullable = true)
 |-- AMT_ANNUITY: string (nullable = true)
 |-- AMT_GOODS_PRICE: string (nullable = true)
 |-- NAME_TYPE_SUITE: string (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- REGION_POPULATION_RELATIVE: string (nullable = true)
 |-- DAYS_BIRTH: string (nullable = true)
 |-- DAYS_EMPLOYED: string (nullable = true)
 |-- DAYS_REGISTRATION: string (nullable = true)
 |-- DAYS_ID_PUBLISH: string (nullable = true)
 |-- FLAG_MOBIL: stri

In [None]:
# Columns kept for modelling, in the exact order they are selected from the raw table.
_leading_cols = [
    'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
    'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
    'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
    'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
    'ORGANIZATION_TYPE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE',
]

# FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 (range end is exclusive).
_document_flag_cols = [f'FLAG_DOCUMENT_{doc_no}' for doc_no in range(2, 22)]

_bureau_request_cols = [
    f'AMT_REQ_CREDIT_BUREAU_{period}'
    for period in ('HOUR', 'DAY', 'WEEK', 'MON', 'QRT', 'YEAR')
]

_trailing_cols = [
    'PREV_COUNT', 'PREV_AVG_AMT_ANNUITY', 'PREV_AVG_AMT_APPLICATION', 'PREV_AVG_AMT_CREDIT',
    'PREV_AVG_GOODS_APP_RATIO', 'PREV_AVG_DAYS_DECISION', 'PREV_AVG_CNT_PAYMENT',
    'PREV_AVG_HOUR_APPR_PROCESS_START', 'REFUSED_STATUS_COUNT', 'APPROVED_STATUS_COUNT',
    'Consumer loans', 'Revolving loans', 'Cash loans', 'XAP', 'HC', 'LIMIT',
    'Refreshed', 'Repeater', 'POS', 'Cash_x', 'Cards', 'Credit and cash offices',
    'Country-wide', 'Stone', 'Cash', 'POS household with interest',
    'POS mobile with interest', 'ZERO_NFLAG_LAST_APPL_IN_DAY',
    'ZERO_FLAG_LAST_APPL_PER_CONTRACT', 'XAP_NAME_CASH_LOAN_PURPOSE',
    'Other_NAME_CASH_LOAN_PURPOSE', 'PREV_AVG_NAME_YIELD_GROUP',
]

features = _leading_cols + _document_flag_cols + _bureau_request_cols + _trailing_cols
len(features)

100

In [None]:
# Keep only the shortlisted columns, in list order.
df = df.select(*features)

In [None]:
# Shape check after the column selection: rows, then columns.
n_rows, n_cols = df.count(), len(df.columns)
print(n_rows, n_cols)

232196 100


# Encoding using the curr_application_features_encoding_methods.txt file

In [None]:
from pyspark.sql import functions as F

In [None]:
def _as_flag(is_one, is_zero):
    """Return 1 where `is_one` holds, 0 where `is_zero` holds, and null for anything else."""
    return F.when(is_one, 1).when(is_zero, 0).otherwise(None)


contract_type = F.col('NAME_CONTRACT_TYPE')
gender = F.col('CODE_GENDER')
owns_car = F.col('FLAG_OWN_CAR')
owns_realty = F.col('FLAG_OWN_REALTY')

# Each two-valued text column is overwritten in place with its 0/1 encoding.
# Note that CODE_GENDER maps both 'M' and 'XNA' to 1.
df = (
    df
    .withColumn('NAME_CONTRACT_TYPE',
                _as_flag(contract_type == 'Cash loans', contract_type == 'Revolving loans'))
    .withColumn('CODE_GENDER', _as_flag(gender.isin('M', 'XNA'), gender == 'F'))
    .withColumn('FLAG_OWN_CAR', _as_flag(owns_car == 'Y', owns_car == 'N'))
    .withColumn('FLAG_OWN_REALTY', _as_flag(owns_realty == 'Y', owns_realty == 'N'))
)

In [None]:
from pyspark.sql.functions import count,col

# Row count of the whole frame; it is the denominator for the share tables in this notebook.
total_count = df.count()

# Number of rows per NAME_INCOME_TYPE category (result column is named "count").
income_type_counts = df.groupBy("NAME_INCOME_TYPE").count()

# Express each category count as a percentage of all rows.
category_share = (F.col("count") / total_count) * 100
income_type_percentages = income_type_counts.withColumn("percentage", category_share)

income_type_percentages.show()


+--------------------+------+-------------------+
|    NAME_INCOME_TYPE| count|         percentage|
+--------------------+------+-------------------+
|             Student|    10|0.00430670640321108|
|Commercial associate| 51822|  22.31821392272046|
|       State servant| 17486|  7.530706816654894|
|             Working|121016| 52.118038209099204|
|           Pensioner| 41861|   18.0283036744819|
|          Unemployed|     1|4.30670640321108E-4|
+--------------------+------+-------------------+



In [None]:
# One indicator column per listed income type; every other value (including null) maps to 0.
# The new column name is the prefix plus the category with spaces replaced by underscores.
for income_category in ('Commercial associate', 'Pensioner', 'Working'):
    indicator_name = 'NAME_INCOME_TYPE_' + income_category.replace(' ', '_')
    df = df.withColumn(
        indicator_name,
        F.when(F.col('NAME_INCOME_TYPE') == income_category, 1).otherwise(0),
    )

In [None]:
# Remove the raw text column from the frame.
df = df.drop("NAME_INCOME_TYPE")

In [None]:
# Ordinal code per education level; labels not listed here become null.
education_codes = [
    ('Secondary / secondary special', 1),
    ('Higher education', 3),
    ('Incomplete higher', 2),
    ('Lower secondary', 0),
    ('Academic degree', 4),
]

education_col = F.col('NAME_EDUCATION_TYPE')
education_encoded = None
for level_label, level_code in education_codes:
    is_level = education_col == level_label
    if education_encoded is None:
        education_encoded = F.when(is_level, level_code)
    else:
        education_encoded = education_encoded.when(is_level, level_code)

df = df.withColumn('NAME_EDUCATION_TYPE_encoded', education_encoded.otherwise(None))

In [None]:
# Remove the raw text column from the frame.
df = df.drop("NAME_EDUCATION_TYPE")

In [None]:
# Number of rows per NAME_FAMILY_STATUS category (result column is named "count").
# The variable names are reused from the income-type table.
income_type_counts = df.groupBy("NAME_FAMILY_STATUS").count()

# Express each category count as a percentage of all rows.
category_share = (F.col("count") / total_count) * 100
income_type_percentages = income_type_counts.withColumn("percentage", category_share)

income_type_percentages.show()

+--------------------+------+------------------+
|  NAME_FAMILY_STATUS| count|        percentage|
+--------------------+------+------------------+
|           Separated| 15231| 6.559544522730796|
|             Married|151503| 65.24789402056884|
|Single / not married| 31730|13.665179417388757|
|               Widow| 12263|5.2813140622577475|
|      Civil marriage| 21469| 9.246067977053867|
+--------------------+------+------------------+



In [None]:
df_target_encoded  = df.groupBy('NAME_FAMILY_STATUS') \
    .agg(F.mean('TARGET').alias('NAME_FAMILY_STATUS_TARGET_EN'))

# Join this result back to the original DataFrame to apply target encoding
df = df.join(df_target_encoded, on='NAME_FAMILY_STATUS', how='left')

# Drop the unnecessary columns
df = df.drop('NAME_FAMILY_STATUS')

In [None]:
# Number of rows per NAME_HOUSING_TYPE category (result column is named "count").
# The variable names are reused from the income-type table.
income_type_counts = df.groupBy("NAME_HOUSING_TYPE").count()

# Express each category count as a percentage of all rows.
category_share = (F.col("count") / total_count) * 100
income_type_percentages = income_type_counts.withColumn("percentage", category_share)

income_type_percentages.show()

+-------------------+------+-------------------+
|  NAME_HOUSING_TYPE| count|         percentage|
+-------------------+------+-------------------+
|  House / apartment|206879|  89.09671139899051|
|Municipal apartment|  8422|  3.627108132784372|
|    Co-op apartment|   862|0.37123809195679514|
|   Rented apartment|  3424| 1.4746162724594738|
|   Office apartment|  1971| 0.8488518320729039|
|       With parents| 10638|  4.581474271735947|
+-------------------+------+-------------------+



In [None]:
# 1 for "House / apartment" or "With parents", 0 for every other value (null included);
# the text column is dropped once the flag exists.
in_house_or_with_parents = F.col("NAME_HOUSING_TYPE").isin("House / apartment", "With parents")

df = (
    df.withColumn("NAME_HOUSING_TYPE_MAPPED", F.when(in_house_or_with_parents, 1).otherwise(0))
      .drop("NAME_HOUSING_TYPE")
)

In [None]:
# Convert DAYS_BIRTH to a non-negative number of years (|days| / 365, rounded to 2 decimals)
# and replace the original column with the converted one.
birth_years = F.abs(F.col("DAYS_BIRTH")) / 365

df = df.withColumn("DAYS_BIRTH_YEARS", F.round(birth_years, 2)).drop("DAYS_BIRTH")

In [None]:
from pyspark.sql.functions import when

In [None]:
# Replace the weekday name with its position in the week (MONDAY=0 ... SUNDAY=6).
# Values that match none of the names become null, because no default branch is given.
weekday_names = ("MONDAY", "TUESDAY", "WEDNESDAY", "THURSDAY", "FRIDAY", "SATURDAY", "SUNDAY")

weekday_col = F.col("WEEKDAY_APPR_PROCESS_START")
weekday_index = None
for day_position, day_name in enumerate(weekday_names):
    is_day = weekday_col == day_name
    if weekday_index is None:
        weekday_index = F.when(is_day, day_position)
    else:
        weekday_index = weekday_index.when(is_day, day_position)

df = df.withColumn("WEEKDAY_APPR_PROCESS_START", weekday_index)

In [None]:
# Number of rows per ORGANIZATION_TYPE category (result column is named "count").
# The variable names are reused from the income-type table.
income_type_counts = df.groupBy("ORGANIZATION_TYPE").count()

# Express each category count as a percentage of all rows.
category_share = (F.col("count") / total_count) * 100
income_type_percentages = income_type_counts.withColumn("percentage", category_share)

income_type_percentages.show()

+-------------------+-----+--------------------+
|  ORGANIZATION_TYPE|count|          percentage|
+-------------------+-----+--------------------+
|           Services| 1160| 0.49957794277248535|
|  Industry: type 13|   41|0.017657496253165428|
|      Trade: type 5|   36|0.015504143051559888|
|            Telecom|  444| 0.19121776430257195|
|               Bank| 1868|  0.8044927561198298|
|   Industry: type 5|  479| 0.20629123671381072|
|        Electricity|  739| 0.31826560319729885|
|            Housing| 2265|  0.9754690003273097|
|  Transport: type 2| 1844|  0.7941566607521232|
|          Insurance|  402| 0.17312959740908543|
|           Security| 2475|  1.0659098347947422|
|             Postal| 1657|   0.713621251012076|
|             School| 7198|   3.099967269031336|
|         Government| 8275|   3.563799548657169|
|Security Ministries| 1635|  0.7041464969250116|
|           Military| 2102|   0.905269685954969|
|            Realtor|  287| 0.12360247377215801|
|  Industry: type 10

In [None]:
org_target_mean = df.groupBy("ORGANIZATION_TYPE").agg(
    F.mean("target").alias("ORG_TYPE_TARGET_MEAN")
)

df = df.join(
    org_target_mean,
    on="ORGANIZATION_TYPE",
    how="left"
)

df = df.drop("ORGANIZATION_TYPE")

In [None]:
# Every column whose name starts with FLAG_DOCUMENT_.
doc_cols = [name for name in df.columns if name.startswith('FLAG_DOCUMENT_')]

# Add the flag columns together, starting from 0, into one Documents_count column.
documents_total = 0
for flag_name in doc_cols:
    documents_total = documents_total + F.col(flag_name)

df = df.withColumn("Documents_count", documents_total)

# The individual flags are dropped once they have been summed.
df = df.drop(*doc_cols)

In [None]:
# Preview the first five rows of the encoded frame.
df.show(n=5)

+------+------------------+-----------+------------+---------------+------------+----------------+----------+-----------+---------------+--------------------------+-------------+-----------------+---------------+----------+--------------+---------------+----------------+----------+----------+---------------+--------------------+---------------------------+--------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+------------------------+------------------------+------------------------+------------------------+----------------------+--------------------------+-------------------------+--------------------------+-------------------------+-------------------------+--------------------------+----------+--------------------+------------------------+-------------------+------------------------+----------------------+--------------------+-----------

## Save the new data

In [None]:
import os
import shutil
from pathlib import Path
# Base locations on the mounted Drive; Spark writes its part files under `output_dir`.
current_dir = '/content/drive/MyDrive'
data_relative_path = 'Data'
output_dir = os.path.join(os.path.join(current_dir, data_relative_path), "tmp_output")

def save_csv(df, final_csv_path, tmp_dir=None):
    """Write `df` as one CSV file at `final_csv_path`.

    The frame is coalesced to a single partition and written (with header) into a
    temporary folder; the resulting ``part-*.csv`` file is then moved to
    `final_csv_path` and the temporary folder is deleted.

    Args:
        df: DataFrame to write.
        final_csv_path: Destination file path. Missing parent folders are created.
        tmp_dir: Temporary folder for the writer output. When None (the default) the
            module-level `output_dir` is used, looked up at call time.

    Returns:
        The message ``"File saved to: <final_csv_path>"``.

    Raises:
        FileNotFoundError: If the writer produced no ``part-*.csv`` file.
    """
    work_dir = output_dir if tmp_dir is None else tmp_dir

    # Write df into the temporary folder, replacing any leftovers from a previous run.
    df.coalesce(1).write.option("header", "true").mode("overwrite").csv(work_dir)

    try:
        # A default of None replaces the bare StopIteration with a descriptive error.
        part_file = next(Path(work_dir).glob("part-*.csv"), None)
        if part_file is None:
            raise FileNotFoundError(f"No part-*.csv file was produced in {work_dir}")

        # Move and rename, making sure the destination folder exists first.
        Path(final_csv_path).parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(part_file), final_csv_path)
    finally:
        # Remove the temporary folder even when the move failed.
        shutil.rmtree(work_dir, ignore_errors=True)

    return f"File saved to: {final_csv_path}"

# Persist the encoded frame as a single CSV file in the data folder.
final_csv_path = os.path.join(
    os.path.join(current_dir, data_relative_path),
    "encoded_merged_application.csv",
)
save_csv(df, final_csv_path)

'File saved to: /content/drive/MyDrive/Data/encoded_merged_application.csv'

In [None]:
from pyspark.sql.types import DoubleType

In [None]:
from pyspark.sql import functions as F

In [None]:
# Cast every column to double in a single projection. Calling withColumn once per
# column adds a projection to the query plan for each call; one select avoids that.
# Column names and order are unchanged.
df = df.select([F.col(col_name).cast(DoubleType()).alias(col_name) for col_name in df.columns])

In [None]:
# printSchema() writes the schema itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df.printSchema()

root
 |-- TARGET: double (nullable = true)
 |-- NAME_CONTRACT_TYPE: double (nullable = true)
 |-- CODE_GENDER: double (nullable = true)
 |-- FLAG_OWN_CAR: double (nullable = true)
 |-- FLAG_OWN_REALTY: double (nullable = true)
 |-- CNT_CHILDREN: double (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- AMT_GOODS_PRICE: double (nullable = true)
 |-- REGION_POPULATION_RELATIVE: double (nullable = true)
 |-- DAYS_EMPLOYED: double (nullable = true)
 |-- DAYS_REGISTRATION: double (nullable = true)
 |-- DAYS_ID_PUBLISH: double (nullable = true)
 |-- FLAG_MOBIL: double (nullable = true)
 |-- FLAG_EMP_PHONE: double (nullable = true)
 |-- FLAG_WORK_PHONE: double (nullable = true)
 |-- FLAG_CONT_MOBILE: double (nullable = true)
 |-- FLAG_PHONE: double (nullable = true)
 |-- FLAG_EMAIL: double (nullable = true)
 |-- CNT_FAM_MEMBERS: double (nullable = true)
 |-- REGION_RATING_CLIENT: double (n

In [None]:
# Names of all double-typed columns (df.dtypes reports DoubleType as "double").
numeric_cols = [name for name, type_name in df.dtypes if type_name == "double"]
len(numeric_cols)

83

# Outliers

In [None]:
from pyspark.sql import Window

In [None]:
# Mean and standard deviation of every numeric column, gathered in one aggregation.
stats = df.select(
    [F.mean(F.col(c)).alias(f"{c}_mean") for c in numeric_cols]
    + [F.stddev(F.col(c)).alias(f"{c}_stddev") for c in numeric_cols]
).collect()[0]

zscore_outliers = {}
outlier_count_exprs = []

for col_name in numeric_cols:
    mean = stats[f"{col_name}_mean"]
    stddev = stats[f"{col_name}_stddev"]

    # A missing or zero spread makes the z-score undefined, so the column is skipped.
    if stddev is None or stddev == 0:
        continue

    zscore = (F.col(col_name) - mean) / stddev

    # count() ignores nulls, so only rows with |z| > 3 are tallied. The expression is
    # evaluated later, together with all others, instead of one Spark job per column.
    outlier_count_exprs.append(
        F.count(F.when(F.abs(zscore) > 3, 1)).alias(f"{col_name}_outliers")
    )

    zscore_outliers[col_name] = {
        'outliers_count': None,  # filled in after the single aggregation below
        'mean': mean,
        'stddev': stddev
    }

# One pass over the data for all outlier counts. `df` is left untouched: no temporary
# "<column>_zscore" columns are added to it.
if outlier_count_exprs:
    outlier_counts = df.select(outlier_count_exprs).collect()[0]
    for col_name, info in zscore_outliers.items():
        info['outliers_count'] = outlier_counts[f"{col_name}_outliers"]

for col_name, info in zscore_outliers.items():
    print(f"Column: {col_name}")
    print(f"  ➔ Outliers count: {info['outliers_count']}")
    print(f"  ➔ Mean: {info['mean']}")
    print(f"  ➔ Stddev: {info['stddev']}")
    print("-" * 30)


Column: TARGET
  ➔ Outliers count: 18296
  ➔ Mean: 0.07879550035314993
  ➔ Stddev: 0.26941989920660797
------------------------------
Column: NAME_CONTRACT_TYPE
  ➔ Outliers count: 20394
  ➔ Mean: 0.9121690296129132
  ➔ Stddev: 0.2830495293557322
------------------------------
Column: CODE_GENDER
  ➔ Outliers count: 0
  ➔ Mean: 0.331693913762511
  ➔ Stddev: 0.47082270125846976
------------------------------
Column: FLAG_OWN_CAR
  ➔ Outliers count: 0
  ➔ Mean: 0.34836948095574427
  ➔ Stddev: 0.47645478626642657
------------------------------
Column: FLAG_OWN_REALTY
  ➔ Outliers count: 0
  ➔ Mean: 0.7054342021395718
  ➔ Stddev: 0.4558483119579252
------------------------------
Column: CNT_CHILDREN
  ➔ Outliers count: 3349
  ➔ Mean: 0.4295724301882892
  ➔ Stddev: 0.7307387971727246
------------------------------
Column: AMT_INCOME_TOTAL
  ➔ Outliers count: 152
  ➔ Mean: 170607.98641615274
  ➔ Stddev: 260983.59123385994
------------------------------
Column: AMT_CREDIT
  ➔ Outliers count: 

In [None]:
# Remove the helper "<column>_zscore" columns from the frame.
drop_cols = [name + "_zscore" for name in numeric_cols]
df = df.drop(*drop_cols)

In [None]:
# Preview the first five rows.
df.show(n=5)

+------+------------------+-----------+------------+---------------+------------+----------------+----------+-----------+---------------+--------------------------+-------------+-----------------+---------------+----------+--------------+---------------+----------------+----------+----------+---------------+--------------------+---------------------------+--------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+------------------------+------------------------+------------------------+------------------------+----------------------+--------------------------+-------------------------+--------------------------+-------------------------+-------------------------+--------------------------+----------+--------------------+------------------------+-------------------+------------------------+----------------------+--------------------+-----------

## Remove Outliers

In [None]:
# Identify non-binary columns: numeric columns holding at least one non-null value
# other than 0.0 / 1.0. Columns that are entirely null count as binary.
#
# All columns are checked in one aggregation instead of one Spark job per column, and
# the loop variable is no longer called `col`, which used to rebind the imported
# pyspark.sql.functions.col for the rest of the session.
non_binary_cols = []

if numeric_cols:
    has_other_value = df.select([
        F.max(
            F.when(F.col(c).isNotNull() & ~F.col(c).isin(0.0, 1.0), 1).otherwise(0)
        ).alias(c)
        for c in numeric_cols
    ]).first().asDict()

    # A null maximum (empty frame) is falsy, so such a column is treated as binary.
    non_binary_cols = [c for c in numeric_cols if has_other_value[c]]

print(f"Total non-binary numeric features: {len(non_binary_cols)}")

Total non-binary numeric features: 57


In [None]:
# Exclude PREV_AVG_GOODS_APP_RATIO from the frame and from both column lists.
# NOTE(review): the reason for excluding this feature is not recorded here — confirm.
dropped_feature = "PREV_AVG_GOODS_APP_RATIO"
df = df.drop(dropped_feature)

# Filtering (instead of list.remove) keeps the cell re-runnable without a ValueError,
# and keeping non_binary_cols in sync guarantees the z-score step never references the
# dropped column.
numeric_cols = [name for name in numeric_cols if name != dropped_feature]
non_binary_cols = [name for name in non_binary_cols if name != dropped_feature]

In [None]:
# Number of non-binary numeric columns that the z-score filter will be built from.
len(non_binary_cols)

57

In [None]:
from functools import reduce

# Mean and standard deviation of every non-binary column, in one aggregation.
# The comprehension/loop variable is deliberately not called `col`, so the imported
# pyspark.sql.functions.col stays usable after this cell has run.
stats = df.select([
    F.mean(c).alias(f"{c}_mean") for c in non_binary_cols
] + [
    F.stddev(c).alias(f"{c}_stddev") for c in non_binary_cols
]).collect()[0].asDict()

# Build one "|z| <= 3" condition per column.
zscore_filters = []

for col_name in non_binary_cols:
    mean = stats[f"{col_name}_mean"]
    stddev = stats[f"{col_name}_stddev"]

    # A missing or zero spread makes the z-score undefined, so the column is skipped.
    if stddev is None or stddev == 0:
        continue

    zscore = (F.col(col_name) - mean) / stddev
    zscore_filters.append(F.abs(zscore) <= 3)

# Keep rows that satisfy every condition. A null value in any filtered column makes
# its condition null, so such rows are removed as well (not only true outliers).
# With no usable condition the frame is kept as is; reduce() on an empty list would raise.
if zscore_filters:
    df_no_outliers = df.filter(reduce(lambda left, right: left & right, zscore_filters))
else:
    df_no_outliers = df

# Compare row counts before and after filtering.
print(f"Original rows: {df.count()}")
print(f"Rows after removing outliers: {df_no_outliers.count()}")

Original rows: 232196
Rows after removing outliers: 157782


## Save the cleaned data

In [None]:
import os
import shutil
from pathlib import Path

# Destination folders. `data_relative_path` is now assigned before it is used to build
# `output_dir`; previously the cell only worked because an earlier cell had defined it.
current_dir = '/content/drive/MyDrive'
data_relative_path = 'Data'

output_dir = os.path.join(current_dir, data_relative_path, "tmp_output")

def save_csv(df, final_csv_path, tmp_dir=None):
    """Write `df` as one CSV file at `final_csv_path`.

    The frame is coalesced to a single partition and written (with header) into a
    temporary folder; the resulting ``part-*.csv`` file is then moved to
    `final_csv_path` and the temporary folder is deleted.

    Args:
        df: DataFrame to write.
        final_csv_path: Destination file path. Missing parent folders are created.
        tmp_dir: Temporary folder for the writer output. When None (the default) the
            module-level `output_dir` is used, looked up at call time.

    Returns:
        The message ``"File saved to: <final_csv_path>"``.

    Raises:
        FileNotFoundError: If the writer produced no ``part-*.csv`` file.
    """
    work_dir = output_dir if tmp_dir is None else tmp_dir

    # Write df into the temporary folder, replacing any leftovers from a previous run.
    df.coalesce(1).write.option("header", "true").mode("overwrite").csv(work_dir)

    try:
        # A default of None replaces the bare StopIteration with a descriptive error.
        part_file = next(Path(work_dir).glob("part-*.csv"), None)
        if part_file is None:
            raise FileNotFoundError(f"No part-*.csv file was produced in {work_dir}")

        # Move and rename, making sure the destination folder exists first.
        Path(final_csv_path).parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(part_file), final_csv_path)
    finally:
        # Remove the temporary folder even when the move failed.
        shutil.rmtree(work_dir, ignore_errors=True)

    return f"File saved to: {final_csv_path}"


df_no_outliers_path = os.path.join(current_dir, data_relative_path, "df_no_outliers.csv")
save_csv(df_no_outliers, df_no_outliers_path)

'File saved to: /content/drive/MyDrive/Data/df_no_outliers.csv'

## Split the data to prevent information leakage when testing the correlation

In [None]:
# From here on `df` refers to the outlier-filtered frame.
df=df_no_outliers

In [None]:
# Shape of the filtered frame as (rows, columns).
row_total = df.count()
column_total = len(df.columns)
print("Shape of df: ({}, {})".format(row_total, column_total))

Shape of df: (157782, 82)


In [None]:
# Stratified 80/20 split: each TARGET class is split separately with the same seed,
# the parts are recombined, and both sets are shuffled.
seed = 42
split_weights = [0.8, 0.2]

target_col = F.col("TARGET")
class_0, class_1 = df.filter(target_col == 0), df.filter(target_col == 1)

class_0_train, class_0_test = class_0.randomSplit(split_weights, seed=seed)
class_1_train, class_1_test = class_1.randomSplit(split_weights, seed=seed)

# Union the per-class parts and order by a seeded random column to mix the classes.
train_df = class_0_train.union(class_1_train).orderBy(F.rand(seed))
test_df = class_0_test.union(class_1_test).orderBy(F.rand(seed))

In [None]:
# Separate the label column (y) from the remaining feature columns (X) for both sets.
y_train, X_train = train_df.select("TARGET"), train_df.drop("TARGET")
y_test, X_test = test_df.select("TARGET"), test_df.drop("TARGET")

In [None]:
# Rows and columns of each split, in the order X_train, y_train, X_test, y_test.
for split_frame in (X_train, y_train, X_test, y_test):
    print(split_frame.count(), len(split_frame.columns))

126163 81
126163 1
31619 81
31619 1


# Correlation

## Highly Correlated Features

In [None]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import abs as abs_

In [None]:
import numpy as np

# Double-typed training features that enter the correlation matrix.
numeric_cols = [
    field.name
    for field in X_train.schema.fields
    if isinstance(field.dataType, (DoubleType))
]

# Pack the features into a single vector column, as required by Correlation.corr.
vector_col = "features_vec"
assembler = VectorAssembler(inputCols=numeric_cols, outputCol=vector_col)
df_vector = assembler.transform(X_train).select(vector_col)

# Pearson correlation matrix, converted to a NumPy array.
correlation_matrix = Correlation.corr(df_vector, vector_col, method="pearson").collect()[0][0]
corr_array = np.array(correlation_matrix.toArray())

# Walk the strict upper triangle (i < j, row by row) so each pair is visited once,
# keeping the pairs whose absolute correlation exceeds 0.8.
upper_rows, upper_cols = np.triu_indices(len(numeric_cols), k=1)
high_corr_pairs = [
    (numeric_cols[i], numeric_cols[j], corr_array[i, j])
    for i, j in zip(upper_rows, upper_cols)
    if abs(corr_array[i, j]) > 0.8
]

# Strongest relationships first.
high_corr_pairs_sorted = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)

for f1, f2, corr in high_corr_pairs_sorted:
    print(f"{f1} <--> {f2} | correlation = {corr:.4f}")

DAYS_EMPLOYED <--> NAME_INCOME_TYPE_Pensioner | correlation = 0.9998
FLAG_EMP_PHONE <--> NAME_INCOME_TYPE_Pensioner | correlation = -0.9998
DAYS_EMPLOYED <--> FLAG_EMP_PHONE | correlation = -0.9998
OBS_30_CNT_SOCIAL_CIRCLE <--> OBS_60_CNT_SOCIAL_CIRCLE | correlation = 0.9976
AMT_CREDIT <--> AMT_GOODS_PRICE | correlation = 0.9848
Consumer loans <--> POS | correlation = 0.9709
PREV_COUNT <--> Repeater | correlation = 0.9643
PREV_AVG_AMT_APPLICATION <--> PREV_AVG_AMT_CREDIT | correlation = 0.9608
REGION_RATING_CLIENT <--> REGION_RATING_CLIENT_W_CITY | correlation = 0.9495
NAME_CONTRACT_TYPE <--> Documents_count | correlation = 0.9436
PREV_COUNT <--> XAP | correlation = 0.9428
XAP <--> Repeater | correlation = 0.9181
Consumer loans <--> XAP_NAME_CASH_LOAN_PURPOSE | correlation = 0.8999
APPROVED_STATUS_COUNT <--> XAP | correlation = 0.8873
Revolving loans <--> Cards | correlation = 0.8871
Cash loans <--> Cash_x | correlation = 0.8860
POS <--> XAP_NAME_CASH_LOAN_PURPOSE | correlation = 0.873

In [None]:
# Cash loans
# REG_CITY_NOT_WORK_CITY
# XAP

# POS
# DEF_30_CNT_SOCIAL_CIRCLE
# REG_REGION_NOT_WORK_REGION
# CNT_CHILDREN
# Consumer loans
# Revolving loans
# REGION_RATING_CLIENT
# Repeater
# PREV_AVG_AMT_APPLICATION
# AMT_GOODS_PRICE
# OBS_30_CNT_SOCIAL_CIRCLE
# DAYS_EMPLOYED
# NAME_INCOME_TYPE_Pensioner

# NAME_CONTRACT_TYPE

In [None]:
# Features removed from both splits (see the highly correlated pairs listed above).
cols_to_drop = [
    "Cash loans",
    "REG_CITY_NOT_WORK_CITY",
    "XAP",
    "NAME_CONTRACT_TYPE",
    "POS",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "REG_REGION_NOT_WORK_REGION",
    "CNT_CHILDREN",
    "Consumer loans",
    "Revolving loans",
    "REGION_RATING_CLIENT",
    "Repeater",
    "PREV_AVG_AMT_APPLICATION",
    "AMT_GOODS_PRICE",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DAYS_EMPLOYED",
    "NAME_INCOME_TYPE_Pensioner",
]

# The same columns are dropped from train and test so their schemas stay aligned.
X_train, X_test = (split_frame.drop(*cols_to_drop) for split_frame in (X_train, X_test))

In [None]:
# Shape of the test features after the drop: rows, then columns.
n_test_rows, n_test_cols = X_test.count(), len(X_test.columns)
print(n_test_rows, n_test_cols)

31619 64


## Features with low correlation to the target

In [None]:
import numpy as np

In [None]:
from pyspark.sql.functions import monotonically_increasing_id

In [None]:
# Features whose absolute Pearson correlation with TARGET is below this value are reported.
# NOTE(review): the previous literal was `0.`, which no absolute value can be below, so
# the report was always empty. 0.01 is a placeholder — confirm the intended cut-off.
LOW_CORR_THRESHOLD = 0.01

numeric_cols = [field.name for field in X_train.schema.fields if isinstance(field.dataType, DoubleType)]

# Features and label side by side, taken straight from train_df (the frame both X_train
# and y_train were derived from). This replaces a join on two independently generated
# monotonically_increasing_id() columns, whose row alignment is not guaranteed and which
# left a "row_id" column in X_train / y_train that X_test / y_test did not have.
Xy_train = train_df.select(*X_train.columns, "TARGET")

# TARGET goes last so its correlations form the last column of the matrix.
numeric_cols.append("TARGET")

assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features_vec")
df_vector = assembler.transform(Xy_train).select("features_vec")

correlation_matrix = Correlation.corr(df_vector, "features_vec", "pearson").head()[0]
corr_array = np.array(correlation_matrix.toArray())

# Last column without its final entry: correlation of each feature with TARGET.
target_corr = corr_array[:-1, -1]

low_corr_features = []
for feature, corr in zip(numeric_cols[:-1], target_corr):
    if abs(corr) < LOW_CORR_THRESHOLD:
        low_corr_features.append((feature, corr))

# Weakest relationships first.
low_corr_features_sorted = sorted(low_corr_features, key=lambda pair: abs(pair[1]))
for feature, corr in low_corr_features_sorted:
    print(f"{feature} | correlation with TARGET = {corr:.4f}")

In [None]:
# Display the (feature, correlation) pairs collected in the previous cell.
low_corr_features_sorted

[]

In [None]:
import os
import shutil
from pathlib import Path

# Destination folders for the four split files.
current_dir = '/content/drive/MyDrive'
data_relative_path = 'Data/splited_data_NoOutliers_featureSelected'

output_dir = os.path.join(current_dir, data_relative_path, "tmp_output")

def save_csv(df, final_csv_path, tmp_dir=None):
    """Write `df` as one CSV file at `final_csv_path`.

    The frame is coalesced to a single partition and written (with header) into a
    temporary folder; the resulting ``part-*.csv`` file is then moved to
    `final_csv_path` and the temporary folder is deleted.

    Args:
        df: DataFrame to write.
        final_csv_path: Destination file path. Missing parent folders are created.
        tmp_dir: Temporary folder for the writer output. When None (the default) the
            module-level `output_dir` is used, looked up at call time.

    Returns:
        The message ``"File saved to: <final_csv_path>"``.

    Raises:
        FileNotFoundError: If the writer produced no ``part-*.csv`` file.
    """
    work_dir = output_dir if tmp_dir is None else tmp_dir

    # Write df into the temporary folder, replacing any leftovers from a previous run.
    df.coalesce(1).write.option("header", "true").mode("overwrite").csv(work_dir)

    try:
        # A default of None replaces the bare StopIteration with a descriptive error.
        part_file = next(Path(work_dir).glob("part-*.csv"), None)
        if part_file is None:
            raise FileNotFoundError(f"No part-*.csv file was produced in {work_dir}")

        # Move and rename, making sure the destination folder exists first.
        Path(final_csv_path).parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(part_file), final_csv_path)
    finally:
        # Remove the temporary folder even when the move failed.
        shutil.rmtree(work_dir, ignore_errors=True)

    return f"File saved to: {final_csv_path}"


# One destination per split; the path variables keep their original names.
final_csv_path_X_train = os.path.join(current_dir, data_relative_path, "X_train.csv")
final_csv_path_y_train = os.path.join(current_dir, data_relative_path, "y_train.csv")
final_csv_path_X_test = os.path.join(current_dir, data_relative_path, "X_test.csv")
final_csv_path_y_test = os.path.join(current_dir, data_relative_path, "y_test.csv")

# Save the splits in the original order and report each saved path.
for split_frame, split_path in (
    (X_train, final_csv_path_X_train),
    (y_train, final_csv_path_y_train),
    (X_test, final_csv_path_X_test),
    (y_test, final_csv_path_y_test),
):
    print(save_csv(split_frame, split_path))

'File saved to: /content/drive/MyDrive/Data/splited_data/X_train.csv'