In [22]:
from src import utils
from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder 
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [23]:
# Load the interim splits from data/interim, one (features, target) pair per split.
X_train, y_train = (
    utils.deserialized_data("data/interim/X_train.pkl"),
    utils.deserialized_data("data/interim/y_train.pkl"),
)
X_valid, y_valid = (
    utils.deserialized_data("data/interim/X_valid.pkl"),
    utils.deserialized_data("data/interim/y_valid.pkl"),
)
X_test, y_test = (
    utils.deserialized_data("data/interim/X_test.pkl"),
    utils.deserialized_data("data/interim/y_test.pkl"),
)

In [24]:
def drop_duplicate_data(X, y):
  """
  Drops duplicate rows from a DataFrame and keeps the target aligned with it.

  Rows of X are compared on all columns and the first occurrence of each
  duplicated row is kept. The target is then selected by index label so that
  it contains exactly the rows that remain in X. The inputs are not modified.

  Args:
    X: A pandas DataFrame representing the dataset.
    y: A pandas Series representing the target variable. Its index must
      contain every index label of X.

  Returns:
    A tuple containing the cleaned DataFrame and the corresponding target Series.

  Raises:
    TypeError: If X is not a DataFrame or y is not a Series.
    ValueError: If some index labels of X are missing from the index of y.
  """

  # Validate input parameters
  if not isinstance(X, pd.DataFrame):
    raise TypeError("X must be a pandas DataFrame")
  if not isinstance(y, pd.Series):
    raise TypeError("y must be a pandas Series")
  # The target is filtered by label below, so every label of X has to exist in y.
  if not X.index.isin(y.index).all():
    raise ValueError("y must contain every index label of X")

  print("Fungsi drop_duplicate_data: parameter telah divalidasi.")

  # Get initial shape
  print(f"Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah {X.shape}.")

  # Count duplicate rows (every occurrence after the first one)
  n_duplicates = int(X.duplicated().sum())
  print(f"Fungsi drop_duplicate_data: shape dari data yang duplicate adalah {(n_duplicates, X.shape[1])}.")

  # Calculate expected shape after dropping duplicates
  expected_shape = (X.shape[0] - n_duplicates, X.shape[1])
  print(f"Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah {expected_shape}.")

  # drop_duplicates returns a new frame, so the caller's DataFrame is left untouched.
  X_clean = X.drop_duplicates()
  # .loc is explicitly label-based; plain y[...] may fall back to positional lookup.
  y_clean = y.loc[X_clean.index]

  print(f"Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah {X_clean.shape}.")

  return X_clean, y_clean

In [25]:
# Remove duplicate rows from the training split; y_train is filtered to the rows that remain.
X_train, y_train = drop_duplicate_data(X_train, y_train)

Fungsi drop_duplicate_data: parameter telah divalidasi.
Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah (26064, 11).
Fungsi drop_duplicate_data: shape dari data yang duplicate adalah (96, 11).
Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah (25968, 11).
Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah (25968, 11).


In [26]:
def median_imputation(data, subset_data, fit=True):
  """
  Imputes missing numerical values in a DataFrame using median imputation.

  The function works in two modes selected by `fit`:
    * fit=True  computes the median of every column listed in `subset_data`.
    * fit=False fills missing values using the medians given in `subset_data`.

  The input DataFrame is never modified.

  Args:
    data: A pandas DataFrame containing the data to be imputed.
    subset_data: A list of column names to impute (for fit=True) or a dictionary of column names and their corresponding median values (for fit=False).
    fit: A boolean indicating whether to calculate medians or use pre-calculated values.

  Returns:
    If fit=True, returns a dictionary of column names and their corresponding median values.
    If fit=False, returns the imputed DataFrame.

  Raises:
    RuntimeError: If `data` is not a DataFrame, `fit` is not a bool, or
      `subset_data` has the wrong type for the selected mode.
  """

  # Validate input parameters
  if not isinstance(data, pd.DataFrame):
    raise RuntimeError("Fungsi median_imputation: parameter data haruslah bertipe DataFrame!")

  # `fit` selects the branch below, so its type is checked before it is used.
  if not isinstance(fit, bool):
    raise RuntimeError("Fungsi median_imputation: parameter fit haruslah bertipe boolean, bernilai True atau False.")

  if fit:
    if not isinstance(subset_data, list):
      raise RuntimeError("Fungsi median_imputation: untuk nilai parameter fit = True, subset_data harus bertipe list dan berisi daftar nama kolom yang ingin dicari nilai mediannya guna menjadi data imputasi pada kolom tersebut.")
  else:
    if not isinstance(subset_data, dict):
      raise RuntimeError("Fungsi median_imputation: untuk nilai parameter fit = False, subset_data harus bertipe dict dan berisi key yang merupakan nama kolom beserta value yang merupakan nilai median dari kolom tersebut.")

  print("Fungsi median_imputation: parameter telah divalidasi.")

  if fit:
    # Learn one median per requested column.
    imputation_data = {column: data[column].median() for column in subset_data}
    print(f"Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {imputation_data}.")
    return imputation_data

  print("Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:")
  print(data.isna().sum())
  print()

  # fillna with a dict fills each listed column with its own value and
  # returns a new DataFrame, leaving the caller's frame untouched.
  imputed = data.fillna(subset_data)

  print("Fungsi median_imputation: informasi count na setelah dilakukan imputasi:")
  print(imputed.isna().sum())
  print()

  return imputed

In [27]:
# Numerical columns whose missing values are filled with the training median.
subset_data = ['person_emp_length', 'loan_int_rate']

# Learn the medians from the training split only.
medians = median_imputation(X_train, subset_data, fit=True)

# Apply the training medians to every split, in the order train, test, validation.
X_train, X_test, X_valid = (
    median_imputation(split, medians, fit=False)
    for split in (X_train, X_test, X_valid)
)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {'person_emp_length': 4.0, 'loan_int_rate': 10.99}.
Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              734
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 2491
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     

In [28]:
def create_onehot_encoder(categories, path):
  """
  Fits a OneHotEncoder on a list of category labels and saves it to disk.

  Args:
    categories: A list with the category values the encoder should learn.
    path: A string with the location where the fitted encoder is saved.

  Returns:
    The fitted OneHotEncoder.

  Raises:
    RuntimeError: If categories is not a list or path is not a string.
  """

  # Reject bad arguments before fitting or writing anything.
  if not isinstance(categories, list):
    raise RuntimeError("Fungsi create_onehot_encoder: parameter categories haruslah bertipe list, berisi kategori yang akan dibuat encodernya.")
  if not isinstance(path, str):
    raise RuntimeError("Fungsi create_onehot_encoder: parameter path haruslah bertipe string, berisi lokasi pada disk komputer dimana encoder akan disimpan.")

  # The encoder expects 2-D input, so the labels are shaped into a single column.
  category_column = np.array(categories).reshape(-1, 1)
  encoder = OneHotEncoder().fit(category_column)
  utils.serialize_data(encoder, path)

  learned_categories = encoder.categories_[0].tolist()
  print(f"Kategori yang telah dipelajari adalah {learned_categories}")
  return encoder

In [29]:
# Category values of each categorical feature.
person_home_ownership = ['RENT', 'MORTGAGE', 'OWN','OTHER']
loan_intent = [
    'PERSONAL', 'EDUCATION','MEDICAL','VENTURE','DEBTCONSOLIDATION','HOMEIMPROVEMENT',
]
loan_grade = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
cb_person_default_on_file = ['Y', 'N']

# Fit one encoder per categorical feature and save each of them under models/.
ohe_home_ownership = create_onehot_encoder(
    categories=person_home_ownership, path='models/ohe_home_ownership.pkl'
)
ohe_loan_intent = create_onehot_encoder(
    categories=loan_intent, path='models/ohe_loan_intent.pkl'
)
ohe_loan_grade = create_onehot_encoder(
    categories=loan_grade, path='models/ohe_loan_grade.pkl'
)
# NOTE(review): the file name below is spelled "defaultb"; it is kept as-is
# because other code may load this exact path. Confirm before renaming.
ohe_default_on_file = create_onehot_encoder(
    categories=cb_person_default_on_file, path='models/ohe_defaultb_on_file.pkl'
)

Kategori yang telah dipelajari adalah ['MORTGAGE', 'OTHER', 'OWN', 'RENT']
Kategori yang telah dipelajari adalah ['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE']
Kategori yang telah dipelajari adalah ['A', 'B', 'C', 'D', 'E', 'F', 'G']
Kategori yang telah dipelajari adalah ['N', 'Y']


In [30]:
def ohe_transform(dataset, subset, prefix, ohe):
  """
  Transforms a categorical column in a DataFrame using a pre-trained OneHotEncoder.

  The column `subset` is replaced by one indicator column per category known
  to the encoder. The new columns are named "<prefix>_<category>" and are
  appended after the remaining original columns. The input DataFrame is not
  modified.

  Args:
    dataset: The DataFrame to transform.
    subset: The name of the column to encode.
    prefix: The prefix for the encoded columns.
    ohe: A pre-trained OneHotEncoder.

  Returns:
    The transformed DataFrame.

  Raises:
    RuntimeError: If an argument has the wrong type or `subset` is not a
      column of `dataset`.
  """

  # Validate input parameters
  if not isinstance(dataset, pd.DataFrame):
    raise RuntimeError("Fungsi ohe_transform: parameter dataset harus bertipe DataFrame!")
  if not isinstance(ohe, OneHotEncoder):
    raise RuntimeError("Fungsi ohe_transform: parameter ohe harus bertipe OneHotEncoder!")
  if not isinstance(prefix, str):
    raise RuntimeError("Fungsi ohe_transform: parameter prefix harus bertipe str!")
  if not isinstance(subset, str):
    raise RuntimeError("Fungsi ohe_transform: parameter subset harus bertipe str!")

  # Check if subset column exists in the DataFrame
  if subset not in dataset.columns:
    raise RuntimeError("Fungsi ohe_transform: parameter subset string namun data tidak ditemukan dalam daftar kolom yang terdapat pada parameter dataset.")

  print("Fungsi ohe_transform: parameter telah divalidasi.")

  # Print original column names
  print(f"Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah {list(dataset.columns)}.\n")

  # Create new column names for encoded columns
  col_names = [f"{prefix}_{category}" for category in ohe.categories_[0].tolist()]

  # An encoder fitted on a plain array has no feature names; handing it a
  # DataFrame makes scikit-learn warn about the mismatch, so the raw values
  # are passed in that case.
  column_frame = dataset[[subset]]
  if hasattr(ohe, "feature_names_in_"):
    encoder_input = column_frame
  else:
    encoder_input = column_frame.to_numpy()

  # Depending on the encoder configuration the result is a sparse matrix or
  # already dense; only sparse results need to be densified.
  encoded_values = ohe.transform(encoder_input)
  if hasattr(encoded_values, "toarray"):
    encoded_values = encoded_values.toarray()

  encoded = pd.DataFrame(np.asarray(encoded_values),
                         columns=col_names,
                         index=dataset.index)

  # Append the indicator columns, then remove the original categorical column.
  dataset = pd.concat([dataset, encoded], axis=1).drop(columns=[subset])

  # Print new column names
  print(f"Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah {list(dataset.columns)}.\n")

  return dataset

In [31]:
# Each categorical column is encoded with its own fitted encoder:
# (source column, prefix of the generated columns, encoder).
_ENCODING_PLAN = [
  ('person_home_ownership', 'home_ownership', ohe_home_ownership),
  ('loan_intent', 'loan_intent', ohe_loan_intent),
  ('loan_grade', 'loan_grade', ohe_loan_grade),
  ('cb_person_default_on_file', 'default_onfile', ohe_default_on_file),
]


def _encode_categoricals(frame):
  """Applies every step of _ENCODING_PLAN, in order, to one split."""
  for column, prefix, encoder in _ENCODING_PLAN:
    frame = ohe_transform(frame, column, prefix, encoder)
  return frame


# Splits are processed one after another: train, then test, then validation.
X_train, X_test, X_valid = (
  _encode_categoricals(split) for split in (X_train, X_test, X_valid)
)

Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length'].

Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].

Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_fi



In [18]:
# Standardize the features: KNN is distance-based, and logistic regression also trains more reliably on features with a comparable scale.

# Define the scaler helper functions.

def fit_scaler(data):
    """Fit a StandardScaler on `data` and return the fitted scaler."""
    return StandardScaler().fit(data)

def transform_scaler(data, scaler):
    """
    Scale `data` with an already fitted scaler and return a DataFrame.

    The output of `scaler.transform` is wrapped in a DataFrame that carries
    the column labels and the index of `data`.
    """
    scaled_frame = pd.DataFrame(scaler.transform(data))
    # Restore the labels of the input on both axes.
    return scaled_frame.set_axis(data.columns, axis=1).set_axis(data.index, axis=0)

In [19]:
# KNN and logistic regression are sensitive to feature scale, so the scaler is
# fitted on the training split only and then applied to every split.
scaler = fit_scaler(X_train)

# Scale the splits in the order train, validation, test.
X_train_clean, X_valid_clean, X_test_clean = (
    transform_scaler(split, scaler) for split in (X_train, X_valid, X_test)
)

In [20]:
# Save the standardized frames (X_*_clean) rather than the unscaled X_*;
# otherwise the scaling step has no effect on the stored features.
utils.serialize_data(X_train_clean,'data/processed/X_train_prep.pkl')
utils.serialize_data(X_valid_clean,'data/processed/X_valid_prep.pkl')
utils.serialize_data(X_test_clean,'data/processed/X_test_prep.pkl')

In [32]:
# Save the target of every split under data/processed.
for _target, _destination in (
    (y_train, 'data/processed/y_train_prep.pkl'),
    (y_valid, 'data/processed/y_valid_prep.pkl'),
    (y_test, 'data/processed/y_test_prep.pkl'),
):
    utils.serialize_data(_target, _destination)