In [6]:
import sys
sys.path.append('../')

from src import data_processing
import numpy as np

In [12]:
# Read the raw CSV as an all-string table; row 0 is used as the header below.
raw_data = np.genfromtxt(
    '../data/raw/BankChurners.csv',
    delimiter=',',
    encoding='utf8',
    dtype=str,
)
# Drop every double-quote character from every cell in one vectorized pass.
raw_data = np.char.replace(raw_data, '"', '')
header = list(raw_data[0])
data = raw_data[1:]

# Columns kept for the rest of the notebook ('Attrition_Flag' is encoded as the
# target in a later cell).
selected_features = [
    'Attrition_Flag',
    'Total_Trans_Ct', 'Total_Trans_Amt', 'Total_Ct_Chng_Q4_Q1',
    'Total_Revolving_Bal', 'Avg_Utilization_Ratio',
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit',
]

# Map each requested column name to its position in the raw header, then slice
# those columns out in the requested order.
selected_indices = list(map(header.index, selected_features))
data_selected = data[:, selected_indices]
header_selected = selected_features

print("Selected data size:", data_selected.shape)

Selected data size: (10127, 10)


In [13]:
# Work on copies so the selected table and its header stay untouched.
processed_data = data_selected.copy()
header = list(header_selected)

# Binary-encode the target: 1 for 'Attrited Customer', 0 for any other label.
target_idx = header.index('Attrition_Flag')
attrition_column = processed_data[:, target_idx]
target_encoded = (attrition_column == 'Attrited Customer').astype(int)

# processed_data is still a string array here (it is cast to float in a later
# cell), so the codes are written back as the strings '1' / '0'.
processed_data[:, target_idx] = target_encoded

In [14]:
# Cast the whole table to float so arithmetic can be done on the columns.
# (Casting an already-float array is harmless, so this is safe on re-run.)
processed_data = processed_data.astype(float)

revolving_bal_idx = header.index('Total_Revolving_Bal')
credit_limit_idx = header.index('Credit_Limit')

revolving_bal = processed_data[:, revolving_bal_idx]
credit_limit = processed_data[:, credit_limit_idx]
# The small epsilon keeps the division finite if a credit limit is exactly 0.
# NOTE(review): 'Avg_Utilization_Ratio' is already among the selected columns;
# if it is defined as balance / limit this new feature may be redundant — confirm.
utilization_ratio = revolving_bal / (credit_limit + 1e-6)

# Make the cell idempotent: the original unconditionally appended the column and
# the header entry, so running the cell twice produced a duplicated feature.
# On a re-run the existing column is overwritten instead.
if 'Custom_Utilization_Ratio' in header:
    processed_data[:, header.index('Custom_Utilization_Ratio')] = utilization_ratio
else:
    processed_data = np.c_[processed_data, utilization_ratio]
    header.append('Custom_Utilization_Ratio')

print("Add new feature Custom_Utilization_Ratio.")
print("New shape:", processed_data.shape)

Add new feature Custom_Utilization_Ratio.
New shape: (10127, 11)


In [16]:
# Split the table into the target column and the feature matrix. The target is
# located by name (as the other cells do) instead of assuming it is column 0.
target_idx = header.index('Attrition_Flag')
target_column = processed_data[:, [target_idx]]  # keep as an (n, 1) column
feature_columns = np.delete(processed_data, target_idx, axis=1)
feature_header = header[:target_idx] + header[target_idx + 1:]

print("Remove Outliers")

# Features whose outlying rows are dropped.
outlier_check_cols = [
    'Credit_Limit', 'Total_Revolving_Bal',
    'Total_Trans_Amt', 'Total_Trans_Ct',
    'Custom_Utilization_Ratio'
]

# Union of row indices flagged as outliers in any of the checked columns.
indices_to_remove = set()

for col_name in outlier_check_cols:
    if col_name not in feature_header:
        print(f"'{col_name}' is not in header.")
        continue

    col_idx = feature_header.index(col_name)
    column_data = feature_columns[:, col_idx]

    # Project helper; presumably returns the row indices outside the IQR
    # fences for this column — verify against src/data_processing.
    outlier_indices = data_processing.get_outlier_indices_iqr(column_data)

    print(f"[{col_name}] Finds {len(outlier_indices)} outliers.")

    indices_to_remove.update(outlier_indices)

indices_to_remove = sorted(indices_to_remove)

print(f"\nTotal removed outliners: {len(indices_to_remove)}")

original_rows = feature_columns.shape[0]

# Drop the same rows from features and target so they stay aligned.
if len(indices_to_remove) > 0:
    feature_columns = np.delete(feature_columns, indices_to_remove, axis=0)
    target_column = np.delete(target_column, indices_to_remove, axis=0)

print(f"Data shape after removing outliers: {feature_columns.shape[0]} row")
print(f"Removed: {original_rows - feature_columns.shape[0]} row")

# Normalization (only the features are scaled; the target is left as-is).
# NOTE(review): scaling is applied to the full dataset; if a train/test split
# happens later, confirm that fitting the scaler on all rows is intended.
standardized_features = data_processing.standardize_features(feature_columns)

# Re-assemble the table with the target as the first column.
final_processed_data = np.hstack((target_column, standardized_features))

final_header = [header[target_idx]] + feature_header

# Sanity check: one header entry per output column.
assert final_processed_data.shape[1] == len(final_header)

Remove Outliers
[Credit_Limit] Finds 984 outliers.
[Total_Revolving_Bal] Finds 0 outliers.
[Total_Trans_Amt] Finds 896 outliers.
[Total_Trans_Ct] Finds 2 outliers.
[Custom_Utilization_Ratio] Finds 0 outliers.

Total removed outliners: 1684
Data shape after removing outliers: 8443 row
Removed: 1684 row


In [11]:
output_path = '../data/processed/data_processed.csv'

# Persist the fully processed table (outliers removed, features standardized)
# together with its matching header. The original cell wrote `processed_data`
# / `header`, i.e. the table from *before* outlier removal and scaling, so the
# result of the cleaning cell was never saved.
data_to_save = np.vstack((final_header, final_processed_data.astype(str)))

# Everything is already text at this point, hence fmt='%s'.
np.savetxt(output_path, data_to_save, delimiter=',', fmt='%s', encoding='utf-8')

print(f"Save successfully: {output_path}")

Save successfully: ../data/processed/data_processed.csv
