# Dataset Join and Cleaning

Bu notebook quyidagilarni bajaradi:
1. CSV fayllarni tekshirish va validatsiya qilish
2. Ma'lumotlarni tozalash (currency, commas, categorical)
3. Barcha datasetlarni birlashtirish (join)
4. Final tozalangan datasetni saqlash


In [154]:
import pandas as pd
import numpy as np
from pathlib import Path

# Pandas display settings used for the notebook output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("‚úÖ Kutubxonalar yuklandi!")

# Folder that holds the input CSV files (relative to the notebook's working directory)
data_folder = Path('../converted_csv/')
print(f"\nüìÇ Ma'lumotlar papkasi: {data_folder}")
print(f"   Papka mavjud: {data_folder.exists()}")


‚úÖ Kutubxonalar yuklandi!

üìÇ Ma'lumotlar papkasi: ..\converted_csv
   Papka mavjud: True


In [155]:
# PHASE 1: CHECK AND VALIDATE THE CSV FILES
# Reads every expected CSV from data_folder into `datasets` and stores a
# per-file summary (or the error text) in `validation_results`.
print("="*70)
print("üìã PHASE 1: CSV FAYLLARNI VALIDATSIYA QILISH")
print("="*70)

# Logical dataset name -> file name inside data_folder
csv_files = {
    'application_metadata': 'application_metadata.csv',
    'demographics': 'demographics.csv',
    'credit_history': 'credit_history.csv',
    'loan_details': 'loan_details.csv',
    'financial_ratios': 'financial_ratios.csv',
    'geographic_data': 'geographic_data.csv'
}

datasets = {}
validation_results = {}

for name, filename in csv_files.items():
    filepath = data_folder / filename
    print(f"\nüìÑ {name.upper()}: {filename}")
    print("-" * 70)

    # A missing file is reported and skipped; it gets no entry in either dict
    if not filepath.exists():
        print(f"‚ùå Fayl topilmadi: {filepath}")
        continue

    try:
        # Read the CSV
        df = pd.read_csv(filepath)
        datasets[name] = df

        # Validation summary
        print(f"‚úÖ Fayl muvaffaqiyatli o'qildi")
        print(f"   O'lchami: {df.shape}")
        print(f"   Qatorlar: {df.shape[0]:,}")
        print(f"   Ustunlar: {df.shape[1]}")

        # Column list
        print(f"   Ustunlar: {list(df.columns)}")

        # Data types (count of columns per dtype)
        print(f"\n   Ma'lumot turlari:")
        print(df.dtypes.value_counts())

        # Missing values: per-column count and percentage of rows
        missing = df.isnull().sum()
        if missing.sum() > 0:
            print(f"\n   ‚ö†Ô∏è Yo'qolgan qiymatlar:")
            for col, count in missing[missing > 0].items():
                pct = (count / len(df) * 100).round(2)
                print(f"      {col}: {count} ({pct}%)")
        else:
            print(f"\n   ‚úÖ Yo'qolgan qiymatlar yo'q")

        # Check the key column; each dataset names its customer key differently.
        # NOTE(review): this mapping is loop-invariant and is rebuilt on every
        # iteration; it could be defined once before the loop.
        key_columns = {
            'application_metadata': 'customer_ref',
            'demographics': 'cust_id',
            'credit_history': 'customer_number',
            'loan_details': 'customer_id',
            'financial_ratios': 'cust_num',
            'geographic_data': 'id'
        }

        if name in key_columns:
            key_col = key_columns[name]
            if key_col in df.columns:
                print(f"\n   üîë Kalit ustun: {key_col}")
                print(f"      Noyob qiymatlar: {df[key_col].nunique():,}")
                print(f"      Min: {df[key_col].min()}, Max: {df[key_col].max()}")
                print(f"      Takrorlanganlar: {df[key_col].duplicated().sum()}")

                # Check the `default` column (only present in application_metadata)
                if name == 'application_metadata' and 'default' in df.columns:
                    print(f"\n   üéØ Target ustun: default")
                    print(f"      Qiymatlar taqsimoti:")
                    print(df['default'].value_counts().sort_index())
                    print(f"      Foizlar:")
                    print((df['default'].value_counts(normalize=True) * 100).round(2).sort_index())

        validation_results[name] = {
            'shape': df.shape,
            'columns': list(df.columns),
            'missing': missing.sum(),
            'dtypes': df.dtypes.to_dict()
        }

    except Exception as e:
        # Any failure while reading/summarising is reported and recorded as text
        print(f"‚ùå Xatolik: {e}")
        validation_results[name] = {'error': str(e)}

print(f"\n‚úÖ Validatsiya yakunlandi!")
print(f"   Muvaffaqiyatli yuklangan datasetlar: {len(datasets)}")


üìã PHASE 1: CSV FAYLLARNI VALIDATSIYA QILISH

üìÑ APPLICATION_METADATA: application_metadata.csv
----------------------------------------------------------------------
‚úÖ Fayl muvaffaqiyatli o'qildi
   O'lchami: (89999, 14)
   Qatorlar: 89,999
   Ustunlar: 14
   Ustunlar: ['customer_ref', 'application_id', 'application_hour', 'application_day_of_week', 'account_open_year', 'preferred_contact', 'referral_code', 'account_status_code', 'random_noise_1', 'num_login_sessions', 'num_customer_service_calls', 'has_mobile_app', 'paperless_billing', 'default']

   Ma'lumot turlari:
int64      10
object      3
float64     1
Name: count, dtype: int64

   ‚úÖ Yo'qolgan qiymatlar yo'q

   üîë Kalit ustun: customer_ref
      Noyob qiymatlar: 89,999
      Min: 10000, Max: 99998
      Takrorlanganlar: 0

   üéØ Target ustun: default
      Qiymatlar taqsimoti:
default
0    85405
1     4594
Name: count, dtype: int64
      Foizlar:
default
0    94.9
1     5.1
Name: proportion, dtype: float64

üìÑ D

In [156]:
# PHASE 2: DATA CLEANING FUNCTIONS
# Prints the section banner; the helper functions are defined below.
print("="*70)
print("üßπ PHASE 2: TOZALASH FUNKSIYALARI")
print("="*70)

def clean_currency(value):
    """Convert a currency-formatted value to a float.

    Removes ``$`` signs, thousands separators and surrounding whitespace,
    e.g. ``'$17,700'`` becomes ``17700.0``.

    Args:
        value: A number, a string such as ``'$1,234.50'``, or a missing value.

    Returns:
        The parsed ``float``; ``np.nan`` when ``value`` is missing or the
        cleaned text is not a valid number.
    """
    if pd.isna(value):
        return np.nan

    # Real numbers need no text clean-up.
    if isinstance(value, (int, float)):
        return float(value)

    cleaned = str(value).replace('$', '').replace(',', '').strip()

    try:
        return float(cleaned)
    except ValueError:
        # Only a malformed number is expected here. The previous bare
        # ``except:`` would also have swallowed KeyboardInterrupt/SystemExit.
        return np.nan


def standardize_loan_type(value):
    """Map a free-form loan type label onto a canonical name.

    Matching is case-insensitive and substring based. Missing values are
    returned unchanged; labels that match no rule are returned stripped.
    """
    if pd.isna(value):
        return value

    label = str(value).strip()
    lowered = label.lower()

    # Ordered rules: the first rule whose keywords ALL occur in the label wins.
    rules = (
        (('personal',), 'Personal'),
        (('mortgage',), 'Mortgage'),
        (('credit', 'card'), 'CreditCard'),
        (('auto',), 'Auto'),
    )
    for keywords, canonical in rules:
        if all(word in lowered for word in keywords):
            return canonical
    return label

def standardize_education(value):
    """Map a free-form education label onto a canonical level.

    Matching is case-insensitive and substring based. Missing values are
    returned unchanged; labels that match no rule are returned stripped.

    Returns:
        One of 'High School', 'Some College', 'Bachelor', 'Master',
        'Doctorate', 'Graduate', or the stripped input when nothing matches.
    """
    if pd.isna(value):
        return value

    label = str(value).strip()
    lowered = label.lower()

    # Ordered rules: the first rule with ANY keyword present wins. The generic
    # 'graduate' keyword is deliberately last so that labels such as
    # "High School Graduate" resolve to their specific level instead of being
    # captured by the 'graduate' substring.
    rules = (
        (('high school', 'highschool'), 'High School'),
        (('some college', 'somecollege'), 'Some College'),
        (('bachelor',), 'Bachelor'),
        (('master',), 'Master'),
        (('doctorate', 'phd'), 'Doctorate'),
        (('graduate',), 'Graduate'),
    )
    for keywords, canonical in rules:
        if any(word in lowered for word in keywords):
            return canonical
    return label

def standardize_marital_status(value):
    """Map a free-form marital status label onto a canonical name.

    Matching is case-insensitive and substring based. Missing values are
    returned unchanged; labels that match no keyword are returned stripped.
    """
    if pd.isna(value):
        return value

    label = str(value).strip()
    lowered = label.lower()

    # Checked in this order; the first keyword contained in the label wins.
    # NOTE(review): because this is a substring test, a label such as
    # "Unmarried" also resolves to 'Married'.
    ordered_keywords = (
        ('married', 'Married'),
        ('single', 'Single'),
        ('divorced', 'Divorced'),
        ('widowed', 'Widowed'),
    )
    matches = (canonical for keyword, canonical in ordered_keywords
               if keyword in lowered)
    return next(matches, label)

print("‚úÖ Tozalash funksiyalari yaratildi!")


üßπ PHASE 2: TOZALASH FUNKSIYALARI
‚úÖ Tozalash funksiyalari yaratildi!


In [157]:
# PHASE 3: CLEAN EACH DATASET
# Works on copies of the frames loaded in Phase 1 and stores the results in
# `cleaned_datasets`. Indexing `datasets[...]` raises KeyError for any file
# that was not loaded successfully in Phase 1.
print("="*70)
print("üßπ PHASE 3: DATASETLARNI TOZALASH")
print("="*70)

cleaned_datasets = {}

# 1. APPLICATION_METADATA - minimal cleaning
print("\n1Ô∏è‚É£ APPLICATION_METADATA tozalash...")
df_app = datasets['application_metadata'].copy()
# random_noise_1 is dropped later (Phase 5)
cleaned_datasets['application_metadata'] = df_app
print(f"   ‚úÖ Tozalandi: {df_app.shape}")

# 2. Clean DEMOGRAPHICS
print("\n2Ô∏è‚É£ DEMOGRAPHICS tozalash...")
df_demo = datasets['demographics'].copy()

# Parse annual_income from currency text to float
df_demo['annual_income'] = df_demo['annual_income'].apply(clean_currency)

# Standardize education labels
df_demo['education'] = df_demo['education'].apply(standardize_education)

# Standardize marital_status labels
df_demo['marital_status'] = df_demo['marital_status'].apply(standardize_marital_status)

# Fill missing employment_length with the column median
if df_demo['employment_length'].isnull().sum() > 0:
    median_emp = df_demo['employment_length'].median()
    df_demo['employment_length'] = df_demo['employment_length'].fillna(median_emp)
    print(f"   ‚ö†Ô∏è employment_length NaN to'ldirildi (median: {median_emp:.2f})")

cleaned_datasets['demographics'] = df_demo
print(f"   ‚úÖ Tozalandi: {df_demo.shape}")

# 3. Clean CREDIT_HISTORY
print("\n3Ô∏è‚É£ CREDIT_HISTORY tozalash...")
df_credit = datasets['credit_history'].copy()

# Fill missing num_delinquencies_2yrs with 0
if df_credit['num_delinquencies_2yrs'].isnull().sum() > 0:
    df_credit['num_delinquencies_2yrs'] = df_credit['num_delinquencies_2yrs'].fillna(0)
    print(f"   ‚ö†Ô∏è num_delinquencies_2yrs NaN to'ldirildi (0)")

cleaned_datasets['credit_history'] = df_credit
print(f"   ‚úÖ Tozalandi: {df_credit.shape}")

# 4. Clean LOAN_DETAILS
print("\n4Ô∏è‚É£ LOAN_DETAILS tozalash...")
df_loan = datasets['loan_details'].copy()

# Parse loan_amount from currency text to float
df_loan['loan_amount'] = df_loan['loan_amount'].apply(clean_currency)

# Standardize loan_type labels
df_loan['loan_type'] = df_loan['loan_type'].apply(standardize_loan_type)

cleaned_datasets['loan_details'] = df_loan
print(f"   ‚úÖ Tozalandi: {df_loan.shape}")

# 5. Clean FINANCIAL_RATIOS
print("\n5Ô∏è‚É£ FINANCIAL_RATIOS tozalash...")
df_fin = datasets['financial_ratios'].copy()

# Columns parsed with clean_currency (only those present in the frame)
currency_columns = [
    'monthly_income', 'existing_monthly_debt', 'monthly_payment',
    'revolving_balance', 'credit_usage_amount', 'available_credit',
    'total_monthly_debt_payment', 'total_debt_amount', 'monthly_free_cash_flow'
]

for col in currency_columns:
    if col in df_fin.columns:
        df_fin[col] = df_fin[col].apply(clean_currency)
        print(f"   ‚úÖ {col} tozalandi")

# Fill missing revolving_balance with 0 (this also covers values that
# clean_currency could not parse, since those come back as NaN)
if 'revolving_balance' in df_fin.columns and df_fin['revolving_balance'].isnull().sum() > 0:
    df_fin['revolving_balance'] = df_fin['revolving_balance'].fillna(0)
    print(f"   ‚ö†Ô∏è revolving_balance NaN to'ldirildi (0)")

cleaned_datasets['financial_ratios'] = df_fin
print(f"   ‚úÖ Tozalandi: {df_fin.shape}")

# 6. GEOGRAPHIC_DATA - no cleaning needed, copied as-is
print("\n6Ô∏è‚É£ GEOGRAPHIC_DATA tozalash...")
df_geo = datasets['geographic_data'].copy()
cleaned_datasets['geographic_data'] = df_geo
print(f"   ‚úÖ Tozalandi: {df_geo.shape}")

print(f"\n‚úÖ Barcha datasetlar tozalandi!")


üßπ PHASE 3: DATASETLARNI TOZALASH

1Ô∏è‚É£ APPLICATION_METADATA tozalash...


   ‚úÖ Tozalandi: (89999, 14)

2Ô∏è‚É£ DEMOGRAPHICS tozalash...
   ‚ö†Ô∏è employment_length NaN to'ldirildi (median: 5.20)
   ‚úÖ Tozalandi: (89999, 8)

3Ô∏è‚É£ CREDIT_HISTORY tozalash...
   ‚ö†Ô∏è num_delinquencies_2yrs NaN to'ldirildi (0)
   ‚úÖ Tozalandi: (89999, 12)

4Ô∏è‚É£ LOAN_DETAILS tozalash...
   ‚úÖ Tozalandi: (89999, 10)

5Ô∏è‚É£ FINANCIAL_RATIOS tozalash...
   ‚úÖ monthly_income tozalandi
   ‚úÖ existing_monthly_debt tozalandi
   ‚úÖ monthly_payment tozalandi
   ‚úÖ revolving_balance tozalandi
   ‚úÖ credit_usage_amount tozalandi
   ‚úÖ available_credit tozalandi
   ‚úÖ total_monthly_debt_payment tozalandi
   ‚úÖ total_debt_amount tozalandi
   ‚úÖ monthly_free_cash_flow tozalandi
   ‚ö†Ô∏è revolving_balance NaN to'ldirildi (0)
   ‚úÖ Tozalandi: (89999, 16)

6Ô∏è‚É£ GEOGRAPHIC_DATA tozalash...
   ‚úÖ Tozalandi: (89999, 8)

‚úÖ Barcha datasetlar tozalandi!


In [158]:
# PHASE 4: JOIN THE DATASETS
# Left-joins every cleaned dataset onto application_metadata using
# customer_ref on the left and each dataset's own key column on the right.
# NOTE(review): the merges do not pass `validate=`; a duplicated key on the
# right side would silently multiply rows. Consider validate='one_to_one'.
print("="*70)
print("üîó PHASE 4: DATASETLARNI BIRLASHTIRISH")
print("="*70)

# Base dataset - application_metadata (contains the `default` target)
df_final = cleaned_datasets['application_metadata'].copy()
print(f"\nüìä Asosiy dataset: application_metadata")
print(f"   O'lchami: {df_final.shape}")

# 1. Demographics join
print("\n1Ô∏è‚É£ Demographics join...")
df_demo = cleaned_datasets['demographics'].copy()
df_final = df_final.merge(
    df_demo,
    left_on='customer_ref',
    right_on='cust_id',
    how='left',
    suffixes=('', '_demo')
)
print(f"   ‚úÖ Join qilindi: {df_final.shape}")

# 2. Credit History join
print("\n2Ô∏è‚É£ Credit History join...")
df_credit = cleaned_datasets['credit_history'].copy()
df_final = df_final.merge(
    df_credit,
    left_on='customer_ref',
    right_on='customer_number',
    how='left',
    suffixes=('', '_credit')
)
print(f"   ‚úÖ Join qilindi: {df_final.shape}")

# 3. Loan Details join
print("\n3Ô∏è‚É£ Loan Details join...")
df_loan = cleaned_datasets['loan_details'].copy()
df_final = df_final.merge(
    df_loan,
    left_on='customer_ref',
    right_on='customer_id',
    how='left',
    suffixes=('', '_loan')
)
print(f"   ‚úÖ Join qilindi: {df_final.shape}")

# 4. Financial Ratios join
print("\n4Ô∏è‚É£ Financial Ratios join...")
df_fin = cleaned_datasets['financial_ratios'].copy()
df_final = df_final.merge(
    df_fin,
    left_on='customer_ref',
    right_on='cust_num',
    how='left',
    suffixes=('', '_financial')
)
print(f"   ‚úÖ Join qilindi: {df_final.shape}")

# 5. Geographic Data join
print("\n5Ô∏è‚É£ Geographic Data join...")
df_geo = cleaned_datasets['geographic_data'].copy()
df_final = df_final.merge(
    df_geo,
    left_on='customer_ref',
    right_on='id',
    how='left',
    suffixes=('', '_geo')
)
print(f"   ‚úÖ Join qilindi: {df_final.shape}")

print(f"\n‚úÖ Barcha datasetlar birlashtirildi!")
print(f"   Final o'lchami: {df_final.shape}")
print(f"   Qatorlar: {df_final.shape[0]:,}")
print(f"   Ustunlar: {df_final.shape[1]}")


üîó PHASE 4: DATASETLARNI BIRLASHTIRISH

üìä Asosiy dataset: application_metadata
   O'lchami: (89999, 14)

1Ô∏è‚É£ Demographics join...
   ‚úÖ Join qilindi: (89999, 22)

2Ô∏è‚É£ Credit History join...
   ‚úÖ Join qilindi: (89999, 34)

3Ô∏è‚É£ Loan Details join...
   ‚úÖ Join qilindi: (89999, 44)

4Ô∏è‚É£ Financial Ratios join...
   ‚úÖ Join qilindi: (89999, 60)

5Ô∏è‚É£ Geographic Data join...
   ‚úÖ Join qilindi: (89999, 68)

‚úÖ Barcha datasetlar birlashtirildi!
   Final o'lchami: (89999, 68)
   Qatorlar: 89,999
   Ustunlar: 68


In [159]:
# PHASE 5: DROP DUPLICATED AND USELESS COLUMNS
print("="*70)
print("üóëÔ∏è PHASE 5: TAKRORLANGAN USTUNLARNI O'CHIRISH")
print("="*70)

# Redundant key columns (after the joins only customer_ref is kept)
columns_to_drop = [
    'cust_id',           # from demographics
    'customer_number',  # from credit_history
    'customer_id',      # from loan_details
    'cust_num',         # from financial_ratios
    'id',               # from geographic_data
    'recent_inquiry_count',  # same as num_inquiries_6mo
    'random_noise_1'    # not useful
]

# Only drop the columns that actually exist in the frame
existing_drop_cols = [col for col in columns_to_drop if col in df_final.columns]

if existing_drop_cols:
    print(f"\nüóëÔ∏è O'chiriladigan ustunlar ({len(existing_drop_cols)} ta):")
    for col in existing_drop_cols:
        print(f"   - {col}")

    df_final = df_final.drop(columns=existing_drop_cols)
    print(f"\n‚úÖ Ustunlar o'chirildi")
    print(f"   Yangi o'lchami: {df_final.shape}")
else:
    print("\n‚ö†Ô∏è O'chiriladigan ustunlar topilmadi")

# Final column listing: index, name, dtype, missing count and percentage
print(f"\nüìã Final dataset ustunlari ({len(df_final.columns)} ta):")
for i, col in enumerate(df_final.columns.tolist(), 1):
    dtype = df_final[col].dtype
    null_count = df_final[col].isnull().sum()
    null_pct = (null_count / len(df_final) * 100).round(2) if null_count > 0 else 0
    print(f"   {i:3d}. {col:<40} | {str(dtype):<15} | NaN: {null_count:>6} ({null_pct:>5.2f}%)")


üóëÔ∏è PHASE 5: TAKRORLANGAN USTUNLARNI O'CHIRISH

üóëÔ∏è O'chiriladigan ustunlar (7 ta):
   - cust_id
   - customer_number
   - customer_id
   - cust_num
   - id
   - recent_inquiry_count
   - random_noise_1

‚úÖ Ustunlar o'chirildi
   Yangi o'lchami: (89999, 61)

üìã Final dataset ustunlari (61 ta):
     1. customer_ref                             | int64           | NaN:      0 ( 0.00%)
     2. application_id                           | int64           | NaN:      0 ( 0.00%)
     3. application_hour                         | int64           | NaN:      0 ( 0.00%)
     4. application_day_of_week                  | int64           | NaN:      0 ( 0.00%)
     5. account_open_year                        | int64           | NaN:      0 ( 0.00%)
     6. preferred_contact                        | object          | NaN:      0 ( 0.00%)
     7. referral_code                            | object          | NaN:      0 ( 0.00%)
     8. account_status_code                      | object       

In [160]:
# PHASE 6: FINAL VALIDATION AND SAVE
print("="*70)
print("‚úÖ PHASE 6: FINAL VALIDATSIYA VA SAQLASH")
print("="*70)

# Validation: overall shape
print(f"\nüìä Final dataset ma'lumotlari:")
print(f"   O'lchami: {df_final.shape}")
print(f"   Qatorlar: {df_final.shape[0]:,}")
print(f"   Ustunlar: {df_final.shape[1]}")

# Check the `default` column: counts and percentages per class
# NOTE(review): the last message below states 0 = risky (cannot repay) and
# 1 = safe (can repay). For a column named `default` the opposite reading
# would be the usual convention - confirm against the data dictionary.
if 'default' in df_final.columns:
    print(f"\nüéØ Target ustun (default) taqsimoti:")
    print(df_final['default'].value_counts().sort_index())
    print(f"\n   Foizlar:")
    print((df_final['default'].value_counts(normalize=True) * 100).round(2).sort_index())
    print(f"\n   ‚ö†Ô∏è Eslatma: 0 = XAVFLI (to'lay olmaydi), 1 = XAVFSIZ (to'lay oladi)")

# Missing values: table of columns with at least one NaN, largest first
missing = df_final.isnull().sum()
missing_pct = (missing / len(df_final) * 100).round(2)
missing_df = pd.DataFrame({
    'Ustun': missing.index,
    'Yo_qolgan': missing.values,
    'Foiz': missing_pct.values
})
missing_df = missing_df[missing_df['Yo_qolgan'] > 0].sort_values('Yo_qolgan', ascending=False)

if len(missing_df) > 0:
    print(f"\n‚ö†Ô∏è Yo'qolgan qiymatlar bo'lgan ustunlar ({len(missing_df)} ta):")
    print(missing_df.head(20).to_string(index=False))
else:
    print(f"\n‚úÖ Yo'qolgan qiymatlar yo'q!")

# Show the first 5 rows (bare expression so the notebook renders the frame)
print(f"\nüîç Birinchi 5 qator:")
df_final.head(5)


‚úÖ PHASE 6: FINAL VALIDATSIYA VA SAQLASH

üìä Final dataset ma'lumotlari:
   O'lchami: (89999, 61)
   Qatorlar: 89,999
   Ustunlar: 61

üéØ Target ustun (default) taqsimoti:
default
0    85405
1     4594
Name: count, dtype: int64

   Foizlar:
default
0    94.9
1     5.1
Name: proportion, dtype: float64

   ‚ö†Ô∏è Eslatma: 0 = XAVFLI (to'lay olmaydi), 1 = XAVFSIZ (to'lay oladi)

‚úÖ Yo'qolgan qiymatlar yo'q!

üîç Birinchi 5 qator:


Unnamed: 0,customer_ref,application_id,application_hour,application_day_of_week,account_open_year,preferred_contact,referral_code,account_status_code,num_login_sessions,num_customer_service_calls,has_mobile_app,paperless_billing,default,age,annual_income,employment_length,employment_type,education,marital_status,num_dependents,credit_score,num_credit_accounts,oldest_credit_line_age,oldest_account_age_months,total_credit_limit,num_delinquencies_2yrs,num_inquiries_6mo,num_public_records,num_collections,account_diversity_index,loan_type,loan_amount,loan_term,interest_rate,loan_purpose,loan_to_value_ratio,origination_channel,loan_officer_id,marketing_campaign,monthly_income,existing_monthly_debt,monthly_payment,debt_to_income_ratio,debt_service_ratio,payment_to_income_ratio,credit_utilization,revolving_balance,credit_usage_amount,available_credit,total_monthly_debt_payment,annual_debt_payment,loan_to_annual_income,total_debt_amount,monthly_free_cash_flow,cost_of_living_index,housing_price_index,previous_zip_code,regional_median_income,regional_median_rent,regional_unemployment_rate,state
0,10000,620515,5,6,2013,Mail,REF0000,ACT-2,13,2,1,1,0,41,61800.0,2.2,Full-time,Graduate,Married,2,696,14,22.8,273.6,169100.0,0.0,2,1,0,0.499,Personal,17700.0,36,12.5,Debt Consolidation,0.0,Direct Mail,1045,W,5150.0,738.64,592.13,0.258,0.258402,0.115,0.841,142213.1,142213.1,26886.9,1330.77,15969.24,0.286408,159913.1,3819.23,73.0,91.0,451,56000,1380.0,4.8,OH
1,10001,624978,4,2,2015,Phone,REF0000,ACT-3,6,1,1,1,1,38,28600.0,7.0,FULL_TIME,High School,Married,0,659,13,3.5,42.0,78200.0,0.0,6,0,0,0.298,Mortgage,114000.0,180,6.83,Refinance,0.774,Branch,1011,B,2383.33,392.21,1013.86,0.59,0.589959,0.425,0.971,75932.2,75932.2,2267.8,1406.07,16872.84,3.986014,189932.2,977.26,87.0,92.0,537,61000,1510.0,4.4,PA
2,10002,564658,10,3,2020,Phone,REF0000,ACT-3,1,2,1,0,0,18,20700.0,0.8,FULL_TIME,Bachelor,Single,0,662,3,0.0,0.0,41400.0,0.0,2,0,0,0.174,Personal,9300.0,36,13.99,Major Purchase,0.0,Online,1084,K,1725.0,204.07,317.81,0.303,0.302539,0.184,0.539,22314.6,22314.6,19085.4,521.88,6262.56,0.449275,31614.6,1203.12,103.0,125.0,679,74000,1920.0,3.9,VA
3,10003,621493,7,5,2010,Email,REF0000,A01,4,1,1,1,0,27,31400.0,4.8,Full Time,Bachelor,Single,0,676,8,9.0,108.0,60000.0,0.0,1,0,0,0.263,Personal,8700.0,48,13.26,Medical,0.0,Online,1048,A,2616.67,288.71,234.52,0.2,0.199961,0.09,0.147,8820.0,8820.0,51180.0,523.23,6278.76,0.27707,17520.0,2093.436667,121.0,158.0,719,75000,1690.0,5.8,CA
4,10004,637785,1,2,2020,Mail,REF0000,ACT-3,6,2,1,0,0,26,24600.0,5.2,Fulltime,High School,Single,0,678,7,8.0,96.0,49700.0,0.0,1,0,0,0.298,Personal,7200.0,24,10.77,Debt Consolidation,0.0,Branch,1055,S,2050.0,248.77,334.81,0.285,0.284673,0.163,0.488,24253.6,24253.6,25446.4,583.58,7002.96,0.292683,31453.6,1466.42,127.0,152.0,933,78000,1700.0,5.8,WA


In [161]:
# SAVE THE FINAL DATASET
# NOTE(review): the CSV is written here, before the recoding steps further
# down in this cell (employment_type, education, marital_status, loan_type,
# origination_channel) run, so the file holds the pre-recoding values.
print("="*70)
print("üíæ FINAL DATASETNI SAQLASH")
print("="*70)

# Written to the notebook's current working directory
output_file = Path('final_dataset.csv')

df_final.to_csv(output_file, index=False)

print(f"\n‚úÖ Final dataset saqlandi: {output_file}")
print(f"   O'lchami: {df_final.shape}")
print(f"   Qatorlar: {df_final.shape[0]:,}")
print(f"   Ustunlar: {df_final.shape[1]}")

# Report the file size in MB
if output_file.exists():
    file_size = output_file.stat().st_size / (1024**2)
    print(f"   Fayl hajmi: {file_size:.2f} MB")

print(f"\nüéâ Barcha ishlar muvaffaqiyatli yakunlandi!")
print(f"\nüìã Keyingi qadamlar:")
print(f"   1. Final datasetni ML model uchun tayyorlash")
print(f"   2. Feature engineering (yangi ustunlar yaratish)")
print(f"   3. Model training (Default Prediction va Loan Amount Prediction)")


# employment_type
# Full-time        50409
# FULL_TIME        12608
# Self-employed    10027
# Part-time         6711
# Contractor        4539
# SELF_EMPLOYED     3409
# PART_TIME         2296
# Name: count, dtype: int64

def clean_employment_type(value):
    """Collapse spelling variants of an employment type into one label.

    The comparison is an exact, case-insensitive match on the stripped text.
    Missing values are returned unchanged; unrecognised labels are returned
    stripped but otherwise untouched.
    """
    if pd.isna(value):
        return value

    stripped = str(value).strip()
    key = stripped.lower()

    # Canonical label -> accepted lower-case spellings
    aliases = {
        'Full-time': ('full-time', 'full time', 'fulltime', 'ft', 'full_time'),
        'Part-time': ('part-time', 'part time', 'parttime', 'pt', 'part_time'),
        'Self-employed': ('self-employed', 'self employed', 'selfemp',
                          'self emp', 'self-emp', 'self_employed'),
        'Contractor': ('contractor', 'contract'),
    }
    for canonical, spellings in aliases.items():
        if key in spellings:
            return canonical

    # Nothing matched: keep the original (stripped) value
    return stripped

df_final['employment_type'] = df_final['employment_type'].apply(clean_employment_type)


def clean_education(value):
    """Bucket an education label into Low / Medium / High Education.

    The comparison is an exact, case-insensitive match.

    Args:
        value: An education label, or a missing value.

    Returns:
        'Low Education', 'Medium Education' or 'High Education' for known
        labels; the input unchanged for anything else, including missing or
        non-text values.
    """
    # NaN/None have no ``.lower()``; pass them through instead of raising,
    # in line with the other cleaning helpers in this file.
    if not isinstance(value, str):
        return value

    key = value.lower()
    if key in ('some college', 'high school', 'high_school'):
        return 'Low Education'
    if key == 'bachelor':
        return 'Medium Education'
    if key in ('graduate', 'advanced'):
        return 'High Education'
    # NOTE(review): 'Master' and 'Doctorate', which standardize_education can
    # produce, are not bucketed here and pass through unchanged - confirm
    # that this is intended.
    return value
    

df_final['education'] = df_final['education'].apply(clean_education)

def clean_marital_status(value):
    """Reduce a marital status label to 'Married' or 'Not Married'.

    Args:
        value: A marital status label, or a missing value.

    Returns:
        'Married' when the label equals "married" (case-insensitive),
        'Not Married' for any other text, and the input unchanged when it is
        missing or not a string.
    """
    # NaN/None have no ``.lower()``; keep them missing rather than raising
    # or silently classifying them as 'Not Married'.
    if not isinstance(value, str):
        return value

    return 'Married' if value.lower() == 'married' else 'Not Married'
    

df_final['marital_status'] = df_final['marital_status'].apply(clean_marital_status)


def clean_loan_type(value):
    """Rename selected loan type labels.

    'cc' becomes 'CreditCard' and 'mortgage' becomes 'Home Loan'
    (case-insensitive, exact match); every other value is returned unchanged.

    Args:
        value: A loan type label, or a missing value.

    Returns:
        The renamed label, or the input unchanged (including missing or
        non-text values).
    """
    # NaN/None have no ``.lower()``; pass them through instead of raising.
    if not isinstance(value, str):
        return value

    key = value.lower()
    # Equality, not ``key in 'cc'``: ``in`` against a string is a substring
    # test, which also matched '', 'c', 'age', 'or', 'mort', ...
    if key == 'cc':
        return 'CreditCard'
    if key == 'mortgage':
        return 'Home Loan'
    return value

df_final['loan_type'] = df_final['loan_type'].apply(clean_loan_type)


def clear_origination_channel(value):
    """Group an origination channel into 'Digital' or 'Physical'.

    'online' and 'direct mail' map to 'Digital'; 'branch' and 'broker' map to
    'Physical' (case-insensitive, exact match).

    Args:
        value: An origination channel label, or a missing value.

    Returns:
        'Digital' or 'Physical' for known channels; the input unchanged for
        anything else, including missing or non-text values.
    """
    # NaN/None have no ``.lower()``; pass them through instead of raising.
    if not isinstance(value, str):
        return value

    key = value.lower()
    if key in ('online', 'direct mail'):
        return 'Digital'
    if key in ('branch', 'broker'):
        return 'Physical'
    # Previously an unknown channel fell off the end and became None, which
    # silently discarded the value; keep it, like the other cleaners do.
    return value

df_final['origination_channel'] = df_final['origination_channel'].apply(clear_origination_channel)

def convert_marital_status(value):
    """Encode marital status as a binary flag.

    Args:
        value: A marital status label, or a missing value.

    Returns:
        1 when ``value`` is exactly 'Married', otherwise 0 (including for
        missing or non-text values).
    """
    # Equality, not ``value in 'Married'``: ``in`` against a string is a
    # substring test ('' and 'arr' returned 1) and raises TypeError for
    # None/NaN.
    return 1 if value == 'Married' else 0
    
def convert_origination_channel(value):
    """Encode the origination channel group as a binary flag.

    Args:
        value: A channel group label, or a missing value.

    Returns:
        1 when ``value`` is exactly 'Digital', otherwise 0 (including for
        missing or non-text values).
    """
    # Equality, not ``value in 'Digital'``: ``in`` against a string is a
    # substring test ('' and 'git' returned 1) and raises TypeError for
    # None/NaN.
    return 1 if value == 'Digital' else 0
    
df_final['marital_status'] = df_final['marital_status'].apply(convert_marital_status)
df_final['origination_channel'] = df_final['origination_channel'].apply(convert_origination_channel)

üíæ FINAL DATASETNI SAQLASH

‚úÖ Final dataset saqlandi: final_dataset.csv
   O'lchami: (89999, 61)
   Qatorlar: 89,999
   Ustunlar: 61
   Fayl hajmi: 31.87 MB

üéâ Barcha ishlar muvaffaqiyatli yakunlandi!

üìã Keyingi qadamlar:
   1. Final datasetni ML model uchun tayyorlash
   2. Feature engineering (yangi ustunlar yaratish)
   3. Model training (Default Prediction va Loan Amount Prediction)


In [162]:
# DROP UNUSED COLUMNS BEFORE SAVING TO THE for_model FOLDER
# Produces `df_final_cleaned`; `df_final` itself is left untouched.
print("="*70)
print("üóëÔ∏è FOYDASIZ USTUNLARNI O'CHIRISH")
print("="*70)

# Columns to drop
columns_to_drop = ['state', 'marketing_campaign']

# Only drop the columns that actually exist in the frame
existing_drop_cols = [col for col in columns_to_drop if col in df_final.columns]

if existing_drop_cols:
    print(f"\nüóëÔ∏è O'chiriladigan ustunlar ({len(existing_drop_cols)} ta):")
    for col in existing_drop_cols:
        print(f"   - {col}")

    df_final_cleaned = df_final.drop(columns=existing_drop_cols)
    print(f"\n‚úÖ Ustunlar o'chirildi")
    print(f"   Eski o'lchami: {df_final.shape}")
    print(f"   Yangi o'lchami: {df_final_cleaned.shape}")
else:
    # Nothing to drop: still work on a copy so df_final stays independent
    print("\n‚ö†Ô∏è O'chiriladigan ustunlar topilmadi")
    df_final_cleaned = df_final.copy()

print(f"\nüìä Tozalangan dataset:")
print(f"   Qatorlar: {df_final_cleaned.shape[0]:,}")
print(f"   Ustunlar: {df_final_cleaned.shape[1]}")


üóëÔ∏è FOYDASIZ USTUNLARNI O'CHIRISH

üóëÔ∏è O'chiriladigan ustunlar (2 ta):
   - state
   - marketing_campaign

‚úÖ Ustunlar o'chirildi
   Eski o'lchami: (89999, 61)
   Yangi o'lchami: (89999, 59)

üìä Tozalangan dataset:
   Qatorlar: 89,999
   Ustunlar: 59


In [163]:
# SAVE TO THE for_model FOLDER
print("="*70)
print("üíæ FOR_MODEL PAPKASIGA SAQLASH")
print("="*70)

# Create the for_model folder (no error if it already exists)
for_model_folder = Path('for_model')
for_model_folder.mkdir(exist_ok=True)

print(f"\nüìÇ Papka yaratildi/yoki mavjud: {for_model_folder}")

# Save the reduced dataset
output_file = for_model_folder / 'dataset_for_models.csv'

df_final_cleaned.to_csv(output_file, index=False)

print(f"\n‚úÖ Tozalangan dataset saqlandi: {output_file}")
print(f"   O'lchami: {df_final_cleaned.shape}")
print(f"   Qatorlar: {df_final_cleaned.shape[0]:,}")
print(f"   Ustunlar: {df_final_cleaned.shape[1]}")

# Report the file size in MB
if output_file.exists():
    file_size = output_file.stat().st_size / (1024**2)
    print(f"   Fayl hajmi: {file_size:.2f} MB")

# List the columns that were dropped in the previous cell
print(f"\nüìã O'chirilgan ustunlar:")
for col in existing_drop_cols:
    print(f"   - {col}")

print(f"\n‚úÖ Final dataset (final_dataset.csv) o'zgarmadi - original saqlanadi")
print(f"‚úÖ Tozalangan dataset for_model papkasiga saqlandi!")

# Show the first 5 rows (bare expression so the notebook renders the frame)
print(f"\nüîç Tozalangan dataset - birinchi 5 qator:")
df_final_cleaned.head(5)

üíæ FOR_MODEL PAPKASIGA SAQLASH

üìÇ Papka yaratildi/yoki mavjud: for_model

‚úÖ Tozalangan dataset saqlandi: for_model\dataset_for_models.csv
   O'lchami: (89999, 59)
   Qatorlar: 89,999
   Ustunlar: 59
   Fayl hajmi: 31.09 MB

üìã O'chirilgan ustunlar:
   - state
   - marketing_campaign

‚úÖ Final dataset (final_dataset.csv) o'zgarmadi - original saqlanadi
‚úÖ Tozalangan dataset for_model papkasiga saqlandi!

üîç Tozalangan dataset - birinchi 5 qator:


Unnamed: 0,customer_ref,application_id,application_hour,application_day_of_week,account_open_year,preferred_contact,referral_code,account_status_code,num_login_sessions,num_customer_service_calls,has_mobile_app,paperless_billing,default,age,annual_income,employment_length,employment_type,education,marital_status,num_dependents,credit_score,num_credit_accounts,oldest_credit_line_age,oldest_account_age_months,total_credit_limit,num_delinquencies_2yrs,num_inquiries_6mo,num_public_records,num_collections,account_diversity_index,loan_type,loan_amount,loan_term,interest_rate,loan_purpose,loan_to_value_ratio,origination_channel,loan_officer_id,monthly_income,existing_monthly_debt,monthly_payment,debt_to_income_ratio,debt_service_ratio,payment_to_income_ratio,credit_utilization,revolving_balance,credit_usage_amount,available_credit,total_monthly_debt_payment,annual_debt_payment,loan_to_annual_income,total_debt_amount,monthly_free_cash_flow,cost_of_living_index,housing_price_index,previous_zip_code,regional_median_income,regional_median_rent,regional_unemployment_rate
0,10000,620515,5,6,2013,Mail,REF0000,ACT-2,13,2,1,1,0,41,61800.0,2.2,Full-time,High Education,1,2,696,14,22.8,273.6,169100.0,0.0,2,1,0,0.499,Personal,17700.0,36,12.5,Debt Consolidation,0.0,1,1045,5150.0,738.64,592.13,0.258,0.258402,0.115,0.841,142213.1,142213.1,26886.9,1330.77,15969.24,0.286408,159913.1,3819.23,73.0,91.0,451,56000,1380.0,4.8
1,10001,624978,4,2,2015,Phone,REF0000,ACT-3,6,1,1,1,1,38,28600.0,7.0,Full-time,Low Education,1,0,659,13,3.5,42.0,78200.0,0.0,6,0,0,0.298,Home Loan,114000.0,180,6.83,Refinance,0.774,0,1011,2383.33,392.21,1013.86,0.59,0.589959,0.425,0.971,75932.2,75932.2,2267.8,1406.07,16872.84,3.986014,189932.2,977.26,87.0,92.0,537,61000,1510.0,4.4
2,10002,564658,10,3,2020,Phone,REF0000,ACT-3,1,2,1,0,0,18,20700.0,0.8,Full-time,Medium Education,0,0,662,3,0.0,0.0,41400.0,0.0,2,0,0,0.174,Personal,9300.0,36,13.99,Major Purchase,0.0,1,1084,1725.0,204.07,317.81,0.303,0.302539,0.184,0.539,22314.6,22314.6,19085.4,521.88,6262.56,0.449275,31614.6,1203.12,103.0,125.0,679,74000,1920.0,3.9
3,10003,621493,7,5,2010,Email,REF0000,A01,4,1,1,1,0,27,31400.0,4.8,Full-time,Medium Education,0,0,676,8,9.0,108.0,60000.0,0.0,1,0,0,0.263,Personal,8700.0,48,13.26,Medical,0.0,1,1048,2616.67,288.71,234.52,0.2,0.199961,0.09,0.147,8820.0,8820.0,51180.0,523.23,6278.76,0.27707,17520.0,2093.436667,121.0,158.0,719,75000,1690.0,5.8
4,10004,637785,1,2,2020,Mail,REF0000,ACT-3,6,2,1,0,0,26,24600.0,5.2,Full-time,Low Education,0,0,678,7,8.0,96.0,49700.0,0.0,1,0,0,0.298,Personal,7200.0,24,10.77,Debt Consolidation,0.0,0,1055,2050.0,248.77,334.81,0.285,0.284673,0.163,0.488,24253.6,24253.6,25446.4,583.58,7002.96,0.292683,31453.6,1466.42,127.0,152.0,933,78000,1700.0,5.8
