In [1]:
# Drop the columns that are not needed for modelling.
# NOTE: `df` must already have been loaded by an earlier cell; on a fresh
# kernel this cell fails with `NameError: name 'df' is not defined`.
columns_to_drop = ["ID", "Customer_ID", "Month", "Name", "SSN"]
# errors="ignore" keeps the cell idempotent: because it rebinds `df`, a second
# run would otherwise raise KeyError for columns that were already removed.
df = df.drop(columns=columns_to_drop, errors="ignore")

NameError: name 'df' is not defined

In [None]:
def _credit_history_age_in_years(age_str):
    """
    Parse a credit-history string such as "22 Years and 6 Months" into years.

    Parameters:
    age_str (object): Raw cell value; only ``str`` values are parsed

    Returns:
    float: Years plus months / 12. NaN when the value is not a string or when
    it contains neither a "<n> Year(s)" nor a "<n> Month(s)" component.
    """
    if not isinstance(age_str, str):
        return np.nan

    years = re.search(r'(\d+)\s*Years?', age_str)
    months = re.search(r'(\d+)\s*Months?', age_str)

    # NaN means "could not parse". A genuine "0 Years and 0 Months" must stay
    # 0.0 rather than being mistaken for a missing value.
    if years is None and months is None:
        return np.nan

    total_years = 0.0
    if years:
        total_years += int(years.group(1))
    if months:
        total_years += int(months.group(1)) / 12

    return total_years


def convert_data_types(df):
    """
    Convert incorrect data types in the credit score dataset to appropriate types.

    The input dataframe is not modified; all conversions are applied to a copy.
    Values that cannot be parsed become missing (NaN / <NA>) instead of raising.

    Parameters:
    df (pandas.DataFrame): The input dataframe with incorrect data types

    Returns:
    pandas.DataFrame: A dataframe with corrected data types:
        - Age, Num_of_Loan, Num_of_Delayed_Payment -> nullable Int64
        - Annual_Income, Outstanding_Debt, Amount_invested_monthly,
          Monthly_Balance -> float ("$" and "," are stripped first)
        - Changed_Credit_Limit -> float
        - Credit_History_Age -> float64, expressed in years
        - Credit_Score -> numeric only if every value is a numeric string,
          otherwise left unchanged

    Raises:
    KeyError: If any column other than Credit_Score is missing from ``df``
    """
    # Create a copy to avoid modifying the original dataframe
    df_converted = df.copy()

    integer_columns = ('Age', 'Num_of_Loan', 'Num_of_Delayed_Payment')
    currency_columns = (
        'Annual_Income',
        'Outstanding_Debt',
        'Amount_invested_monthly',
        'Monthly_Balance',
    )

    # Whole-number columns: the nullable 'Int64' dtype is used so that values
    # coerced to missing do not force the column to float.
    # NOTE(review): .astype('Int64') is expected to raise if a value parses to
    # a non-integral number (e.g. "3.5") - confirm this cannot occur in the data.
    for column in integer_columns:
        df_converted[column] = pd.to_numeric(
            df_converted[column], errors='coerce'
        ).astype('Int64')

    # Monetary columns: remove "$" and "," from string values before parsing
    for column in currency_columns:
        stripped = df_converted[column].replace(r'[$,]', '', regex=True)
        df_converted[column] = pd.to_numeric(stripped, errors='coerce')

    df_converted['Changed_Credit_Limit'] = pd.to_numeric(
        df_converted['Changed_Credit_Limit'], errors='coerce'
    )

    # Credit history is text ("<n> Years and <m> Months"); store it as years.
    # The explicit cast guarantees float64 even for an empty column.
    df_converted['Credit_History_Age'] = (
        df_converted['Credit_History_Age']
        .apply(_credit_history_age_in_years)
        .astype('float64')
    )

    # Credit_Score is converted only when every value is a numeric string;
    # a categorical score is left untouched.
    try:
        if df_converted['Credit_Score'].str.isnumeric().all():
            df_converted['Credit_Score'] = pd.to_numeric(
                df_converted['Credit_Score'], errors='coerce'
            )
    except (AttributeError, KeyError, TypeError):
        # Best effort: the column is absent (KeyError), does not hold strings
        # so `.str` is unavailable (AttributeError), or the check produced an
        # ambiguous truth value (TypeError). Keep the column as is.
        pass

    return df_converted

In [None]:
# Run the dtype clean-up on the loaded frame, then list each column's dtype
# so the result of the conversion can be checked by eye.
df_converted = df.pipe(convert_data_types)
print(df_converted.dtypes)