In [4]:
import numpy as np
import pandas as pd

def lower_case_column_names(data):
    data.columns = [column.lower() for column in data.columns]
    return data

def rename_columns(data):
    data.rename(columns={'customer':'id', 'st':'location', 'customer lifetime value':'lifetime_value',
                         'income':'income', 'monthly premium auto':'premium', 'number of open complaints':'open_complaints',
                         'policy type':'policy', 'vehicle class':'vehicle', 'total claim amount':'claim_amount'}, inplace=True)
    return data

def clean_gender(x):
    if x in ['M', 'Male']:
        return 'Male'
    elif x in ['F', 'Femal', 'female']:
        return 'Female'
    else:
        return np.nan

def clean_location(x):
    if x in ['Washington', 'WA']:
        return 'Washington'
    elif x in ['Oregon']:
        return 'Oregon'
    elif x in ['Arizona', 'AZ']:
        return 'Arizona'
    elif x in ['Cali', 'California']:
        return 'California'
    elif x in ['Nevada']:
        return 'Nevada'
    else:
        return np.nan

def main():
    file1 = pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv')
    file1 = lower_case_column_names(file1)
    file1 = rename_columns(file1)
    
    file1['gender'] = list(map(clean_gender, file1['gender']))
    file1['location'] = list(map(clean_location, file1['location']))
    
    file1['location'] = file1['location'].astype(str)
    file1['lifetime_value'] = file1['lifetime_value'].str.replace('%', '').astype(float)
    
    nan_percentage = file1.isnull().mean() * 100
    print("NaN Percentage:")
    print(nan_percentage)
    
    null_counts = file1.isnull().sum()
    print("Null Counts:")
    print(null_counts)
    
    threshold = 8
    columns_to_check = ['id', 'location', 'gender', 'education', 'lifetime_value', 'income', 'premium',
                        'open_complaints', 'policy', 'vehicle', 'claim_amount']
    
    file1_dropped = file1.dropna(subset=columns_to_check, thresh=file1.shape[1] - threshold)
    file1_dropped = file1_dropped.reset_index(drop=True)
    
    len_missing_gender = len(file1_dropped[file1_dropped['gender'].isna()])
    print("Number of missing gender values after dropping rows:")
    print(len_missing_gender)
    
    file1_dropped['gender'] = file1_dropped['gender'].fillna('Female')
    
    file1_dropped.to_csv('file1_dropped.csv', index=False)
    
    duplicate_count = file1_dropped['id'].duplicated().sum()
    print("Number of duplicated IDs:")
    print(duplicate_count)

if __name__ == '__main__':
    main()


NaN Percentage:
id                 73.278443
location            0.000000
gender             76.197605
education          73.278443
lifetime_value     73.353293
income             73.278443
premium            73.278443
open_complaints    73.278443
policy             73.278443
vehicle            73.278443
claim_amount       73.278443
dtype: float64
Null Counts:
id                 2937
location              0
gender             3054
education          2937
lifetime_value     2940
income             2937
premium            2937
open_complaints    2937
policy             2937
vehicle            2937
claim_amount       2937
dtype: int64
Number of missing gender values after dropping rows:
117
Number of duplicated IDs:
0
