In [2]:
import numpy as np
import pandas as pd
import random

In [7]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows, seed=42):
    """Build a synthetic table of user finances plus risk-bucket allocations.

    Parameters
    ----------
    num_rows : int
        Number of rows to generate. ``0`` yields an empty frame that still
        carries every column.
    seed : int or None, default 42
        Value passed to both ``np.random.seed`` and ``random.seed``. Note that
        this reseeds the *global* NumPy and stdlib generators as a side effect.

    Returns
    -------
    pandas.DataFrame
        One row per synthetic user with the columns ``years_to_retire``,
        ``location``, ``salary``, ``investment_amount``, ``current_savings``,
        ``debt``, ``other_expenses``, ``number_of_dependents``,
        ``current_invested_amount``, ``house_ownership`` followed by eighteen
        integer columns ``s1_low`` .. ``s6_low``, ``s1_mid`` .. ``s6_mid``,
        ``s1_high`` .. ``s6_high``. Within each of the three buckets the six
        values are non-negative and sum to exactly 100.
    """
    # The order of the random draws below is significant: changing it changes
    # every generated value for a given seed.
    np.random.seed(seed)
    random.seed(seed)

    # Independent base features (randint's upper bound is exclusive).
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)

    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)

    # Savings grow with salary and with (40 - years_to_retire), and shrink
    # with each dependent; negative results are floored at zero.
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 50000).astype(int)
    current_savings[current_savings < 0] = 0

    # Invested amount is a 50-90% slice of savings, again reduced per dependent.
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0

    # Debt is up to 30% of salary, scaled by a factor that is largest when
    # years_to_retire == debt_peak_years, plus a fixed amount per dependent.
    # NOTE: debt is drawn independently of savings / invested amount; it only
    # feeds into other_expenses and investment_amount below.
    debt_peak_years = 20
    debt = (np.random.uniform(0, 0.3, num_rows) * salary * (1 - abs(years_to_retire - debt_peak_years) / 40) + number_of_dependents * 20000).astype(int)
    debt[debt < 0] = 0

    # Higher debt lowers other expenses (10% of debt is subtracted).
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0

    # New investment is what remains of a 10-30% salary slice after expenses
    # and debt take their share.
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0

    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])

    # Eighteen stdlib-random draws per row (low, then mid, then high), each
    # bucket rebalanced to integer percentages summing to 100.
    risk_allocation = [_risk_profile_row(
        low=[random.uniform(5, 15), random.uniform(0, 5), random.uniform(20, 30), random.uniform(15, 25), random.uniform(15, 25), random.uniform(25, 35)],
        mid=[random.uniform(15, 25), random.uniform(5, 10), random.uniform(10, 20), random.uniform(10, 20), random.uniform(10, 20), random.uniform(20, 30)],
        high=[random.uniform(20, 30), random.uniform(10, 20), random.uniform(5, 10), random.uniform(5, 10), random.uniform(10, 20), random.uniform(25, 35)]
    ) for _ in range(num_rows)]

    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })

    # Pin the column list so the schema is identical even when num_rows == 0
    # (an empty list of dicts would otherwise produce a frame with no columns).
    risk_columns = ['s{}_{}'.format(slot, bucket)
                    for bucket in ('low', 'mid', 'high')
                    for slot in range(1, 7)]
    risk_df = pd.DataFrame(risk_allocation, columns=risk_columns).astype('int64')

    return pd.concat([df, risk_df], axis=1)


def _rebalance_to_100(weights):
    """Truncate ``weights`` to non-negative ints that sum to exactly 100.

    The first slot absorbs the rounding difference. If that would push it
    below zero, it is held at zero and the remaining excess is removed one
    point at a time from whichever other slot is currently largest, so the
    total is still 100 and no slot goes negative.
    """
    shares = [max(0, int(w)) for w in weights]
    shares[0] += 100 - sum(shares)
    if shares[0] < 0:
        excess = -shares[0]
        shares[0] = 0
        for _ in range(excess):
            # The other slots sum to 100 + remaining excess > 0, so the
            # largest of them is always positive here.
            largest = max(range(1, len(shares)), key=shares.__getitem__)
            shares[largest] -= 1
    return shares


def _risk_profile_row(low, mid, high):
    """Return one row of ``s<slot>_<bucket>`` percentages for the three buckets."""
    row = {}
    for bucket, weights in (('low', low), ('mid', mid), ('high', high)):
        for slot, share in enumerate(_rebalance_to_100(weights), start=1):
            row['s{}_{}'.format(slot, bucket)] = share
    return row

# Produce the 50,000-row synthetic dataset.
user_data_final = generate_final_user_data(num_rows=50000)

# Persist it as CSV, leaving out the DataFrame's row index.
file_path_final = 'please_be_final.csv'
user_data_final.to_csv(path_or_buf=file_path_final, index=False)



In [4]:
# Generating the data for 50,000 rows with logical relations
# NOTE(review): generate_logical_user_data is not defined in any cell visible
# here, and this cell's execution count (In [4]) is lower than that of the
# cell defining generate_final_user_data (In [7]). Confirm the function is
# defined before this cell when the notebook is run top-to-bottom on a fresh
# kernel; otherwise this line raises NameError.
user_data_logical = generate_logical_user_data(50000)

# Saving to CSV (row index is not written)
file_path_logical = 'input_relation.csv'
user_data_logical.to_csv(file_path_logical, index=False)
