In [2]:
import numpy as np
import pandas as pd
import random

# <strong style='color: #A3CCDC'>My Logic</strong>

In [17]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    

    # More debt means less current savings or current invested amount
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0


    # debt_peak_years = 15
    # debt_factor = np.random.uniform(0, 0.3, num_rows)
    # debt = (debt_factor * (1 / (salary / 1000)) * (1 - abs(years_to_retire - debt_peak_years) / 40) + number_of_dependents * 20000)
    # debt[debt < 0] = 0

    # Adjusting features based on logical relations
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    # Higher debt results in lower other expenses but not vice versa
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    # Function to create integer risk allocations with minimum values
    def allocate_risk(low, mid, high):
        def adjust_to_100(values):
            total = sum(values)
            if total != 100:
                diff = 100 - total
                values[0] += diff
            return values
        
        low = adjust_to_100([int(x) for x in low])
        mid = adjust_to_100([int(x) for x in mid])
        high = adjust_to_100([int(x) for x in high])
        
        return {
            's1_low': max(0, low[0]), 's2_low': max(0, low[1]), 's3_low': max(0, low[2]), 's4_low': max(0, low[3]), 's5_low': max(0, low[4]), 's6_low': max(0, low[5]),
            's1_mid': max(0, mid[0]), 's2_mid': max(0, mid[1]), 's3_mid': max(0, mid[2]), 's4_mid': max(0, mid[3]), 's5_mid': max(0, mid[4]), 's6_mid': max(0, mid[5]),
            's1_high': max(0, high[0]), 's2_high': max(0, high[1]), 's3_high': max(0, high[2]), 's4_high': max(0, high[3]), 's5_high': max(0, high[4]), 's6_high': max(0, high[5]),
        }

    risk_allocation = [allocate_risk(
        low=[random.uniform(5, 15), random.uniform(0, 5), random.uniform(20, 30), random.uniform(15, 20), random.uniform(35, 45), random.uniform(5, 15)],
        mid=[random.uniform(15, 25), random.uniform(5, 10), random.uniform(10, 20), random.uniform(20, 30), random.uniform(15, 25), random.uniform(25, 35)],
        high=[random.uniform(25, 40), random.uniform(25, 35), random.uniform(5, 10), random.uniform(5, 10), random.uniform(0, 15), random.uniform(15, 25)]
    ) for _ in range(num_rows)]
    
    # Creating the DataFrame
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    full_df = pd.concat([df, risk_df], axis=1)
    
    return full_df

# Generating the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Saving to CSV
file_path_final = 's6_test.csv'
user_data_final.to_csv(file_path_final, index=False)



# <strong style='color: #A3CCDC'>GPT Commerce Logic</strong>

In [23]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)

    # Refined debt calculation with stronger inverse proportionality to salary
    # More debt means less current savings or current invested amount
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    # Adjusting features based on logical relations
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 50000 - debt * 0.1).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    # Stronger impact of debt on other expenses
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.5).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    # Function to create integer risk allocations with minimum values
    def allocate_risk(low, mid, high):
        def adjust_to_100(values):
            total = sum(values)
            if total != 100:
                diff = 100 - total
                values[0] += diff
            return values
        
        low = adjust_to_100([int(x) for x in low])
        mid = adjust_to_100([int(x) for x in mid])
        high = adjust_to_100([int(x) for x in high])
        
        return {
            's1_low': max(0, low[0]), 's2_low': max(0, low[1]), 's3_low': max(0, low[2]), 's4_low': max(0, low[3]), 's5_low': max(0, low[4]), 's6_low': max(0, low[5]),
            's1_mid': max(0, mid[0]), 's2_mid': max(0, mid[1]), 's3_mid': max(0, mid[2]), 's4_mid': max(0, mid[3]), 's5_mid': max(0, mid[4]), 's6_mid': max(0, mid[5]),
            's1_high': max(0, high[0]), 's2_high': max(0, high[1]), 's3_high': max(0, high[2]), 's4_high': max(0, high[3]), 's5_high': max(0, high[4]), 's6_high': max(0, high[5]),
        }

    risk_allocation = [allocate_risk(
        low=[random.uniform(5, 15), random.uniform(0, 5), random.uniform(20, 30), random.uniform(15, 20), random.uniform(35, 45), random.uniform(5, 15)],
        mid=[random.uniform(15, 25), random.uniform(5, 10), random.uniform(10, 20), random.uniform(20, 30), random.uniform(15, 25), random.uniform(25, 35)],
        high=[random.uniform(25, 40), random.uniform(25, 35), random.uniform(5, 10), random.uniform(5, 10), random.uniform(0, 15), random.uniform(15, 25)]
    ) for _ in range(num_rows)]
    
    # Creating the DataFrame
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    full_df = pd.concat([df, risk_df], axis=1)
    
    return full_df

# Generating the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Saving to CSV
file_path_final = 'commerce_logic.csv'
user_data_final.to_csv(file_path_final, index=False)


# <strong style='color: #A3CCDC'>Goal Savings</strong>

In [2]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    

    # More debt means less current savings or current invested amount
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    # Adjusting features based on logical relations
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    # Higher debt results in lower other expenses but not vice versa
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    # Function to create integer risk allocations with minimum values
    def allocate_risk(low, mid, high):
        def adjust_to_100(values):
            total = sum(values)
            if total != 100:
                diff = 100 - total
                values[0] += diff
            return values
        
        low = adjust_to_100([int(x) for x in low])
        mid = adjust_to_100([int(x) for x in mid])
        high = adjust_to_100([int(x) for x in high])
        
        return {
            's1_low': max(0, low[0]), 's2_low': max(0, low[1]), 's3_low': max(0, low[2]), 's4_low': max(0, low[3]), 's5_low': max(0, low[4]), 's6_low': max(0, low[5]),
            's1_mid': max(0, mid[0]), 's2_mid': max(0, mid[1]), 's3_mid': max(0, mid[2]), 's4_mid': max(0, mid[3]), 's5_mid': max(0, mid[4]), 's6_mid': max(0, mid[5]),
            's1_high': max(0, high[0]), 's2_high': max(0, high[1]), 's3_high': max(0, high[2]), 's4_high': max(0, high[3]), 's5_high': max(0, high[4]), 's6_high': max(0, high[5]),
        }

    risk_allocation = [allocate_risk(
        low=[random.uniform(5, 15), random.uniform(0, 5), random.uniform(20, 30), random.uniform(15, 20), random.uniform(35, 45), random.uniform(5, 15)],
        mid=[random.uniform(15, 25), random.uniform(5, 10), random.uniform(10, 20), random.uniform(20, 30), random.uniform(15, 25), random.uniform(25, 35)],
        high=[random.uniform(25, 40), random.uniform(25, 35), random.uniform(5, 10), random.uniform(5, 10), random.uniform(0, 15), random.uniform(15, 25)]
    ) for _ in range(num_rows)]
    
    # Creating the DataFrame
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    full_df = pd.concat([df, risk_df], axis=1)
    
    # Function to calculate goal amount based on returns and years_to_retire
    def calculate_goal_amount(investment_amount, allocations, years_to_retire, returns):
        months_to_retire = years_to_retire * 12
        total_amount = 0
        for alloc, ret in zip(allocations, returns):
            monthly_investment = (investment_amount * (alloc / 100))
            future_value = monthly_investment * (((1 + ret/12) ** months_to_retire - 1) / (ret/12))
            total_amount += future_value
        return total_amount

    
    # Returns for each investment type
    returns_low = [0.08, 0.15, 0.05, 0.06, 0.04, 0.05]
    returns_mid = [0.08, 0.15, 0.05, 0.06, 0.04, 0.05]
    returns_high = [0.08, 0.15, 0.05, 0.06, 0.04, 0.05]
    
    # Calculating goal amounts
    full_df['goal_low'] = full_df.apply(lambda row: calculate_goal_amount(row['investment_amount'], [row['s1_low'], row['s2_low'], row['s3_low'], row['s4_low'], row['s5_low'], row['s6_low']], row['years_to_retire'], returns_low), axis=1)
    full_df['goal_mid'] = full_df.apply(lambda row: calculate_goal_amount(row['investment_amount'], [row['s1_mid'], row['s2_mid'], row['s3_mid'], row['s4_mid'], row['s5_mid'], row['s6_mid']], row['years_to_retire'], returns_mid), axis=1)
    full_df['goal_high'] = full_df.apply(lambda row: calculate_goal_amount(row['investment_amount'], [row['s1_high'], row['s2_high'], row['s3_high'], row['s4_high'], row['s5_high'], row['s6_high']], row['years_to_retire'], returns_high), axis=1)
    
    return full_df

# Generating the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Saving to CSV
file_path_final = 'goal_amount.csv'
user_data_final.to_csv(file_path_final, index=False)


# <strong style='color: #F5742F'>Don't Run</strong>

In [4]:
# Generating the data for 50,000 rows with logical relations
user_data_logical = generate_logical_user_data(50000)

# Saving to CSV
file_path_logical = 'input_relation.csv'
user_data_logical.to_csv(file_path_logical, index=False)


# <strong>New Data 6th June</strong>

In [1]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    

    # More debt means less current savings or current invested amount
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    # Adjusting features based on logical relations
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    # Higher debt results in lower other expenses but not vice versa
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    # Function to create integer risk allocations with minimum values
    def allocate_risk(low, mid, high, adjustment_factor):
        def adjust_to_100(values):
            total = sum(values)
            if total != 100:
                diff = 100 - total
                values[0] += diff
            return values
        
        low = adjust_to_100([int(x * adjustment_factor) for x in low])
        mid = adjust_to_100([int(x * adjustment_factor) for x in mid])
        high = adjust_to_100([int(x * adjustment_factor) for x in high])
        
        return {
            's1_low': max(0, low[0]), 's2_low': max(0, low[1]), 's3_low': max(0, low[2]), 's4_low': max(0, low[3]), 's5_low': max(0, low[4]), 's6_low': max(0, low[5]),
            's1_mid': max(0, mid[0]), 's2_mid': max(0, mid[1]), 's3_mid': max(0, mid[2]), 's4_mid': max(0, mid[3]), 's5_mid': max(0, mid[4]), 's6_mid': max(0, mid[5]),
            's1_high': max(0, high[0]), 's2_high': max(0, high[1]), 's3_high': max(0, high[2]), 's4_high': max(0, high[3]), 's5_high': max(0, high[4]), 's6_high': max(0, high[5]),
        }

    # Normalizing features for adjustment
    max_salary = np.max(salary)
    max_years_to_retire = np.max(years_to_retire)
    max_debt = np.max(debt)
    
    normalized_salary = salary / max_salary
    normalized_years_to_retire = years_to_retire / max_years_to_retire
    normalized_debt = debt / max_debt
    
    risk_allocation = [allocate_risk(
        low=[random.uniform(5, 15), random.uniform(0, 5), random.uniform(20, 30), random.uniform(15, 20), random.uniform(35, 45), random.uniform(5, 15)],
        mid=[random.uniform(15, 25), random.uniform(5, 10), random.uniform(10, 20), random.uniform(20, 30), random.uniform(15, 25), random.uniform(25, 35)],
        high=[random.uniform(25, 40), random.uniform(25, 35), random.uniform(5, 10), random.uniform(5, 10), random.uniform(0, 15), random.uniform(15, 25)],
        adjustment_factor=(1 + 0.1 * normalized_salary[i] + 0.1 * normalized_years_to_retire[i] - 0.05 * normalized_debt[i])
    ) for i in range(num_rows)]
    
    # Creating the DataFrame
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    full_df = pd.concat([df, risk_df], axis=1)
    
    return full_df

# Generating the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Saving to CSV
file_path_final = 'new_data.csv'
user_data_final.to_csv(file_path_final, index=False)

In [2]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    
    # More debt means less current savings or current invested amount
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    # Adjusting features based on logical relations
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    # Higher debt results in lower other expenses but not vice versa
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    # Function to create integer risk allocations with minimum values
    def allocate_risk(low, mid, high, adjustment_factor):
        def adjust_to_100(values):
            total = sum(values)
            if total != 100:
                diff = 100 - total
                values[0] += diff
            return values
        
        low = adjust_to_100([int(x * adjustment_factor) for x in low])
        mid = adjust_to_100([int(x * adjustment_factor) for x in mid])
        high = adjust_to_100([int(x * adjustment_factor) for x in high])
        
        return {
            's1_low': max(0, low[0]), 's2_low': max(0, low[1]), 's3_low': max(0, low[2]), 's4_low': max(0, low[3]), 's5_low': max(0, low[4]), 's6_low': max(0, low[5]),
            's1_mid': max(0, mid[0]), 's2_mid': max(0, mid[1]), 's3_mid': max(0, mid[2]), 's4_mid': max(0, mid[3]), 's5_mid': max(0, mid[4]), 's6_mid': max(0, mid[5]),
            's1_high': max(0, high[0]), 's2_high': max(0, high[1]), 's3_high': max(0, high[2]), 's4_high': max(0, high[3]), 's5_high': max(0, high[4]), 's6_high': max(0, high[5]),
        }

    # Normalizing features for adjustment
    max_salary = np.max(salary)
    max_years_to_retire = np.max(years_to_retire)
    max_debt = np.max(debt)
    
    normalized_salary = salary / max_salary
    normalized_years_to_retire = years_to_retire / max_years_to_retire
    normalized_debt = debt / max_debt
    
    risk_allocation = [allocate_risk(
        low=[random.uniform(5, 15), random.uniform(0, 5), random.uniform(20, 30), random.uniform(15, 20), random.uniform(35, 45), random.uniform(5, 15)],
        mid=[random.uniform(15, 25), random.uniform(5, 10), random.uniform(10, 20), random.uniform(20, 30), random.uniform(15, 25), random.uniform(25, 35)],
        high=[random.uniform(25, 40), random.uniform(25, 35), random.uniform(5, 10), random.uniform(5, 10), random.uniform(0, 15), random.uniform(15, 25)],
        adjustment_factor=(1 + 0.1 * normalized_salary[i] + 0.1 * normalized_years_to_retire[i] - 0.05 * normalized_debt[i])
    ) for i in range(num_rows)]
    
    # Creating the DataFrame
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    full_df = pd.concat([df, risk_df], axis=1)
    
    return full_df

# Generating the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Adding outliers to make the dataset look more realistic
def add_outliers(data, num_outliers, feature_names):
    for feature in feature_names:
        outliers = np.random.choice(data.index, size=num_outliers, replace=False)
        data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
    return data

# Adding outliers to selected features
features_with_outliers = ['salary', 'investment_amount', 'current_savings', 'debt', 'other_expenses', 'current_invested_amount']
user_data_final = add_outliers(user_data_final, 100, features_with_outliers)

# Saving to CSV
file_path_final = 'new_data_with_outliers.csv'
user_data_final.to_csv(file_path_final, index=False)

# Displaying a sample of the data
print(user_data_final.head())


   years_to_retire   location    salary  investment_amount  current_savings  \
0               33    Lucknow  183246.0                0.0         119928.0   
1               19    Kolkata   29717.0                0.0         108246.0   
2               12    Chennai  159948.0            18725.0        1339139.0   
3               25       Pune  176368.0            17375.0         664623.0   
4               23  Bangalore  480555.0            35500.0        1463199.0   

       debt  other_expenses  number_of_dependents  current_invested_amount  \
0  159214.0        134129.0                     4                      0.0   
1   73254.0         37478.0                     3                  27235.0   
2       0.0        114612.0                     2                 691978.0   
3   27724.0         99438.0                     3                 324632.0   
4       0.0        199024.0                     3                 733991.0   

  house_ownership  ...  s3_mid  s4_mid  s5_mid  s6_mid  

  988660.57089761  446269.78481141  370351.28433155  859849.53154379
 1033331.59712931  437933.74801769  708624.79431687  609209.0213001
  738326.19252032  662304.17982664 1009629.5025523   744609.60413789
  108516.03828272  214836.29619997  462948.46556143  890132.36005465
  783796.68542471  680315.30503114  868130.5086493   423351.74019439
  808791.5814807   749595.80950142 1018399.40968905  399733.33634344
  339929.70540642  808428.18753857  118466.42513451  847110.92208147
  773491.71418091  482780.96525164  597164.16390912  208288.59807903
  798592.32483824  863342.51816329  814500.16995344  144991.9805225
  492405.39874928  960060.36645818  860543.28361513   73751.35115225
  846060.38323058 1065741.93199229  501347.09211321  301852.62743374
 1045026.27490333  804864.72451817  959639.71044032  270508.24813434
  177043.3262184   787655.26837388  397674.1040047   276315.94404584
  790148.37105565  402325.54646397   69227.64716961  137988.38818326
  232470.81313717  327012.70255433  

In [3]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    
    # More debt means less current savings or current invested amount
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    # Adjusting features based on logical relations
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    # Higher debt results in lower other expenses but not vice versa
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    # Normalize features
    max_salary = np.max(salary)
    max_years_to_retire = np.max(years_to_retire)
    max_debt = np.max(debt)
    
    normalized_salary = salary / max_salary
    normalized_years_to_retire = years_to_retire / max_years_to_retire
    normalized_debt = debt / max_debt
    
    # Create weighted sum targets
    def create_weighted_target():
        return (
            0.25 * normalized_salary +
            0.2 * normalized_years_to_retire +
            0.15 * normalized_debt +
            0.1 * (investment_amount / np.max(investment_amount)) +
            0.1 * (current_savings / np.max(current_savings)) +
            0.1 * (other_expenses / np.max(other_expenses)) +
            0.1 * (current_invested_amount / np.max(current_invested_amount))
        ) * 100

    target_values = np.array([create_weighted_target() for _ in range(18)]).T
    target_values = target_values.astype(int)

    # Adding outliers
    num_outliers = 100
    outlier_indices = np.random.choice(num_rows, num_outliers, replace=False)
    for idx in outlier_indices:
        feature_idx = np.random.randint(0, 18)
        target_values[idx, feature_idx] *= np.random.uniform(1.5, 3)

    # Ensuring targets are within specified ranges and integer
    target_ranges = {
        's1_low': (5, 15), 's2_low': (0, 5), 's3_low': (20, 30), 's4_low': (15, 20), 's5_low': (35, 45), 's6_low': (5, 15),
        's1_mid': (15, 25), 's2_mid': (5, 10), 's3_mid': (10, 20), 's4_mid': (20, 30), 's5_mid': (15, 25), 's6_mid': (25, 35),
        's1_high': (25, 40), 's2_high': (25, 35), 's3_high': (5, 10), 's4_high': (5, 10), 's5_high': (0, 15), 's6_high': (15, 25)
    }
    
    target_columns = list(target_ranges.keys())
    for i, col in enumerate(target_columns):
        min_val, max_val = target_ranges[col]
        target_values[:, i] = np.clip(target_values[:, i], min_val, max_val)

    target_df = pd.DataFrame(target_values, columns=target_columns)

    # Creating the DataFrame
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    full_df = pd.concat([df, target_df], axis=1)
    
    return full_df

# Generating the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Saving to CSV
file_path_final = 'new_data_with_importance.csv'
user_data_final.to_csv(file_path_final, index=False)

# Displaying a sample of the data
print(user_data_final.head())


   years_to_retire   location  salary  investment_amount  current_savings  \
0               33    Lucknow  183246                  0           119928   
1               19    Kolkata   29717                  0           108246   
2               12    Chennai  159948              18725          1339139   
3               25       Pune  176368              17375           664623   
4               23  Bangalore  480555              35500          1463199   

     debt  other_expenses  number_of_dependents  current_invested_amount  \
0  159214          134129                     4                        0   
1   73254           37478                     3                    27235   
2       0          114612                     2                   691978   
3   27724           99438                     3                   324632   
4       0          199024                     3                   733991   

  house_ownership  ...  s3_mid  s4_mid  s5_mid  s6_mid  s1_high  s2_high  \
0   

In [18]:
import numpy as np
import pandas as pd
import random

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    
    # More debt means less current savings or current invested amount
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    # Adjusting features based on logical relations
    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    # Higher debt results in lower other expenses but not vice versa
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    # Normalize features
    max_salary = np.max(salary)
    max_years_to_retire = np.max(years_to_retire)
    max_debt = np.max(debt)
    max_years = np.max(years_to_retire)
    
    normalized_salary = salary / max_salary
    normalized_years_to_retire = years_to_retire / max_years_to_retire
    normalized_debt = debt / max_debt
    normalized_years = years_to_retire / max_years
    
    # Allocate risk with significant feature influence
    def allocate_risk(low, mid, high, adjustment_factor):
        def adjust_to_100(values):
            total = sum(values)
            if total != 100:
                diff = 100 - total
                values[0] += diff
            return values
        
        low = adjust_to_100([int(x * adjustment_factor) for x in low])
        mid = adjust_to_100([int(x * adjustment_factor) for x in mid])
        high = adjust_to_100([int(x * adjustment_factor) for x in high])
        
        return {
            's1_low': max(0, low[0]), 's2_low': max(0, low[1]), 's3_low': max(0, low[2]), 's4_low': max(0, low[3]), 's5_low': max(0, low[4]), 's6_low': max(0, low[5]),
            's1_mid': max(0, mid[0]), 's2_mid': max(0, mid[1]), 's3_mid': max(0, mid[2]), 's4_mid': max(0, mid[3]), 's5_mid': max(0, mid[4]), 's6_mid': max(0, mid[5]),
            's1_high': max(0, high[0]), 's2_high': max(0, high[1]), 's3_high': max(0, high[2]), 's4_high': max(0, high[3]), 's5_high': max(0, high[4]), 's6_high': max(0, high[5]),
        }
    
    risk_allocation = [allocate_risk(
        low=[random.uniform(0, 25), random.uniform(0, 15), random.uniform(10, 45), random.uniform(15, 20), random.uniform(25, 55), random.uniform(0, 25)],
        mid=[random.uniform(10, 30), random.uniform(0, 20), random.uniform(10, 20), random.uniform(15, 45), random.uniform(10, 45), random.uniform(15, 35)],
        high=[random.uniform(20, 45), random.uniform(15, 50), random.uniform(0, 15), random.uniform(0, 15), random.uniform(0, 15), random.uniform(10, 25)],
        adjustment_factor=(1 + 0.1 * normalized_salary[i] + 0.2 * normalized_years_to_retire[i] - 0.15 * normalized_debt[i] + 0.2 * normalized_years[i])
    ) for i in range(num_rows)]
    
    # Creating the DataFrame
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    
    
    # Adding outliers to selected features
    def add_outliers(data, num_outliers, feature_names):
        for feature in feature_names:
            outliers = np.random.choice(data.index, size=num_outliers, replace=False)
            data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
        return data

    features_with_outliers = ['salary', 'investment_amount', 'current_savings', 'debt', 'other_expenses', 'current_invested_amount']
    df = add_outliers(df, 5000, features_with_outliers)

    full_df = pd.concat([df, risk_df], axis=1)
    
    return full_df

# Generate the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Save to CSV
file_path_final = 'real_madrid.csv'
user_data_final.to_csv(file_path_final, index=False)

print(user_data_final.head())


  922104.41979954  526130.55896651]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
   6813.86322909      0.        ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
  398844.39176697 5472961.19836151]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
      0.          44834.7216905 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
 397387.20661176 366977.79887087]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = da

   years_to_retire   location    salary  investment_amount  current_savings  \
0               33    Lucknow  183246.0                0.0         119928.0   
1               19    Kolkata   29717.0                0.0         108246.0   
2               12    Chennai  159948.0            18725.0        1339139.0   
3               25       Pune  176368.0            17375.0         664623.0   
4               23  Bangalore  480555.0            35500.0        1463199.0   

            debt  other_expenses  number_of_dependents  \
0  159214.000000        134129.0                     4   
1  167850.089575         37478.0                     3   
2       0.000000        114612.0                     2   
3   27724.000000         99438.0                     3   
4       0.000000        199024.0                     3   

   current_invested_amount house_ownership  ...  s3_mid  s4_mid  s5_mid  \
0                      0.0            Rent  ...      18      20      22   
1                  27235.0

In [19]:
import numpy as np
import pandas as pd
import random

# Adjusting Returns for Each Risk Level
returns_low = [0.04, 0.06, 0.05, 0.06, 0.04, 0.05]  # Slightly lower average returns
returns_mid = [0.06, 0.08, 0.07, 0.08, 0.06, 0.07]  # Moderate returns
returns_high = [0.10, 0.12, 0.11, 0.12, 0.10, 0.11]  # Higher potential returns

def calculate_goal_amount(profit, investment_amount, years_to_retire, returns):
    average_return = sum(returns) / len(returns)  # Simpler average return calculation
    months_to_retire = years_to_retire * 12
    future_value = investment_amount * ((1 + average_return / 12) ** months_to_retire - 1) / (average_return / 12)
    return future_value + profit  # Include profit in the future value for more direct impact


# Calculate Profits Function Modification
def calculate_profit(row, allocations, returns):
    investment_portion = [row['investment_amount'] * (alloc / 100) for alloc in allocations]
    annual_return = sum([part * ret for part, ret in zip(investment_portion, returns)])
    # Consider adding a risk factor that modifies profit based on the risk level
    risk_factor = 1 + (0.01 * sum(allocations))  # Higher risk leads to a small increase in profit
    profit = annual_return * risk_factor - (row['debt'] * 0.001)
    return profit

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Initial feature distributions
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_savings[current_savings < 0] = 0
    
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    current_invested_amount[current_invested_amount < 0] = 0
    
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    other_expenses[other_expenses < 0] = 0
    
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)
    investment_amount[investment_amount < 0] = 0
    
    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])
    
    max_salary = np.max(salary)
    max_years_to_retire = np.max(years_to_retire)
    max_debt = np.max(debt)
    max_years = np.max(years_to_retire)
    
    normalized_salary = salary / max_salary
    normalized_years_to_retire = years_to_retire / max_years_to_retire
    normalized_debt = debt / max_debt
    normalized_years = years_to_retire / max_years
    
    def allocate_risk(low, mid, high, adjustment_factor):
        def adjust_to_100(values):
            total = sum(values)
            if total != 100:
                diff = 100 - total
                values[0] += diff/6
                values[1] += diff/6
                values[2] += diff/6
                values[3] += diff/6
                values[4] += diff/6
                values[5] += diff/6
            return values
        
        low = adjust_to_100([int(x * adjustment_factor) for x in low])
        mid = adjust_to_100([int(x * adjustment_factor) for x in mid])
        high = adjust_to_100([int(x * adjustment_factor) for x in high])
        
        return {
            's1_low': max(0, low[0]), 's2_low': max(0, low[1]), 's3_low': max(0, low[2]), 's4_low': max(0, low[3]), 's5_low': max(0, low[4]), 's6_low': max(0, low[5]),
            's1_mid': max(0, mid[0]), 's2_mid': max(0, mid[1]), 's3_mid': max(0, mid[2]), 's4_mid': max(0, mid[3]), 's5_mid': max(0, mid[4]), 's6_mid': max(0, mid[5]),
            's1_high': max(0, high[0]), 's2_high': max(0, high[1]), 's3_high': max(0, high[2]), 's4_high': max(0, high[3]), 's5_high': max(0, high[4]), 's6_high': max(0, high[5]),
        }
    
    risk_allocation = [allocate_risk(
        low=[random.uniform(0, 20), random.uniform(0, 15), random.uniform(10, 45), random.uniform(15, 20), random.uniform(25, 55), random.uniform(0, 25)],
        mid=[random.uniform(10, 30), random.uniform(0, 20), random.uniform(10, 20), random.uniform(15, 45), random.uniform(10, 45), random.uniform(15, 35)],
        high=[random.uniform(20, 50), random.uniform(15, 50), random.uniform(0, 15), random.uniform(0, 15), random.uniform(0, 15), random.uniform(10, 20)],
        adjustment_factor=(1 + 0.25 * normalized_salary[i] + 0.3 * normalized_years_to_retire[i] - 0.15 * normalized_debt[i] + 0.35 * normalized_years[i])
    ) for i in range(num_rows)]
    
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    full_df = pd.concat([df, risk_df], axis=1)

    # Calculate profits based on a hypothetical profit formula
    full_df['profit_low'] = full_df.apply(lambda row: calculate_profit(row, [row['s1_low'], row['s2_low'], row['s3_low'], row['s4_low'], row['s5_low'], row['s6_low']], returns_low), axis=1)
    full_df['profit_mid'] = full_df.apply(lambda row: calculate_profit(row, [row['s1_mid'], row['s2_mid'], row['s3_mid'], row['s4_mid'], row['s5_mid'], row['s6_mid']], returns_mid), axis=1)
    full_df['profit_high'] = full_df.apply(lambda row: calculate_profit(row, [row['s1_high'], row['s2_high'], row['s3_high'], row['s4_high'], row['s5_high'], row['s6_high']], returns_high), axis=1)

    full_df['goal_low'] = full_df.apply(lambda row: calculate_goal_amount(row['profit_low'], row['investment_amount'], row['years_to_retire'], returns_low), axis=1)
    full_df['goal_mid'] = full_df.apply(lambda row: calculate_goal_amount(row['profit_mid'], row['investment_amount'], row['years_to_retire'], returns_mid), axis=1)
    full_df['goal_high'] = full_df.apply(lambda row: calculate_goal_amount(row['profit_high'], row['investment_amount'], row['years_to_retire'], returns_high), axis=1)

    # Adding outliers to selected features
    def add_outliers(data, num_outliers, feature_names):
        for feature in feature_names:
            outliers = np.random.choice(data.index, size=num_outliers, replace=False)
            data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
        return data

    features_with_outliers = ['salary', 'investment_amount', 'current_savings', 'debt', 'other_expenses', 'current_invested_amount']
    full_df = add_outliers(full_df, 8000, features_with_outliers)

    return full_df

# Generate the data for 50,000 rows with logical relations
user_data_final = generate_final_user_data(50000)

# Save to CSV
file_path_final = 'giraffe.csv'
user_data_final.to_csv(file_path_final, index=False)

print(user_data_final.head())


 1067765.92613058  220075.77612923]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
      0.          31903.26534459]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
 3147571.03063376   76749.90148789]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
 146499.34921059 256204.30168658]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
 837350.40497057 226042.32390402]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = da

   years_to_retire   location         salary  investment_amount  \
0               33    Lucknow  183246.000000                0.0   
1               19    Kolkata   29717.000000                0.0   
2               12    Chennai  352267.480339            18725.0   
3               25       Pune  176368.000000            17375.0   
4               23  Bangalore  480555.000000            35500.0   

   current_savings           debt  other_expenses  number_of_dependents  \
0         119928.0  159214.000000        134129.0                     4   
1         108246.0  167850.089575         37478.0                     3   
2        1339139.0       0.000000        114612.0                     2   
3         664623.0   27724.000000         99438.0                     3   
4        1463199.0       0.000000        199024.0                     3   

   current_invested_amount house_ownership  ...    s3_high   s4_high  \
0                      0.0            Rent  ...  11.333333  8.333333   
1 

In [11]:
import numpy as np
import pandas as pd
import random

def calculate_goal_amount(profit, investment_amount, salary, debt, years_to_retire, returns):
    months_to_retire = years_to_retire * 12
    future_value = profit + (investment_amount * salary / (debt + 1)) * ((1 + np.mean(returns) / 12) ** months_to_retire - 1) / (np.mean(returns) / 12)
    return future_value

def normalize_allocations(allocations):
    """ Normalizes the list of allocations to sum to 100%, proportionally adjusting the values. """
    total = sum(allocations)
    normalized = [int(round(100 * a / total)) for a in allocations]
    diff = 100 - sum(normalized)  # In case rounding errors occur
    for i in range(abs(diff)):  # Adjust to make sure the sum is exactly 100
        index = i % len(allocations)
        normalized[index] += np.sign(diff)
    return normalized

def allocate_risk(normalized_salary, normalized_years_to_retire, normalized_debt, normalized_years):
    adjustment_factor = (1 + 0.15 * normalized_salary + 0.2 * normalized_years_to_retire - 0.1 * normalized_debt + 0.2 * normalized_years)
    
    # Define initial allocations with flexibility to adjust as per requirement
    initial_low = [15, 15, 20, 20, 15, 15]
    initial_mid = [20, 20, 15, 15, 15, 15]
    initial_high = [25, 25, 10, 10, 15, 15]
    
    # Adjusting allocations based on the adjustment factor and normalizing them
    low_allocations = normalize_allocations([int(a * adjustment_factor) for a in initial_low])
    mid_allocations = normalize_allocations([int(a * adjustment_factor) for a in initial_mid])
    high_allocations = normalize_allocations([int(a * adjustment_factor) for a in initial_high])

    return {
        's1_low': low_allocations[0], 's2_low': low_allocations[1], 's3_low': low_allocations[2],
        's4_low': low_allocations[3], 's5_low': low_allocations[4], 's6_low': low_allocations[5],
        's1_mid': mid_allocations[0], 's2_mid': mid_allocations[1], 's3_mid': mid_allocations[2],
        's4_mid': mid_allocations[3], 's5_mid': mid_allocations[4], 's6_mid': mid_allocations[5],
        's1_high': high_allocations[0], 's2_high': high_allocations[1], 's3_high': high_allocations[2],
        's4_high': high_allocations[3], 's5_high': high_allocations[4], 's6_high': high_allocations[5],
    }

def generate_final_user_data(num_rows):
    np.random.seed(42)
    random.seed(42)
    
    # Data generation as before
    years_to_retire = np.random.randint(5, 40, num_rows)
    locations = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Lucknow']
    location = np.random.choice(locations, num_rows)
    
    salary = np.random.randint(20000, 500000, num_rows)
    number_of_dependents = np.random.randint(0, 5, num_rows)
    
    # Financial calculations as before
    debt_peak_years = 15
    debt = ((np.random.uniform(0, 0.3, num_rows) - salary * (1 - abs(years_to_retire - debt_peak_years) / 40)) + number_of_dependents * 20000 + (years_to_retire - debt_peak_years) * 10000).astype(int)
    debt[debt < 0] = 0

    current_savings = (salary * (40 - years_to_retire) * np.random.uniform(0.1, 0.5, num_rows) - number_of_dependents * 30000 - debt * 0.2).astype(int)
    current_invested_amount = (current_savings * np.random.uniform(0.5, 0.9, num_rows) - number_of_dependents * 20000).astype(int)
    other_expenses = (salary * np.random.uniform(0.3, 0.7, num_rows) + number_of_dependents * 10000 - debt * 0.1).astype(int)
    investment_amount = (salary * np.random.uniform(0.1, 0.3, num_rows) - other_expenses * 0.2 - debt * 0.1).astype(int)

    house_ownership = np.random.choice(['Own House', 'Rent', 'EMI'], num_rows, p=[0.5, 0.3, 0.2])

    max_salary = np.max(salary)
    max_years_to_retire = np.max(years_to_retire)
    max_debt = np.max(debt)
    max_years = np.max(years_to_retire)
    
    normalized_salary = salary / max_salary
    normalized_years_to_retire = years_to_retire / max_years_to_retire
    normalized_debt = debt / max_debt
    normalized_years = years_to_retire / max_years
    
    # Allocate risks based on normalized data
    risk_allocation = [allocate_risk(
        normalized_salary[i], normalized_years_to_retire[i], normalized_debt[i], normalized_years[i]
    ) for i in range(num_rows)]
    
    df = pd.DataFrame({
        'years_to_retire': years_to_retire,
        'location': location,
        'salary': salary,
        'investment_amount': investment_amount,
        'current_savings': current_savings,
        'debt': debt,
        'other_expenses': other_expenses,
        'number_of_dependents': number_of_dependents,
        'current_invested_amount': current_invested_amount,
        'house_ownership': house_ownership,
    })
    
    risk_df = pd.DataFrame(risk_allocation)
    full_df = pd.concat([df, risk_df], axis=1)

    # Calculate profits based on a hypothetical profit formula
    full_df['profit_low'] = full_df.apply(lambda row: (row['investment_amount'] * np.sum([row['s1_low'], row['s2_low'], row['s3_low'], row['s4_low'], row['s5_low'], row['s6_low']]) / 600) + row['salary'] - row['debt'] * 0.1, axis=1)
    full_df['profit_mid'] = full_df.apply(lambda row: (row['investment_amount'] * np.sum([row['s1_mid'], row['s2_mid'], row['s3_mid'], row['s4_mid'], row['s5_mid'], row['s6_mid']]) / 600) + row['salary'] - row['debt'] * 0.1, axis=1)
    full_df['profit_high'] = full_df.apply(lambda row: (row['investment_amount'] * np.sum([row['s1_high'], row['s2_high'], row['s3_high'], row['s4_high'], row['s5_high'], row['s6_high']]) / 600) + row['salary'] - row['debt'] * 0.1, axis=1)

    # Adding goal calculations based on profit
    returns_low = [0.08, 0.15, 0.05, 0.06, 0.04, 0.05]
    returns_mid = [0.08, 0.15, 0.05, 0.06, 0.04, 0.05]
    returns_high = [0.08, 0.15, 0.05, 0.06, 0.04, 0.05]

    full_df['goal_low'] = full_df.apply(lambda row: calculate_goal_amount(row['profit_low'], row['investment_amount'], row['salary'], row['debt'], row['years_to_retire'], returns_low), axis=1)
    full_df['goal_mid'] = full_df.apply(lambda row: calculate_goal_amount(row['profit_mid'], row['investment_amount'], row['salary'], row['debt'], row['years_to_retire'], returns_mid), axis=1)
    full_df['goal_high'] = full_df.apply(lambda row: calculate_goal_amount(row['profit_high'], row['investment_amount'], row['salary'], row['debt'], row['years_to_retire'], returns_high), axis=1)

    # Adding outliers to selected features
    def add_outliers(data, num_outliers, feature_names):
        for feature in feature_names:
            outliers = np.random.choice(data.index, size=num_outliers, replace=False)
            data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
        return data

    features_with_outliers = ['salary', 'investment_amount', 'current_savings', 'debt', 'other_expenses', 'current_invested_amount']
    full_df = add_outliers(full_df, 5000, features_with_outliers)

    return full_df

user_data_final = generate_final_user_data(50000)

# Save to CSV
file_path_final = 'pineapple.csv'
user_data_final.to_csv(file_path_final, index=False)

print(user_data_final.head())


  922104.41979954  526130.55896651]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
   6813.86322909 -52688.01897369]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
  398844.39176697 5472961.19836151]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
      0.          44834.7216905 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = data.loc[outliers, feature] * np.random.uniform(1.5, 3)
 397387.20661176 366977.79887087]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[outliers, feature] = da

   years_to_retire   location    salary  investment_amount  current_savings  \
0               33    Lucknow  183246.0            -3487.0         119928.0   
1               19    Kolkata   29717.0            -9977.0         108246.0   
2               12    Chennai  159948.0            18725.0        1339139.0   
3               25       Pune  176368.0            17375.0         664623.0   
4               23  Bangalore  480555.0            35500.0        1463199.0   

            debt  other_expenses  number_of_dependents  \
0  159214.000000        134129.0                     4   
1  167850.089575         37478.0                     3   
2       0.000000        114612.0                     2   
3   27724.000000         99438.0                     3   
4       0.000000        199024.0                     3   

   current_invested_amount house_ownership  ...  s3_high  s4_high  s5_high  \
0                 -14833.0            Rent  ...       10       10       15   
1                  2