In [2]:
import numpy as np
import pandas as pd

## Dataset Creation of four groups (Random Noise, Yes, No, Considered but No)

In this file, the first synthetic dataset is created. It contains three distinct homeowner groups, each reflecting a different behavior regarding energy consultations.

The target variable is called booked_energy_consultation, which indicates whether a homeowner booked an energy consultation or not.
- The “yes” group represents homeowners who booked a consultation.
- The “no” group includes those who decided not to book one.
- The “Considered but not used” group (referred to as `considered_but_no` in this notebook) consists of homeowners who initially showed interest but did not follow through.

Additionally, a random noise group was added to simulate real-world unpredictability and increase the dataset’s realism.
Each group is first stored in a separate DataFrame. These are then combined into one final dataset and exported as an Excel file.

### Creating Random Noise

In real-world datasets, not all data points follow clear or predictable patterns. To make our synthetic dataset more realistic, we introduced a group of randomly generated entries that do not reflect any of the predefined customer profiles. This random noise simulates real-life variability and uncertainty, which helps prevent the model from overfitting.

In [3]:
# Seed NumPy's global generator so the noise group is identical on every run.
# NOTE: every draw below consumes the global stream, so statement order matters.
np.random.seed(42)

# Number of rows in the random-noise group
n = 1500


def _rounded_clipped_normal(mean, sd, low, high, size, decimals=0):
    """Draw `size` normal samples, round to `decimals`, clip to [low, high], cast to int."""
    samples = np.random.normal(mean, sd, size)
    return np.clip(np.round(samples, decimals), low, high).astype(int)


# --- Demographics -----------------------------------------------------------
age = _rounded_clipped_normal(40, 25, 20, 99, n)

gender = np.random.choice(
    ['Male', 'Female', 'Other', 'Prefer not to say'],
    size=n,
    p=[0.46, 0.46, 0.04, 0.04],
)

household_size = _rounded_clipped_normal(3, 1, 1, 10, n)

# Draw a working-age status first, then override it with 'Retired' from age 67 on
occupation_status = np.where(
    age >= 67,
    'Retired',
    np.random.choice(['Employed', 'Self-employed', 'Unemployed'], size=n, p=[0.8, 0.16, 0.04]),
)

# General income distribution, rounded to the nearest hundred
income = _rounded_clipped_normal(53000, 19000, 10000, 500000, n, decimals=-2)

# Unemployed and retired rows get a separate, much lower income distribution
unemployed_or_retired = np.isin(occupation_status, ['Unemployed', 'Retired'])
income[unemployed_or_retired] = _rounded_clipped_normal(
    5000, 2000, 0, 20000, int(unemployed_or_retired.sum())
)

# --- Housing ----------------------------------------------------------------
house_type = np.random.choice(['Detached', 'Multi-family House'], size=n, p=[0.8, 0.2])

house_age = _rounded_clipped_normal(1980, 35, 1900, 2020, n)

# Rounded to the nearest hundred before clipping, so the lower bound 90 can appear as-is
house_size = _rounded_clipped_normal(150, 200, 90, 3000, n, decimals=-2)

# Energy bill grows with house_size: a size-scaled random term on top of a base of 123
bill_noise = np.random.normal(123, 70, n)
energy_bill = np.clip(np.round(house_size / 1500 * bill_noise + 123), 90, 500).astype(int)

location = np.random.choice(['Urban', 'Rural'], size=n, p=[0.3, 0.7])

energy_source = np.random.choice(
    ['Non-renewable sources', 'Renewable sources'], size=n, p=[0.6, 0.4]
)

# --- Attitudes (integer ratings clipped to 1..5) ----------------------------
knowledge_energy = _rounded_clipped_normal(3, 1.5, 1, 5, n)
energy_awareness = _rounded_clipped_normal(3, 1.5, 1, 5, n)
attitude_energy_reduction = _rounded_clipped_normal(3, 1.5, 1, 5, n)
investment_willingness = _rounded_clipped_normal(3, 1.5, 1, 5, n)
perceived_efficiency = _rounded_clipped_normal(3.5, 1.5, 1, 5, n)
environment_concern = _rounded_clipped_normal(3, 1.5, 1, 5, n)

belief_climate_change = np.random.choice(['Yes', 'No'], size=n, p=[0.75, 0.25])

financial_awareness = np.random.choice(['Yes', 'No'], size=n, p=[0.2, 0.8])

previous_renovations = _rounded_clipped_normal(2, 1, 0, 10, n)

# --- Target: uniformly random label, independent of every feature -----------
booked_energy_consultation = np.random.choice(
    ['Yes', 'Considered but not used', 'No'], size=n, p=[1 / 3] * 3
)

# Assemble the frame; the column order here is the order used by all groups
df = pd.DataFrame(dict(
    age=age,
    gender=gender,
    household_size=household_size,
    occupation_status=occupation_status,
    income=income,
    house_type=house_type,
    house_age=house_age,
    house_size=house_size,
    location=location,
    energy_bill=energy_bill,
    energy_source=energy_source,
    knowledge_energy=knowledge_energy,
    energy_awareness=energy_awareness,
    attitude_energy_reduction=attitude_energy_reduction,
    investment_willingness=investment_willingness,
    belief_climate_change=belief_climate_change,
    financial_awareness=financial_awareness,
    perceived_efficiency=perceived_efficiency,
    environment_concern=environment_concern,
    previous_renovations=previous_renovations,
    booked_energy_consultation=booked_energy_consultation,
))

# Quick visual sanity check of the first rows
print(df.head())

# Keep an independent copy under the name used when the groups are combined
df_random = df.copy()

   age  gender  household_size occupation_status  income house_type  \
0   52    Male               3        Unemployed    2047   Detached   
1   37  Female               2     Self-employed   24300   Detached   
2   56  Female               3          Employed   27700   Detached   
3   78    Male               1           Retired    8611   Detached   
4   34    Male               4     Self-employed   39700   Detached   

   house_age  house_size location  energy_bill  ... knowledge_energy  \
0       2003         300    Rural          166  ...                4   
1       1989         200    Rural          134  ...                3   
2       1934         300    Urban          136  ...                2   
3       1965          90    Rural          131  ...                3   
4       2011         200    Rural          137  ...                3   

   energy_awareness  attitude_energy_reduction  investment_willingness  \
0                 2                          2                    

### Group 1: Yes

This group represents homeowners who decided to book an energy consultation. Their profiles were designed based on common patterns observed in our interviews, such as high environmental awareness, high income, and recent renovation activity. These are traits typically associated with engaged and proactive customers.

In [4]:
# Re-seed so this group is reproducible independently of the cells above.
# NOTE: the random draws below share one global stream; their order is significant.
np.random.seed(42)

# Number of rows generated for the "Yes" persona
n_target = 2000  # adjust this as needed for your overall dataset


def _midpoint_normal_int(low, high, sd, size):
    """Draw normals centred on the midpoint of [low, high] and truncate them to int."""
    return np.random.normal((low + high) / 2, sd, size).astype(int)


# Numeric features centred on the persona's typical ranges (values are truncated, not rounded)
age_target = _midpoint_normal_int(50, 61, 3, n_target)
income_target = np.random.normal(80000, 6000, n_target).astype(int)
house_type_target = np.random.choice(['Detached', 'Multi-family House'], size=n_target)  # Assuming house type is not important
house_age_target = _midpoint_normal_int(1950, 1980, 7, n_target)
house_size_target = _midpoint_normal_int(200, 350, 15, n_target)
location_target = np.random.choice(['Urban', 'Rural'], size=n_target)
energy_bill_target = _midpoint_normal_int(150, 175, 15, n_target)

# Features left uniformly random for this persona (randint upper bound is exclusive)
attitude_energy_reduction_target = np.random.randint(1, 6, n_target)
financial_awareness_target = np.random.choice(['Yes', 'No'], size=n_target, p=[0.2, 0.8])
environment_concern_target = np.random.randint(1, 6, n_target)
previous_renovations_target = np.random.randint(0, 11, n_target)

# Features held constant across the whole persona (no random draws involved)
gender_target = np.full(n_target, 'Male')
household_size_target = np.full(n_target, 2)
occupation_status_target = np.full(n_target, 'Employed')
energy_source_target = np.full(n_target, 'Non-renewable sources')
knowledge_energy_target = np.full(n_target, 2)
energy_awareness_target = np.full(n_target, 2)
investment_willingness_target = np.full(n_target, 4)
belief_climate_change_target = np.full(n_target, 'Yes')
perceived_efficiency_target = np.full(n_target, 2)
booked_energy_consultation_target = np.full(n_target, 'Yes')

# Assemble the persona frame with the same column order as the other groups
df_target_persona = pd.DataFrame(dict(
    age=age_target,
    gender=gender_target,
    household_size=household_size_target,
    occupation_status=occupation_status_target,
    income=income_target,
    house_type=house_type_target,
    house_age=house_age_target,
    house_size=house_size_target,
    location=location_target,
    energy_bill=energy_bill_target,
    energy_source=energy_source_target,
    knowledge_energy=knowledge_energy_target,
    energy_awareness=energy_awareness_target,
    attitude_energy_reduction=attitude_energy_reduction_target,
    investment_willingness=investment_willingness_target,
    belief_climate_change=belief_climate_change_target,
    financial_awareness=financial_awareness_target,
    perceived_efficiency=perceived_efficiency_target,
    environment_concern=environment_concern_target,
    previous_renovations=previous_renovations_target,
    booked_energy_consultation=booked_energy_consultation_target,
))

# Independent copy under the name used when the groups are combined
df_yes = df_target_persona.copy()

### Group: No (Did not book consultation)

This group includes homeowners who chose not to book an energy consultation. Their profiles reflect lower environmental awareness, limited knowledge of subsidies, or financial constraints—factors that, according to our interviews, commonly discourage engagement with energy consulting services.

In [5]:
# Set seed for reproducibility (every draw below consumes the global NumPy stream in order)
np.random.seed(42)

# Define the size of the target dataset
n_target = 2000  # adjust this to your needs

# Generate data for the "No" persona.
# NOTE(review): age is not clipped here, unlike the noise group (20-99), so the
# normal(25, 4) draw can yield implausibly young homeowners - confirm this is intended.
age_target = np.round(np.random.normal(25, 4, n_target)).astype(int)
# Single-person households. dtype=int keeps this column an integer like in every other
# group; the float default of np.ones would upcast the combined column to float.
household_size_target = np.ones(n_target, dtype=int)
occupation_status_target = np.random.choice(['Employed', 'Unemployed'], size=n_target, p=[0.5, 0.5])
income_target = np.clip(np.round(np.random.normal(25000, 5000, n_target)), 10000, 50000).astype(int)
house_type_target = np.random.choice(['Detached', 'Multi-family House'], size=n_target, p=[0.8, 0.2])
# Centred on the midpoint of the target range, then clipped to that range
house_age_target = np.clip(np.round(np.random.normal((2010 + 2020) / 2, 7, n_target)), 2010, 2020).astype(int)
house_size_target = np.clip(np.round(np.random.normal((100 + 120) / 2, 5, n_target)), 100, 120).astype(int)
location_target = np.random.choice(['Urban', 'Rural'], size=n_target, p=[0.3, 0.7])
energy_bill_target = np.clip(np.round(np.random.normal((90 + 120) / 2, 5, n_target)), 90, 120).astype(int)
# Constant features for this persona
energy_source_target = np.array(['Renewable sources'] * n_target)
knowledge_energy_target = np.array([4] * n_target)
energy_awareness_target = np.array([4] * n_target)
attitude_energy_reduction_target = np.array([2] * n_target)
investment_willingness_target = np.array([1] * n_target)
belief_climate_change_target = np.random.choice(['Yes', 'No'], size=n_target, p=[0.75, 0.25])
financial_awareness_target = np.array(['No'] * n_target)
# Uniform integer ratings; randint's upper bound is exclusive
perceived_efficiency_target = np.random.randint(1, 6, n_target)
environment_concern_target = np.array([2] * n_target)
previous_renovations_target = np.random.randint(0, 11, n_target)
booked_energy_consultation_target = np.array(['No'] * n_target)
# Gender is drawn last so the random stream is consumed in the same order as before
gender_target = np.random.choice(['Male', 'Female', 'Other', 'Prefer not to say'], size=n_target, p=[0.46, 0.46, 0.04, 0.04])

# Compile all variables into a DataFrame
df_target = pd.DataFrame({
    'age': age_target,
    'gender': gender_target,
    'household_size': household_size_target,
    'occupation_status': occupation_status_target,
    'income': income_target,
    'house_type': house_type_target,
    'house_age': house_age_target,
    'house_size': house_size_target,
    'location': location_target,
    'energy_bill': energy_bill_target,
    'energy_source': energy_source_target,
    'knowledge_energy': knowledge_energy_target,
    'energy_awareness': energy_awareness_target,
    'attitude_energy_reduction': attitude_energy_reduction_target,
    'investment_willingness': investment_willingness_target,
    'belief_climate_change': belief_climate_change_target,
    'financial_awareness': financial_awareness_target,
    'perceived_efficiency': perceived_efficiency_target,
    'environment_concern': environment_concern_target,
    'previous_renovations': previous_renovations_target,
    'booked_energy_consultation': booked_energy_consultation_target
})

# Independent copy under the name used when the groups are combined
df_no = df_target.copy()

### Group: Considered but not used

This group includes homeowners who expressed interest in an energy consultation but ultimately decided against it. They tend to be informed and environmentally conscious, often living in rural areas with moderate income and good awareness of energy issues. However, their lower willingness to invest and uncertainty about the actual efficiency gains may have prevented them from following through.

In [7]:
# Set seed for reproducibility (every draw below consumes the global NumPy stream in order)
np.random.seed(42)

# Number of rows generated for the "Considered but not used" persona
n_target = 2000

# Generate data for the target population
# Values drawn around the midpoint of the persona's range; astype(int) truncates
age_target = np.random.normal((35 + 45) / 2, 3, n_target).astype(int)
gender_target = np.array(['Female'] * n_target)
household_size_target = np.array([3] * n_target)
occupation_status_target = np.random.choice(['Employed', 'Self-employed'], size=n_target, p=[0.5, 0.5])
income_target = np.clip(np.round(np.random.normal(50000, 7000, n_target)), 10000, 500000).astype(int)
house_type_target = np.array(['Multi-family House'] * n_target)
house_age_target = np.clip(np.random.normal((1985 + 2003) / 2, 7, n_target), 1975, 2005).astype(int)
house_size_target = np.clip(np.random.normal((120 + 200) / 2, 10, n_target), 120, 200).astype(int)
location_target = np.array(['Rural'] * n_target)
energy_bill_target = np.clip(np.random.normal((120 + 145) / 2, 5, n_target), 120, 145).astype(int)
energy_source_target = np.array(['Non-renewable sources'] * n_target)
knowledge_energy_target = np.array([4] * n_target)
energy_awareness_target = np.array([4] * n_target)
attitude_energy_reduction_target = np.array([4] * n_target)
investment_willingness_target = np.array([2] * n_target)
belief_climate_change_target = np.array(['Yes'] * n_target)
# No probabilities given, so 'Yes' and 'No' are equally likely
financial_awareness_target = np.random.choice(['Yes', 'No'], n_target)
# Uniform integer ratings; randint's upper bound is exclusive
perceived_efficiency_target = np.random.randint(1, 6, n_target)
environment_concern_target = np.array([4] * n_target)
# Clipped to [4, 6] before truncation to int
previous_renovations_target = np.clip(np.random.normal(5, 1, n_target), 4, 6).astype(int)
booked_energy_consultation_target = np.array(['Considered but not used'] * n_target)

# Compile all variables into a DataFrame
df_target_persona = pd.DataFrame({
    'age': age_target,
    'gender': gender_target,
    'household_size': household_size_target,
    'occupation_status': occupation_status_target,
    'income': income_target,
    'house_type': house_type_target,
    'house_age': house_age_target,
    'house_size': house_size_target,
    'location': location_target,
    'energy_bill': energy_bill_target,
    'energy_source': energy_source_target,
    'knowledge_energy': knowledge_energy_target,
    'energy_awareness': energy_awareness_target,
    'attitude_energy_reduction': attitude_energy_reduction_target,
    'investment_willingness': investment_willingness_target,
    'belief_climate_change': belief_climate_change_target,
    'financial_awareness': financial_awareness_target,
    'perceived_efficiency': perceived_efficiency_target,
    'environment_concern': environment_concern_target,
    'previous_renovations': previous_renovations_target,
    'booked_energy_consultation': booked_energy_consultation_target
})

# Copy the frame built in THIS cell. Copying `df_target` (the "No" group from the
# previous cell) would duplicate the "No" rows and drop this persona entirely.
df_considered = df_target_persona.copy()

## Combining the four datasets

After generating all customer groups (booked, not booked, considered but not used, and random noise), I concatenated them into a single dataset. To ensure a realistic and unbiased distribution, I then shuffled the rows and reset the index.

In [8]:
# Combine the DataFrames: stack the noise group and the three persona groups
combined_df = pd.concat([df_random, df_yes, df_no, df_considered], ignore_index=True)

# Shuffle all rows and renumber the index. The fixed random_state makes the shuffle
# identical on every run of this cell, instead of depending on whatever state the
# global NumPy generator was left in by earlier cells.
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined DataFrame to an Excel file (written to the working directory)
combined_df.to_excel('combined_data.xlsx', index=False)