In [1]:
import pandas as pd
import random
import numpy as np
from faker import Faker
import re

# Creating a Database with Fake Leads

This dataset is designed to resemble a typical lead list, like those that can be purchased online. Such lists usually include contact information for potential customers but are often of low quality and contain many uninterested individuals.

The goal of this dataset is to apply the trained model to identify and prioritize leads with a high likelihood of booking an energy consultation—saving time and effort for energy consultants by filtering out unlikely candidates.

Fake contact information (name, email, phone number, and gender) was generated using the Faker library. The remaining customer characteristics were created in the same way as in 01_Create_Datasets.ipynb.

### Creating 1000 Fake Characters

In [2]:
# Initialize Faker to generate fake data
fake = Faker('de_DE')  # Set the locale to German

# Number of fake entries to generate
num_entries = 1000

# Create lists to store fake data
names = []
emails = []
phone_numbers = []
gender = []

# Possible email domains
email_domains = ['gmail.com', 'outlook.com', 'yahoo.com', 'gmx.de', 'hotmail.com', 'web.de', 'strato.de', 'ionos.com']

# Generate fake data
for _ in range(num_entries):
    # Generate a German name
    name = fake.name()
    genderx = np.random.choice(['Male', 'Female', 'Prefer not to say', 'Other'], p=[0.46, 0.46, 0.04, 0.04])
    if genderx == 'Male':
        name = fake.first_name_male() + ' ' + fake.last_name_male()
    elif genderx == 'Female':
        name = fake.first_name_female() + ' ' + fake.last_name_female()
    else:
        name = fake.first_name() + ' ' + fake.last_name()

    # Randomly select an email domain
    email_domain = random.choice(email_domains)

    # Randomly select the structure of the email address
    email_structure = np.random.choice(['with_dot', 'without_dot', 'random'], p=[0.4, 0.5, 0.1])

    if email_structure == 'with_dot':
        # Generate email with a dot between the first and last name
        email = (name.replace(" ", ".").lower() + "@" + email_domain).replace("ß", "ss")
    elif email_structure == 'without_dot':
        # Generate email without a dot between the first and last name
        email = (name.replace(" ", "").lower() + "@" + email_domain).replace("ß", "ss")
    else:
        # Generate a random email address
        email = fake.email()
        for domain in ['example.com', 'example.org', 'example.net']:
            email = email.replace(domain, random.choice(email_domains))

    names.append(name)
    emails.append(email)
    gender.append(genderx)
    # Generate German phone numbers
    phone_numbers.append('+49' + fake.numerify(text='###########'))

# Create DataFrame using pandas
data = {'Name': names, 'Email': emails, 'Phone Number': phone_numbers, 'gender': gender}
df_fake = pd.DataFrame(data)

df_fake.head()


Unnamed: 0,Name,Email,Phone Number,gender
0,Trude Nette,trudenette@yahoo.com,4915681574010,Female
1,Cristina Weller,cristinaweller@yahoo.com,4955177386314,Female
2,Christos Naser,christosnaser@ionos.com,4918989527203,Male
3,Benjamin Paffrath,benjaminpaffrath@gmail.com,4958133646093,Male
4,Sören Harloff,sören.harloff@strato.de,4917199779468,Male


### Create 1000 Random Observations

These random observations resemble random noise

In [3]:
# Setting seed for reproducibility
np.random.seed(0)

# Defining the size of the dataset
n = 1000

# Creating different variables, just as in 01_Create Datasets.ipynb
age = np.clip(np.round(np.random.normal(40, 25, n)), 20, 99).astype(int)
gender = np.random.choice(['Male', 'Female', 'Other', 'Prefer not to say'], size=n, p=[0.46, 0.46, 0.04, 0.04])
household_size = np.clip(np.round(np.random.normal(3, 1, n)), 1, 10).astype(int)
occupation_status = np.random.choice(['Employed', 'Self-employed', 'Unemployed'], size=n, p=[0.8, 0.16, 0.04])
occupation_status = np.where(age >= 67, 'Retired', occupation_status)
income = np.clip(np.round(np.random.normal(53000, 19000, n), -2), 10000, 500000).astype(int)
unemployed_or_retired = np.isin(occupation_status, ['Unemployed', 'Retired'])
income[unemployed_or_retired] = np.clip(np.round(np.random.normal(5000, 2000, sum(unemployed_or_retired))), 0, 20000).astype(int)
house_type = np.random.choice(['Detached', 'Multi-family House'], size=n, p=[0.8, 0.2])
house_age = np.clip(np.round(np.random.normal(1980, 35, n)), 1900, 2020).astype(int)
house_size = np.clip(np.round(np.random.normal(150, 200, n), -2), 90, 3000).astype(int)
energy_bill = np.clip(np.round(house_size / 1500 * np.random.normal(123, 70, n) + 123), 90, 500).astype(int)
location = np.random.choice(['Urban', 'Rural'], size=n, p=[0.3, 0.7])
energy_source = np.random.choice(['Non-renewable sources', 'Renewable sources'], size=n, p=[0.6, 0.4])

knowledge_energy = np.clip(np.round(np.random.normal(3, 1.5, n)), 1, 5).astype(int)
energy_awareness = np.clip(np.round(np.random.normal(3, 1.5, n)), 1, 5).astype(int)
attitude_energy_reduction = np.clip(np.round(np.random.normal(3, 1.5, n)), 1, 5).astype(int)
investment_willingness = np.clip(np.round(np.random.normal(3, 1.5, n)), 1, 5).astype(int)
perceived_efficiency = np.clip(np.round(np.random.normal(3.5, 1.5, n)), 1, 5).astype(int)
environment_concern = np.clip(np.round(np.random.normal(3, 1.5, n)), 1, 5).astype(int)
belief_climate_change = np.random.choice(['Yes', 'No'], size=n, p=[0.75, 0.25])
financial_awareness = np.random.choice(['Yes', 'No'], size=n, p=[0.2, 0.8])
previous_renovations = np.clip(np.round(np.random.normal(2, 1, n)), 0, 10).astype(int)

# Compiling all variables into a DataFrame
df = pd.DataFrame({
    'age': age,
    'gender': gender,
    'household_size': household_size,
    'occupation_status': occupation_status,
    'income': income,
    'house_type': house_type,
    'house_age': house_age,
    'house_size': house_size,
    'location': location,
    'energy_bill': energy_bill,
    'energy_source': energy_source,
    'knowledge_energy': knowledge_energy,
    'energy_awareness': energy_awareness,
    'attitude_energy_reduction': attitude_energy_reduction,
    'investment_willingness': investment_willingness,
    'belief_climate_change': belief_climate_change,
    'financial_awareness': financial_awareness,
    'perceived_efficiency': perceived_efficiency,
    'environment_concern': environment_concern,
    'previous_renovations': previous_renovations,
})

# Display the first few rows of the dataset
df.head()

Unnamed: 0,age,gender,household_size,occupation_status,income,house_type,house_age,house_size,location,energy_bill,energy_source,knowledge_energy,energy_awareness,attitude_energy_reduction,investment_willingness,belief_climate_change,financial_awareness,perceived_efficiency,environment_concern,previous_renovations
0,84,Female,3,Retired,4418,Detached,1950,300,Rural,168,Renewable sources,3,1,4,1,Yes,Yes,4,5,3
1,50,Female,5,Employed,75600,Detached,1992,300,Rural,146,Non-renewable sources,4,5,2,1,Yes,No,3,2,2
2,64,Female,1,Employed,63000,Detached,2011,400,Rural,148,Renewable sources,1,2,2,2,No,No,5,1,1
3,96,Prefer not to say,3,Retired,4159,Detached,1994,200,Rural,135,Renewable sources,4,4,4,2,Yes,No,5,4,4
4,87,Female,2,Retired,8555,Detached,2014,200,Rural,131,Renewable sources,4,4,1,4,No,No,2,5,1


### Creating Group Yes

To introduce some high-quality leads into the dataset, an additional “yes” group was created. This group of 100 represents homeowners with a strong interest in booking an energy consultation.

In [4]:
# Initialize Faker to generate fake data
fake = Faker('de_DE')  # Set the locale to German

# Number of fake entries to generate
num_entries = 100

# Create lists to store fake data
names = []
emails = []
phone_numbers = []
# Possible email domains
email_domains = ['gmail.com', 'outlook.com', 'yahoo.com', 'gmx.de', 'hotmail.com', 'web.de', 'strato.de', 'ionos.com']

# Generate fake data, just like above
for _ in range(num_entries):
    
    name = fake.first_name_male() + ' ' + fake.last_name_male()
    email_domain = random.choice(email_domains)

    email_structure = np.random.choice(['with_dot', 'without_dot', 'random'], p=[0.4, 0.5, 0.1])

    if email_structure == 'with_dot':
        email = (name.replace(" ", ".").lower() + "@" + email_domain).replace("ß", "ss")
    elif email_structure == 'without_dot':
        email = (name.replace(" ", "").lower() + "@" + email_domain).replace("ß", "ss")
    else:
        email = fake.email()
        for domain in ['example.com', 'example.org', 'example.net']:
            email = email.replace(domain, random.choice(email_domains))

    names.append(name)
    emails.append(email)

    phone_numbers.append('+49' + fake.numerify(text='###########'))

# Create DataFrame using pandas
data = {'Name': names, 'Email': emails, 'Phone Number': phone_numbers}
df_fake_yes = pd.DataFrame(data)

In [5]:
# Set seed for reproducibility
np.random.seed(42)

# Define the size of the target dataset
n_target = 100  # adjust this as needed for your overall dataset

# Generate data for the target population
#age_target = np.random.randint(50, 61, n_target)
age_target = np.random.normal((50+61)/2, 3, n_target).astype(int)
gender_target = np.array(['Male'] * n_target)
household_size_target = np.array([2] * n_target)
occupation_status_target = np.array(['Employed'] * n_target)
income_target = np.random.normal(80000, 6000, n_target).astype(int)
house_type_target = np.random.choice(['Detached', 'Multi-family House'], size=n_target)  # Assuming house type is not important
house_age_target = np.random.normal((1950 + 1980) / 2, 7, n_target).astype(int)
house_size_target = np.random.normal((200 + 350) / 2, 15, n_target).astype(int)
location_target = np.random.choice(['Urban', 'Rural'], size=n_target)  # Random as 'Not important'
energy_bill_target = np.random.normal((150 + 175) / 2, 15, n_target).astype(int)
energy_source_target = np.array(['Non-renewable sources'] * n_target)
knowledge_energy_target = np.array([2] * n_target)
energy_awareness_target = np.array([2] * n_target)
attitude_energy_reduction_target = np.random.randint(1, 6, n_target)  # Random as 'Not important'
investment_willingness_target = np.array([4] * n_target)
belief_climate_change_target = np.array(['Yes'] * n_target)
financial_awareness_target = np.random.choice(['Yes', 'No'], size=n_target, p=[0.2, 0.8])
perceived_efficiency_target = np.array([2] * n_target)
environment_concern_target = np.random.randint(1, 6, n_target)  # Random as 'Not important'
previous_renovations_target = np.random.randint(0, 11, n_target)  # No specific correlation mentioned

# Compile all variables into a DataFrame
df_target_persona = pd.DataFrame({
    'age': age_target,
    'gender': gender_target,
    'household_size': household_size_target,
    'occupation_status': occupation_status_target,
    'income': income_target,
    'house_type': house_type_target,
    'house_age': house_age_target,
    'house_size': house_size_target,
    'location': location_target,
    'energy_bill': energy_bill_target,
    'energy_source': energy_source_target,
    'knowledge_energy': knowledge_energy_target,
    'energy_awareness': energy_awareness_target,
    'attitude_energy_reduction': attitude_energy_reduction_target,
    'investment_willingness': investment_willingness_target,
    'belief_climate_change': belief_climate_change_target,
    'financial_awareness': financial_awareness_target,
    'perceived_efficiency': perceived_efficiency_target,
    'environment_concern': environment_concern_target,
    'previous_renovations': previous_renovations_target,
})

df_yes = df_target_persona.copy()

# Merging the datasets

Now the different datasets need to be merged into one

In [6]:
# Merging the two dataframes of the "yes" group. The individuals of this group are solely male, so the merge is simple.
fake_yes_df = pd.merge(df_fake_yes, df_yes, left_index=True, right_index=True, how='inner')

# Merging the two dataframes of the random noise group. This is a bit more complex as the observations have to be merged by gender.
df['merge_key'] = df.groupby('gender').cumcount()
df_fake['merge_key'] = df_fake.groupby('gender').cumcount()

merged_df = pd.merge(df, df_fake, on=['gender', 'merge_key'], suffixes=('_df', '_df_fake'))
merged_df.drop(columns='merge_key', inplace=True)

#merging the two bigger datasets into one
merged_df = pd.concat([merged_df, fake_yes_df], ignore_index=True)
#Shuffling to make it more realistic
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [7]:
merged_df.shape

(1066, 23)

In [8]:
merged_df.head()

Unnamed: 0,age,gender,household_size,occupation_status,income,house_type,house_age,house_size,location,energy_bill,...,attitude_energy_reduction,investment_willingness,belief_climate_change,financial_awareness,perceived_efficiency,environment_concern,previous_renovations,Name,Email,Phone Number
0,54,Male,2,Employed,81039,Detached,1959,299,Rural,167,...,3,4,Yes,No,2,5,3,Klaus-Günter Klotz,klaus-günterklotz@strato.de,4956721630900
1,32,Male,4,Employed,44600,Detached,1943,500,Rural,189,...,5,4,No,No,3,4,1,Hans Josef Süßebier,hansjosefsüssebier@gmx.de,4904713020472
2,58,Male,2,Employed,82839,Multi-family House,1969,290,Urban,180,...,1,4,Yes,No,2,2,6,Werner Roskoth,wernerroskoth@gmx.de,4992978016506
3,50,Other,5,Employed,52700,Detached,1970,100,Urban,130,...,4,3,Yes,No,2,4,0,Gertraut Kitzmann,gertrautkitzmann@strato.de,4955861477184
4,53,Male,2,Employed,81284,Multi-family House,1971,297,Rural,162,...,1,4,Yes,No,2,4,7,German Junken,germanjunken@web.de,4905743784029


In [9]:
columns_to_move = ['Name', 'Email', 'Phone Number']
new_names = {'Name': 'name', 'Email': 'email', 'Phone Number': 'phone_number'}

# Rearrange columns: place the specified columns at the beginning
columns_new_order = columns_to_move + [col for col in merged_df.columns if col not in columns_to_move]
merged_df = merged_df[columns_new_order]

# Rename the specified columns
merged_df.rename(columns=new_names, inplace=True)

# Display the DataFrame to verify changes
merged_df.head()

Unnamed: 0,name,email,phone_number,age,gender,household_size,occupation_status,income,house_type,house_age,...,energy_source,knowledge_energy,energy_awareness,attitude_energy_reduction,investment_willingness,belief_climate_change,financial_awareness,perceived_efficiency,environment_concern,previous_renovations
0,Klaus-Günter Klotz,klaus-günterklotz@strato.de,4956721630900,54,Male,2,Employed,81039,Detached,1959,...,Non-renewable sources,2,2,3,4,Yes,No,2,5,3
1,Hans Josef Süßebier,hansjosefsüssebier@gmx.de,4904713020472,32,Male,4,Employed,44600,Detached,1943,...,Non-renewable sources,2,4,5,4,No,No,3,4,1
2,Werner Roskoth,wernerroskoth@gmx.de,4992978016506,58,Male,2,Employed,82839,Multi-family House,1969,...,Non-renewable sources,2,2,1,4,Yes,No,2,2,6
3,Gertraut Kitzmann,gertrautkitzmann@strato.de,4955861477184,50,Other,5,Employed,52700,Detached,1970,...,Non-renewable sources,3,2,4,3,Yes,No,2,4,0
4,German Junken,germanjunken@web.de,4905743784029,53,Male,2,Employed,81284,Multi-family House,1971,...,Non-renewable sources,2,2,1,4,Yes,No,2,4,7


In [10]:
final_df = merged_df.copy()

### Exporting the Dataset

In [13]:
final_df.to_excel('Potential_Customers.xlsx', index=False)