In [1]:
import pandas as pd

# Sample DataFrame
data = {
    'customer_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'age': [25, 30, 35, 40, 45, 30, 35, 40, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
    'gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female',
               'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'membership': ['Basic', 'Premium', 'Basic', 'Premium', 'Basic', 'Premium', 'Basic', 'Premium', 'Basic',
                   'Premium', 'Basic', 'Premium', 'Basic', 'Premium', 'Basic', 'Premium', 'Basic', 'Premium',
                   'Basic', 'Premium']
}

df = pd.DataFrame(data)

# Add duplicate records for demonstration
df = pd.concat([df, df.head(5)], ignore_index=True)

# Display the first few rows of the DataFrame
print("Original DataFrame:")
print(df.head())

# Feature Engineering Techniques

# 1. Aggregation: Compute mean age for each gender
mean_age_by_gender = df.groupby('gender')['age'].mean()
print("\nMean Age by Gender:")
print(mean_age_by_gender)

# 2. Frequency Encoding: Replace membership with frequency of occurrence
membership_counts = df['membership'].value_counts()
df['membership_frequency'] = df['membership'].map(membership_counts)
print("\nMembership Frequency:")
print(df[['membership', 'membership_frequency']].head())

# 3. Target Encoding: Encode gender based on mean age for each gender
mean_age_by_gender = df.groupby('gender')['age'].mean()
df['gender_encoded'] = df['gender'].map(mean_age_by_gender)
print("\nGender Encoded by Mean Age:")
print(df[['gender', 'gender_encoded']].head())

# 4. Hashing: Convert gender into numerical representations using hashing
df['gender_hash'] = df['gender'].apply(lambda x: hash(x) % 100)
print("\nGender Hash:")
print(df[['gender', 'gender_hash']].head())

# 5. Feature Interactions: Create new feature 'age_group' by combining age and membership
df['age_group'] = df['age'] * df['membership_frequency']
print("\nAge Group (Age * Membership Frequency):")
print(df[['age', 'membership_frequency', 'age_group']].head())

# These are just a few examples of feature engineering techniques that can be applied to handle duplicate data.


Original DataFrame:
   customer_id  age  gender membership
0            1   25    Male      Basic
1            2   30  Female    Premium
2            3   35    Male      Basic
3            4   40  Female    Premium
4            5   45    Male      Basic

Mean Age by Gender:
gender
Female    45.000000
Male      41.923077
Name: age, dtype: float64

Membership Frequency:
  membership  membership_frequency
0      Basic                    13
1    Premium                    12
2      Basic                    13
3    Premium                    12
4      Basic                    13

Gender Encoded by Mean Age:
   gender  gender_encoded
0    Male       41.923077
1  Female       45.000000
2    Male       41.923077
3  Female       45.000000
4    Male       41.923077

Gender Hash:
   gender  gender_hash
0    Male           72
1  Female           22
2    Male           72
3  Female           22
4    Male           72

Age Group (Age * Membership Frequency):
   age  membership_frequency  age_group
0