Missing Data Imputation

In [1]:
import pandas as pd
import numpy as np

# Sample DataFrame with missing values
data = {
    'FeatureA': [10, 20, np.nan, 40, 50, np.nan],
    'FeatureB': [100, 110, 120, np.nan, 140, 150],
    'FeatureC': ['X', 'Y', 'X', 'Y', np.nan, 'Z'],
    'FeatureD': [1, 2, 3, 4, 5, 6]
}
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)
print("\nMissing values before imputation:\n", df.isnull().sum())

Original DataFrame:
    FeatureA  FeatureB FeatureC  FeatureD
0      10.0     100.0        X         1
1      20.0     110.0        Y         2
2       NaN     120.0        X         3
3      40.0       NaN        Y         4
4      50.0     140.0      NaN         5
5       NaN     150.0        Z         6

Missing values before imputation:
 FeatureA    2
FeatureB    1
FeatureC    1
FeatureD    0
dtype: int64


a. Complete Case Analysis (CCA)

In [2]:
df_cca = df.dropna()
print("\nDataFrame after Complete Case Analysis:\n", df_cca)
print("\nMissing values after CCA:\n", df_cca.isnull().sum())


DataFrame after Complete Case Analysis:
    FeatureA  FeatureB FeatureC  FeatureD
0      10.0     100.0        X         1
1      20.0     110.0        Y         2

Missing values after CCA:
 FeatureA    0
FeatureB    0
FeatureC    0
FeatureD    0
dtype: int64


b. Mean / Median / Mode Imputation

In [3]:
# Mean imputation for numerical feature A
df_mean_imputed = df.copy()
mean_feature_a = df_mean_imputed['FeatureA'].mean()
df_mean_imputed['FeatureA'].fillna(mean_feature_a, inplace=True)
print("\nDataFrame after Mean Imputation (FeatureA):\n", df_mean_imputed)

# Median imputation for numerical feature B
df_median_imputed = df.copy()
median_feature_b = df_median_imputed['FeatureB'].median()
df_median_imputed['FeatureB'].fillna(median_feature_b, inplace=True)
print("\nDataFrame after Median Imputation (FeatureB):\n", df_median_imputed)

# Mode imputation for categorical feature C
df_mode_imputed = df.copy()
mode_feature_c = df_mode_imputed['FeatureC'].mode()[0] # .mode() can return multiple if tied
df_mode_imputed['FeatureC'].fillna(mode_feature_c, inplace=True)
print("\nDataFrame after Mode Imputation (FeatureC):\n", df_mode_imputed)


DataFrame after Mean Imputation (FeatureA):
    FeatureA  FeatureB FeatureC  FeatureD
0      10.0     100.0        X         1
1      20.0     110.0        Y         2
2      30.0     120.0        X         3
3      40.0       NaN        Y         4
4      50.0     140.0      NaN         5
5      30.0     150.0        Z         6

DataFrame after Median Imputation (FeatureB):
    FeatureA  FeatureB FeatureC  FeatureD
0      10.0     100.0        X         1
1      20.0     110.0        Y         2
2       NaN     120.0        X         3
3      40.0     120.0        Y         4
4      50.0     140.0      NaN         5
5       NaN     150.0        Z         6

DataFrame after Mode Imputation (FeatureC):
    FeatureA  FeatureB FeatureC  FeatureD
0      10.0     100.0        X         1
1      20.0     110.0        Y         2
2       NaN     120.0        X         3
3      40.0       NaN        Y         4
4      50.0     140.0        X         5
5       NaN     150.0        Z         6

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mean_imputed['FeatureA'].fillna(mean_feature_a, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_median_imputed['FeatureB'].fillna(median_feature_b, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

c. Random Sample Imputation

In [4]:
df_random_imputed = df.copy()
random_sample_a = df_random_imputed['FeatureA'].dropna().sample(df_random_imputed['FeatureA'].isnull().sum(), random_state=42)
random_sample_a.index = df_random_imputed[df_random_imputed['FeatureA'].isnull()].index
df_random_imputed.loc[df_random_imputed['FeatureA'].isnull(), 'FeatureA'] = random_sample_a
print("\nDataFrame after Random Sample Imputation (FeatureA):\n", df_random_imputed)


DataFrame after Random Sample Imputation (FeatureA):
    FeatureA  FeatureB FeatureC  FeatureD
0      10.0     100.0        X         1
1      20.0     110.0        Y         2
2      20.0     120.0        X         3
3      40.0       NaN        Y         4
4      50.0     140.0      NaN         5
5      50.0     150.0        Z         6


d. Replacement by Arbitrary Value

In [5]:
df_arbitrary_imputed = df.copy()
df_arbitrary_imputed['FeatureA'].fillna(-99, inplace=True)
print("\nDataFrame after Arbitrary Value Imputation (FeatureA with -99):\n", df_arbitrary_imputed)


DataFrame after Arbitrary Value Imputation (FeatureA with -99):
    FeatureA  FeatureB FeatureC  FeatureD
0      10.0     100.0        X         1
1      20.0     110.0        Y         2
2     -99.0     120.0        X         3
3      40.0       NaN        Y         4
4      50.0     140.0      NaN         5
5     -99.0     150.0        Z         6


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_arbitrary_imputed['FeatureA'].fillna(-99, inplace=True)


e. Missing Value Indicator

In [6]:
df_indicator = df.copy()
df_indicator['FeatureA_is_missing'] = df_indicator['FeatureA'].isnull().astype(int)
# Then apply another imputation method (e.g., mean) for FeatureA
df_indicator['FeatureA'].fillna(df_indicator['FeatureA'].mean(), inplace=True)
print("\nDataFrame with Missing Value Indicator (FeatureA):\n", df_indicator)


DataFrame with Missing Value Indicator (FeatureA):
    FeatureA  FeatureB FeatureC  FeatureD  FeatureA_is_missing
0      10.0     100.0        X         1                    0
1      20.0     110.0        Y         2                    0
2      30.0     120.0        X         3                    1
3      40.0       NaN        Y         4                    0
4      50.0     140.0      NaN         5                    0
5      30.0     150.0        Z         6                    1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_indicator['FeatureA'].fillna(df_indicator['FeatureA'].mean(), inplace=True)


f. Multivariate Imputation (e.g., using IterativeImputer)

In [7]:
from sklearn.experimental import enable_iterative_imputer # Required for IterativeImputer
from sklearn.impute import IterativeImputer

# Create a fresh copy, as IterativeImputer works on numerical data
df_multi_impute = df[['FeatureA', 'FeatureB', 'FeatureD']].copy()
print("\nDataFrame for Multivariate Imputation:\n", df_multi_impute)

imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_data = imputer.fit_transform(df_multi_impute)
df_multi_imputed = pd.DataFrame(imputed_data, columns=df_multi_impute.columns)
print("\nDataFrame after Multivariate Imputation:\n", df_multi_imputed)


DataFrame for Multivariate Imputation:
    FeatureA  FeatureB  FeatureD
0      10.0     100.0         1
1      20.0     110.0         2
2       NaN     120.0         3
3      40.0       NaN         4
4      50.0     140.0         5
5       NaN     150.0         6

DataFrame after Multivariate Imputation:
     FeatureA   FeatureB  FeatureD
0  10.000000  100.00000       1.0
1  20.000000  110.00000       2.0
2  29.998373  120.00000       3.0
3  40.000000  130.00678       4.0
4  50.000000  140.00000       5.0
5  59.996419  150.00000       6.0


3. Handling Outliers

In [8]:
# Sample DataFrame with outliers
data_outlier = {'Value': [10, 12, 11, 15, 13, 120, 14, 16, 8, 500]}
df_outlier = pd.DataFrame(data_outlier)
print("\nOriginal DataFrame with outliers:\n", df_outlier)

# A simple way to identify outliers (e.g., using IQR method)
Q1 = df_outlier['Value'].quantile(0.25)
Q3 = df_outlier['Value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Lower Bound (Outlier): {lower_bound}, Upper Bound (Outlier): {upper_bound}")


Original DataFrame with outliers:
    Value
0     10
1     12
2     11
3     15
4     13
5    120
6     14
7     16
8      8
9    500
Q1: 11.25, Q3: 15.75, IQR: 4.5
Lower Bound (Outlier): 4.5, Upper Bound (Outlier): 22.5


a. Removing Outliers

In [9]:
df_outlier_removed = df_outlier[(df_outlier['Value'] >= lower_bound) & (df_outlier['Value'] <= upper_bound)].copy()
print("\nDataFrame after Removing Outliers:\n", df_outlier_removed)


DataFrame after Removing Outliers:
    Value
0     10
1     12
2     11
3     15
4     13
6     14
7     16
8      8


b. Treating Outliers as NaN

In [10]:
df_outlier_to_nan = df_outlier.copy()
df_outlier_to_nan.loc[(df_outlier_to_nan['Value'] < lower_bound) | (df_outlier_to_nan['Value'] > upper_bound), 'Value'] = np.nan
print("\nDataFrame after Treating Outliers as NaN:\n", df_outlier_to_nan)


DataFrame after Treating Outliers as NaN:
    Value
0   10.0
1   12.0
2   11.0
3   15.0
4   13.0
5    NaN
6   14.0
7   16.0
8    8.0
9    NaN


c. Capping, Winsorization

In [11]:
df_outlier_capped = df_outlier.copy()
df_outlier_capped['Value'] = np.where(df_outlier_capped['Value'] < lower_bound, lower_bound, df_outlier_capped['Value'])
df_outlier_capped['Value'] = np.where(df_outlier_capped['Value'] > upper_bound, upper_bound, df_outlier_capped['Value'])
print("\nDataFrame after Capping Outliers (using IQR bounds):\n", df_outlier_capped)

# Example using percentiles
upper_cap_percentile = df_outlier['Value'].quantile(0.95)
lower_cap_percentile = df_outlier['Value'].quantile(0.05)
df_outlier_winsorized = df_outlier.copy()
df_outlier_winsorized['Value'] = np.where(df_outlier_winsorized['Value'] > upper_cap_percentile, upper_cap_percentile, df_outlier_winsorized['Value'])
df_outlier_winsorized['Value'] = np.where(df_outlier_winsorized['Value'] < lower_cap_percentile, lower_cap_percentile, df_outlier_winsorized['Value'])
print("\nDataFrame after Winsorization (using 5th & 95th percentile):\n", df_outlier_winsorized)


DataFrame after Capping Outliers (using IQR bounds):
    Value
0   10.0
1   12.0
2   11.0
3   15.0
4   13.0
5   22.5
6   14.0
7   16.0
8    8.0
9   22.5

DataFrame after Winsorization (using 5th & 95th percentile):
    Value
0   10.0
1   12.0
2   11.0
3   15.0
4   13.0
5  120.0
6   14.0
7   16.0
8    8.9
9  329.0


4. Data Transformation & Binning

In [12]:
# Sample DataFrame for transformations
data_transform = {
    'Numerical': [1, 5, 10, 20, 50, 100, 200, 500, 1000],
    'Category': ['Red', 'Blue', 'Green', 'Red', 'Blue', 'Red', 'Green', 'Blue', 'Red'],
    'Ordinal': ['Low', 'Medium', 'High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High'],
    'DateCol': pd.to_datetime(['2023-01-01', '2023-01-15', '2023-02-01', '2023-03-10', '2023-04-20', '2023-05-05', '2023-06-12', '2023-07-25', '2023-08-30']),
    'TransactionAmount': [100, 150, 10, 200, 120, 50, 80, 300, 180],
    'CustomerID': ['C1', 'C2', 'C1', 'C3', 'C2', 'C1', 'C3', 'C2', 'C1']
}
df_transform = pd.DataFrame(data_transform)
print("\nOriginal DataFrame for transformations:\n", df_transform)


Original DataFrame for transformations:
    Numerical Category Ordinal    DateCol  TransactionAmount CustomerID
0          1      Red     Low 2023-01-01                100         C1
1          5     Blue  Medium 2023-01-15                150         C2
2         10    Green    High 2023-02-01                 10         C1
3         20      Red  Medium 2023-03-10                200         C3
4         50     Blue     Low 2023-04-20                120         C2
5        100      Red    High 2023-05-05                 50         C1
6        200    Green  Medium 2023-06-12                 80         C3
7        500     Blue     Low 2023-07-25                300         C2
8       1000      Red    High 2023-08-30                180         C1


a. Binning (Discretization)

In [13]:
# Equal Frequency Binning (using qcut)
df_transform['Numerical_EqualFreq_Bins'] = pd.qcut(df_transform['Numerical'], q=3, labels=['Low', 'Medium', 'High'])
print("\nEqual Frequency Binning (Numerical):\n", df_transform[['Numerical', 'Numerical_EqualFreq_Bins']])

# Equal Length Binning (using cut)
df_transform['Numerical_EqualLen_Bins'] = pd.cut(df_transform['Numerical'], bins=3, labels=['Bin1', 'Bin2', 'Bin3'])
print("\nEqual Length Binning (Numerical):\n", df_transform[['Numerical', 'Numerical_EqualLen_Bins']])

# Note: Discretization with Trees/ChiMerge often requires a target variable
# and custom implementations or specific libraries.
# Example with KBinsDiscretizer (can do 'uniform' for equal-width, 'quantile' for equal-frequency)
from sklearn.preprocessing import KBinsDiscretizer
numerical_data = df_transform[['Numerical']]
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform') # or 'quantile'
df_transform['Numerical_KBins_Uniform'] = est.fit_transform(numerical_data)
print("\nKBinsDiscretizer (uniform strategy):\n", df_transform[['Numerical', 'Numerical_KBins_Uniform']])


Equal Frequency Binning (Numerical):
    Numerical Numerical_EqualFreq_Bins
0          1                      Low
1          5                      Low
2         10                      Low
3         20                   Medium
4         50                   Medium
5        100                   Medium
6        200                     High
7        500                     High
8       1000                     High

Equal Length Binning (Numerical):
    Numerical Numerical_EqualLen_Bins
0          1                    Bin1
1          5                    Bin1
2         10                    Bin1
3         20                    Bin1
4         50                    Bin1
5        100                    Bin1
6        200                    Bin1
7        500                    Bin2
8       1000                    Bin3

KBinsDiscretizer (uniform strategy):
    Numerical  Numerical_KBins_Uniform
0          1                      0.0
1          5                      0.0
2         10          

b. Log Transform

In [14]:
df_transform['Numerical_Log'] = np.log(df_transform['Numerical'])
print("\nLog Transform (Numerical):\n", df_transform[['Numerical', 'Numerical_Log']])


Log Transform (Numerical):
    Numerical  Numerical_Log
0          1       0.000000
1          5       1.609438
2         10       2.302585
3         20       2.995732
4         50       3.912023
5        100       4.605170
6        200       5.298317
7        500       6.214608
8       1000       6.907755


c. Categorical Encoding

One Hot Encoding

In [15]:
df_one_hot = pd.get_dummies(df_transform['Category'], prefix='Category')
df_encoded = pd.concat([df_transform, df_one_hot], axis=1)
print("\nOne Hot Encoding (Category):\n", df_encoded[['Category', 'Category_Blue', 'Category_Green', 'Category_Red']])


One Hot Encoding (Category):
   Category  Category_Blue  Category_Green  Category_Red
0      Red          False           False          True
1     Blue           True           False         False
2    Green          False            True         False
3      Red          False           False          True
4     Blue           True           False         False
5      Red          False           False          True
6    Green          False            True         False
7     Blue           True           False         False
8      Red          False           False          True


Count and Frequency Encoding

In [16]:
count_map = df_transform['Category'].value_counts().to_dict()
df_transform['Category_Count'] = df_transform['Category'].map(count_map)

freq_map = df_transform['Category'].value_counts(normalize=True).to_dict()
df_transform['Category_Frequency'] = df_transform['Category'].map(freq_map)
print("\nCount and Frequency Encoding (Category):\n", df_transform[['Category', 'Category_Count', 'Category_Frequency']])


Count and Frequency Encoding (Category):
   Category  Category_Count  Category_Frequency
0      Red               4            0.444444
1     Blue               3            0.333333
2    Green               2            0.222222
3      Red               4            0.444444
4     Blue               3            0.333333
5      Red               4            0.444444
6    Green               2            0.222222
7     Blue               3            0.333333
8      Red               4            0.444444


Target Encoding/Mean Encoding (with proper precautions)

In [17]:
# Conceptual example, requires a target variable (e.g., 'Target' column)
# from category_encoders import TargetEncoder
# encoder = TargetEncoder(cols=['Category'])
# df_encoded_target = encoder.fit_transform(df_transform['Category'], df_transform['Target'])
print("\nTarget Encoding (Conceptual): Replaces category with mean of target for that category. Requires a target variable.")


Target Encoding (Conceptual): Replaces category with mean of target for that category. Requires a target variable.


Ordinal Encoding

In [18]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_map = {'Low': 0, 'Medium': 1, 'High': 2}
df_transform['Ordinal_Encoded'] = df_transform['Ordinal'].map(ordinal_map)
print("\nOrdinal Encoding (Ordinal):\n", df_transform[['Ordinal', 'Ordinal_Encoded']])

# Or using sklearn's OrdinalEncoder (infers order or takes categories)
encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
df_transform['Ordinal_Encoded_Sklearn'] = encoder.fit_transform(df_transform[['Ordinal']])
print("\nOrdinal Encoding (Ordinal) with Sklearn:\n", df_transform[['Ordinal', 'Ordinal_Encoded_Sklearn']])


Ordinal Encoding (Ordinal):
   Ordinal  Ordinal_Encoded
0     Low                0
1  Medium                1
2    High                2
3  Medium                1
4     Low                0
5    High                2
6  Medium                1
7     Low                0
8    High                2

Ordinal Encoding (Ordinal) with Sklearn:
   Ordinal  Ordinal_Encoded_Sklearn
0     Low                      0.0
1  Medium                      1.0
2    High                      2.0
3  Medium                      1.0
4     Low                      0.0
5    High                      2.0
6  Medium                      1.0
7     Low                      0.0
8    High                      2.0


Weight of Evidence (WoE)

In [19]:
# WoE = ln( (% of non-events in category) / (% of events in category) )
print("\nWeight of Evidence (WoE) Encoding (Conceptual): Useful for binary classification. Replaces category with a value reflecting its predictive power towards the target.")


Weight of Evidence (WoE) Encoding (Conceptual): Useful for binary classification. Replaces category with a value reflecting its predictive power towards the target.


Rare Label Encoding

In [20]:
threshold = 0.05 # 5% frequency
value_counts = df_transform['Category'].value_counts(normalize=True)
rare_labels = value_counts[value_counts < threshold].index
df_transform['Category_RareHandled'] = np.where(df_transform['Category'].isin(rare_labels), 'Rare', df_transform['Category'])
print("\nRare Label Encoding (Category):\n", df_transform[['Category', 'Category_RareHandled']])


Rare Label Encoding (Category):
   Category Category_RareHandled
0      Red                  Red
1     Blue                 Blue
2    Green                Green
3      Red                  Red
4     Blue                 Blue
5      Red                  Red
6    Green                Green
7     Blue                 Blue
8      Red                  Red


BaseN, Feature Hashing and others (brief overview)

In [21]:
# from category_encoders import HashingEncoder
# encoder = HashingEncoder(n_components=8) # Map to 8 new features
# df_hashed = encoder.fit_transform(df_transform['Category'])
print("\nBaseN/Feature Hashing (Conceptual): Advanced techniques for high-cardinality categoricals, balancing memory/speed with interpretability tradeoffs.")


BaseN/Feature Hashing (Conceptual): Advanced techniques for high-cardinality categoricals, balancing memory/speed with interpretability tradeoffs.


d. Grouping Operations

In [22]:
# Example: Mean transaction amount per customer
df_transform['AvgTransaction_by_Customer'] = df_transform.groupby('CustomerID')['TransactionAmount'].transform('mean')
print("\nGrouping Operation (Avg Transaction by Customer):\n", df_transform[['CustomerID', 'TransactionAmount', 'AvgTransaction_by_Customer']])


Grouping Operation (Avg Transaction by Customer):
   CustomerID  TransactionAmount  AvgTransaction_by_Customer
0         C1                100                        85.0
1         C2                150                       190.0
2         C1                 10                        85.0
3         C3                200                       140.0
4         C2                120                       190.0
5         C1                 50                        85.0
6         C3                 80                       140.0
7         C2                300                       190.0
8         C1                180                        85.0


e. Feature Split

In [23]:
df_split = pd.DataFrame({'ProductCode': ['A-Red-Small', 'B-Blue-Medium', 'A-Green-Large']})
df_split[['Product_Type', 'Product_Color', 'Product_Size']] = df_split['ProductCode'].str.split('-', expand=True)
print("\nFeature Split (ProductCode):\n", df_split)


Feature Split (ProductCode):
      ProductCode Product_Type Product_Color Product_Size
0    A-Red-Small            A           Red        Small
1  B-Blue-Medium            B          Blue       Medium
2  A-Green-Large            A         Green        Large


f. Date and Time Engineering

In [24]:
df_transform['Year'] = df_transform['DateCol'].dt.year
df_transform['Month'] = df_transform['DateCol'].dt.month
df_transform['Day'] = df_transform['DateCol'].dt.day
df_transform['DayOfWeek'] = df_transform['DateCol'].dt.dayofweek # Monday=0, Sunday=6
df_transform['DayOfYear'] = df_transform['DateCol'].dt.dayofyear
df_transform['WeekOfYear'] = df_transform['DateCol'].dt.isocalendar().week.astype(int)
df_transform['Quarter'] = df_transform['DateCol'].dt.quarter
# Time elapsed since a reference date
reference_date = pd.to_datetime('2023-01-01')
df_transform['Days_Since_Ref'] = (df_transform['DateCol'] - reference_date).dt.days
print("\nDate and Time Engineering (DateCol):\n", df_transform[['DateCol', 'Year', 'Month', 'Day', 'DayOfWeek', 'WeekOfYear', 'Days_Since_Ref']])


Date and Time Engineering (DateCol):
      DateCol  Year  Month  Day  DayOfWeek  WeekOfYear  Days_Since_Ref
0 2023-01-01  2023      1    1          6          52               0
1 2023-01-15  2023      1   15          6           2              14
2 2023-02-01  2023      2    1          2           5              31
3 2023-03-10  2023      3   10          4          10              68
4 2023-04-20  2023      4   20          3          16             109
5 2023-05-05  2023      5    5          4          18             124
6 2023-06-12  2023      6   12          0          24             162
7 2023-07-25  2023      7   25          1          30             205
8 2023-08-30  2023      8   30          2          35             241


g. Feature Creation (Arithmetic Operations)

In [25]:
df_arithmetic = pd.DataFrame({
    'FeatureX': [10, 20, 30, 40],
    'FeatureY': [2, 4, 5, 8],
    'Amount1': [100, 200, 50, 150],
    'Amount2': [20, 40, 10, 30]
})

df_arithmetic['Sum_XY'] = df_arithmetic['FeatureX'] + df_arithmetic['FeatureY']
df_arithmetic['Ratio_XY'] = df_arithmetic['FeatureX'] / df_arithmetic['FeatureY']
df_arithmetic['Product_Amounts'] = df_arithmetic['Amount1'] * df_arithmetic['Amount2']

# Aggregating Transaction Data: Feature over time window (conceptual)
# This would typically involve time-series data and rolling windows.
# df_transactions['rolling_avg_7d'] = df_transactions['transaction_value'].rolling('7D').mean()

print("\nFeature Creation (Arithmetic Operations):\n", df_arithmetic)


Feature Creation (Arithmetic Operations):
    FeatureX  FeatureY  Amount1  Amount2  Sum_XY  Ratio_XY  Product_Amounts
0        10         2      100       20      12       5.0             2000
1        20         4      200       40      24       5.0             8000
2        30         5       50       10      35       6.0              500
3        40         8      150       30      48       5.0             4500


h. Variable Transformation (Mathematical Functions)

In [26]:
from scipy.stats import boxcox, yeojohnson
# Ensure 'Numerical' feature doesn't have zeros or negative for log/sqrt
# df_transform['Numerical'] = df_transform['Numerical'] + 1 # Add 1 to handle 0 for log

df_transform['Numerical_Reciprocal'] = 1 / df_transform['Numerical']
df_transform['Numerical_SquareRoot'] = np.sqrt(df_transform['Numerical'])
df_transform['Numerical_Exponential'] = np.exp(df_transform['Numerical'])

# Box-Cox and Yeo-Johnson require positive values for Box-Cox
# For Box-Cox, data must be strictly positive.
# For Yeo-Johnson, data can contain zero or negative values.
transformed_boxcox, lambda_boxcox = boxcox(df_transform['Numerical'])
df_transform['Numerical_BoxCox'] = transformed_boxcox

transformed_yeojohnson, lambda_yeojohnson = yeojohnson(df_transform['Numerical'])
df_transform['Numerical_YeoJohnson'] = transformed_yeojohnson

print("\nVariable Transformations (Numerical):\n", df_transform[['Numerical', 'Numerical_Reciprocal', 'Numerical_SquareRoot', 'Numerical_Exponential', 'Numerical_BoxCox', 'Numerical_YeoJohnson']])


Variable Transformations (Numerical):
    Numerical  Numerical_Reciprocal  Numerical_SquareRoot  \
0          1                 1.000              1.000000   
1          5                 0.200              2.236068   
2         10                 0.100              3.162278   
3         20                 0.050              4.472136   
4         50                 0.020              7.071068   
5        100                 0.010             10.000000   
6        200                 0.005             14.142136   
7        500                 0.002             22.360680   
8       1000                 0.001             31.622777   

   Numerical_Exponential  Numerical_BoxCox  Numerical_YeoJohnson  
0           2.718282e+00          0.000000              0.694560  
1           1.484132e+02          1.676720              1.801218  
2           2.202647e+04          2.441945              2.414856  
3           4.851652e+08          3.234457              3.071898  
4           5.184706e+21

  result = getattr(ufunc, method)(*inputs, **kwargs)


5. Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer

data_scale = {'Feature1': [10, 200, 30, 500, 100], 'Feature2': [0.1, 0.5, 0.2, 0.8, 0.3]}
df_scale = pd.DataFrame(data_scale)
print("\nOriginal DataFrame for Scaling:\n", df_scale)


Original DataFrame for Scaling:
    Feature1  Feature2
0        10       0.1
1       200       0.5
2        30       0.2
3       500       0.8
4       100       0.3


a. Standardization (Z-score Normalization)

In [28]:
scaler = StandardScaler()
df_scale_standardized = pd.DataFrame(scaler.fit_transform(df_scale), columns=df_scale.columns)
print("\nDataFrame after Standardization:\n", df_scale_standardized)


DataFrame after Standardization:
    Feature1  Feature2
0 -0.883578 -1.128152
1  0.178953  0.483494
2 -0.771733 -0.725241
3  1.856633  1.692228
4 -0.380274 -0.322329


b. MinMax Scaling

In [29]:
scaler = MinMaxScaler()
df_scale_minmax = pd.DataFrame(scaler.fit_transform(df_scale), columns=df_scale.columns)
print("\nDataFrame after MinMax Scaling (to 0-1):\n", df_scale_minmax)


DataFrame after MinMax Scaling (to 0-1):
    Feature1  Feature2
0  0.000000  0.000000
1  0.387755  0.571429
2  0.040816  0.142857
3  1.000000  1.000000
4  0.183673  0.285714


c. Mean Scaling

In [30]:
df_scale_mean = df_scale.copy()
df_scale_mean['Feature1'] = df_scale_mean['Feature1'] - df_scale_mean['Feature1'].mean()
df_scale_mean['Feature2'] = df_scale_mean['Feature2'] - df_scale_mean['Feature2'].mean()
print("\nDataFrame after Mean Scaling:\n", df_scale_mean)


DataFrame after Mean Scaling:
    Feature1  Feature2
0    -158.0     -0.28
1      32.0      0.12
2    -138.0     -0.18
3     332.0      0.42
4     -68.0     -0.08


d. Max Absolute Scaling

In [31]:
scaler = MaxAbsScaler()
df_scale_maxabs = pd.DataFrame(scaler.fit_transform(df_scale), columns=df_scale.columns)
print("\nDataFrame after Max Absolute Scaling:\n", df_scale_maxabs)


DataFrame after Max Absolute Scaling:
    Feature1  Feature2
0      0.02     0.125
1      0.40     0.625
2      0.06     0.250
3      1.00     1.000
4      0.20     0.375


e. Unit Norm-Scaling (Normalization)

In [32]:
scaler = Normalizer(norm='l2') # or 'l1'
df_scale_unitnorm = pd.DataFrame(scaler.fit_transform(df_scale), columns=df_scale.columns)
print("\nDataFrame after Unit Norm Scaling (L2):\n", df_scale_unitnorm)


DataFrame after Unit Norm Scaling (L2):
    Feature1  Feature2
0  0.999950  0.010000
1  0.999997  0.002500
2  0.999978  0.006667
3  0.999999  0.001600
4  0.999996  0.003000


6. Feature Selection Methods

In [33]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Create a synthetic dataset
X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=2, random_state=42)
df_fs = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)])
df_fs['target'] = y
print("\nSynthetic Dataset for Feature Selection (first 5 rows):\n", df_fs.head())


Synthetic Dataset for Feature Selection (first 5 rows):
    feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0  -0.149037   3.639790  -4.772025  -0.006653  -1.712937  -2.745894   
1   2.643572   2.247720   0.269296  -0.202846   2.757147   2.674546   
2   0.343135  -0.945760   0.557920   1.323875  -1.249062   2.291925   
3  -0.300360  -1.511354  -0.632684  -0.804787  -1.254642   2.184507   
4  -0.798439   3.013817   0.664099   1.341312   2.697771   0.022003   

   feature_6  feature_7  feature_8  feature_9  target  
0  -1.024823   4.487371  -1.125419  -1.014853       0  
1  -2.024225   0.301308   0.180531   1.455804       0  
2  -0.744303  -0.262295   1.205628  -0.725942       1  
3  -0.436571  -0.967546   0.793658  -0.217025       0  
4   0.633598   0.667231  -1.524015  -1.111219       1  


a. Filter Methods

In [34]:
# Chi-Square Test (for non-negative features, typically categorical or binned numerical)
# Assuming some features are non-negative integers or binned for chi2
X_chi2 = (X - X.min()).astype(int) # Make features non-negative for chi2 example
selector_chi2 = SelectKBest(chi2, k=5)
selector_chi2.fit(X_chi2, y)
selected_features_chi2 = df_fs.columns[:-1][selector_chi2.get_support()]
print(f"\nSelected features (Chi-Square): {selected_features_chi2.tolist()}")

# F-score (ANOVA) - for numerical features and categorical target
selector_fscore = SelectKBest(f_classif, k=5)
selector_fscore.fit(X, y)
selected_features_fscore = df_fs.columns[:-1][selector_fscore.get_support()]
print(f"Selected features (F-score): {selected_features_fscore.tolist()}")

# Correlation Coefficient (for numerical features and numerical target - not directly applicable here for classification, but conceptually)
# For classification, you'd look at point-biserial correlation or convert target to numerical
# For regression, you'd calculate df_fs.corr()['target'].abs().sort_values(ascending=False)


Selected features (Chi-Square): ['feature_1', 'feature_2', 'feature_5', 'feature_7', 'feature_9']
Selected features (F-score): ['feature_1', 'feature_2', 'feature_5', 'feature_7', 'feature_9']


b. Wrapper Methods

In [35]:
estimator = LogisticRegression(solver='liblinear', random_state=42) # A simple estimator
selector_rfe = RFE(estimator, n_features_to_select=5, step=1)
selector_rfe.fit(X, y)
selected_features_rfe = df_fs.columns[:-1][selector_rfe.get_support()]
print(f"\nSelected features (RFE with Logistic Regression): {selected_features_rfe.tolist()}")

# Genetic Algorithms would typically involve a specialized library like 'TPOT' or custom implementation.


Selected features (RFE with Logistic Regression): ['feature_2', 'feature_4', 'feature_5', 'feature_7', 'feature_9']


c. Embedded Methods

In [36]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

feature_importances = pd.Series(model.feature_importances_, index=df_fs.columns[:-1])
selected_features_embedded = feature_importances.nlargest(5).index.tolist()
print(f"\nSelected features (Embedded - Random Forest Importance):\n{feature_importances.sort_values(ascending=False).head()}")
print(f"Top 5 selected features: {selected_features_embedded}")

# For Lasso Regression:
# from sklearn.linear_model import Lasso
# lasso = Lasso(alpha=0.01) # alpha is regularization strength
# lasso.fit(X, y)
# non_zero_features = df_fs.columns[:-1][lasso.coef_ != 0].tolist()
# print(f"\nSelected features (Lasso Regression): {non_zero_features}")


Selected features (Embedded - Random Forest Importance):
feature_9    0.245956
feature_2    0.167019
feature_5    0.118863
feature_6    0.098968
feature_7    0.092285
dtype: float64
Top 5 selected features: ['feature_9', 'feature_2', 'feature_5', 'feature_6', 'feature_7']
