## <b>This code performs several feature engineering steps, including handling missing data, generating statistics, encoding categorical variables, and identifying duplicates.</b>

In [None]:
# Pandas library -> used for data manipulation and analysis.

import pandas as pd

In [None]:
# Loading Data: parse "data.csv" (resolved against the current working
# directory) into the DataFrame `df`, then echo it as the cell output.

df = pd.read_csv(filepath_or_buffer="data.csv")
df

In [None]:
# Understanding Data Shape: `shape` is a (row_count, column_count) tuple,
# a quick check on how much data was loaded.

df.shape

In [None]:
# Identifying Missing Values: build a boolean frame shaped like `df`,
# holding True wherever a cell is missing and False elsewhere.

df.isna()

In [None]:
# Imputing Missing Values: imputation replaces missing entries with a
# substitute value. Here every missing cell, in every column, becomes 0.
# The result is a new frame (`impute_data`); `df` itself is left untouched.

impute_data = df.fillna(value=0)
impute_data

In [None]:
# Descriptive Statistics: summarise the zero-imputed frame `impute_data`.
# The summary reports count, mean, std, min, max and the 25% / 50% / 75%
# percentiles.

impute_data.describe()

In [None]:
# Dropping Missing Values: keep only the rows of `df` that have no missing
# value in any column. dropna() returns a new frame by default, so `df`
# keeps all of its rows.

delete_data = df.dropna()
delete_data

In [None]:
# Encoding Categorical Variables: label-encode 'Gender' into integer codes.
# LabelEncoder numbers the distinct labels 0..n_classes-1 in sorted order,
# so a two-valued 'Gender' column becomes 0 / 1. The fitted encoder `le` is
# kept so the mapping can be read back via le.classes_ or
# le.inverse_transform.
# NOTE: df['Gender'] is overwritten in place, so later cells see the codes.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
has_gender = df['Gender'].notna()
if has_gender.all():
    # No gaps: encode the whole column in one go.
    df['Gender'] = le.fit_transform(df['Gender'])
else:
    # Fit on the observed labels only. Feeding NaN to the encoder would
    # either raise (older scikit-learn) or give the gap its own class code,
    # which would hide the missing value from isnull() / dropna().
    # A nullable integer column keeps the codes integral while leaving the
    # gaps marked as missing.
    gender_codes = pd.Series(pd.NA, index=df.index, dtype='Int64')
    gender_codes.loc[has_gender.to_numpy()] = le.fit_transform(
        df.loc[has_gender, 'Gender']
    )
    df['Gender'] = gender_codes
df

In [None]:
# Dropping Missing Values (again): rebuild `delete_data` from the current
# `df`, whose 'Gender' column now holds label-encoded values. Rows with any
# missing value are removed; `df` itself is not modified.

delete_data = df.dropna()
delete_data

In [None]:
# One-Hot Encoding: expand 'Gender' into one indicator column per distinct
# value. The result is a new DataFrame, `df1`, which is then displayed.
# NOTE(review): df['Gender'] was already replaced by label codes in an
# earlier cell, so the indicator columns are derived from those codes
# rather than from the original text labels - confirm this is intended.

df1 = pd.get_dummies(data=df, columns=['Gender'])
df1

In [None]:
# Identifying Duplicates: df.duplicated() flags each row that repeats an
# earlier row; selecting on that mask keeps just the repeated rows, which
# are then printed.

duplicates = df.loc[df.duplicated()]
print(duplicates)

## <b>Summary of Feature Engineering Steps:</b>

✔ <b>Loading Data:</b> Reads data from a CSV file.<br>
✔ <b>Understanding Data Shape:</b> Checks the number of rows and columns.<br>
✔ <b>Identifying Missing Values:</b> Detects missing values in the dataset.<br>
✔ <b>Imputing Missing Values:</b> Fills missing values with a specified value (0 in this case).<br>
✔ <b>Dropping Missing Values:</b> Removes rows with any missing values.<br>
✔ <b>Descriptive Statistics:</b> Provides summary statistics to understand data distribution.<br>
✔ <b>Encoding Categorical Variables:</b> Converts categorical data into numerical format using label encoding.<br>
✔ <b>One-Hot Encoding:</b> Converts categorical data into binary columns using one-hot encoding.<br>
✔ <b>Identifying Duplicates:</b> Detects duplicate rows in the dataset.<br>

##### Each of these steps is a part of the feature engineering process, which aims to prepare and transform raw data into a suitable format for modeling.