## <b>This code performs several steps of feature engineering, including handling missing data, generating statistics, encoding categorical variables, and identifying duplicates.</b>

In [None]:
# Pandas library -> used for data manipulation and analysis.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Loading Data: read the CSV file into a DataFrame and preview it.

csv_path = "data.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Understanding Data: show each column's dtype, then the (rows, columns) shape.

column_types = df.dtypes
rows_and_cols = df.shape
print(column_types)
print('\nData Shape:', rows_and_cols)

In [None]:
# Evaluates to True if at least one cell in the DataFrame is missing.

df.isna().to_numpy().any()

In [None]:
# Identifying Missing Values: boolean mask that is True wherever a cell is missing.

df.isna()

In [None]:
# Identifying Non-Missing Values: notnull() shows False wherever a value is missing and True otherwise.

# df.notnull()

In [None]:
# Data Cleaning: build a copy without any row that contains a missing value.
# df itself is left untouched.

delete_row = df.dropna(axis=0, how='any')
delete_row

In [None]:
# Identifying Duplicates: select every row that repeats an earlier row
# (the first occurrence is not counted as a duplicate).

duplicates = df.loc[df.duplicated(keep='first')]
print(duplicates)

In [None]:
# Dropping Duplicates: copy of df that keeps only the first occurrence of each row.

delete_duplicates = df.drop_duplicates(keep='first', ignore_index=False)
delete_duplicates

In [None]:
# Handling Missing Values (imputation): replace missing data with a substitute value.
# Here every missing cell becomes 0 in a new DataFrame; df is not modified.

fill_value = 0
impute_data = df.fillna(value=fill_value)
impute_data

In [None]:
# Handling Missing Values: fill missing entries with the column mean:
# CGPA      -> mean CGPA
# Age       -> mean Age
# NOTE(review): earlier notes also listed "Semester -> 0", but no Semester fill
# is performed in this cell - confirm whether it is still wanted.

mean_cgpa = df['CGPA'].mean()
mean_age = df['Age'].mean()

# Assign the filled columns back to df rather than calling
# fillna(..., inplace=True) on df['col']: the in-place call on a column
# selection is deprecated in pandas 2.x and does not update df under
# Copy-on-Write.
df['CGPA'] = df['CGPA'].fillna(mean_cgpa)
df['Age'] = df['Age'].fillna(mean_age)
df

In [None]:
# Encoding Categorical Variables: label encoding replaces each distinct category
# with an integer code in the range 0 .. n_classes-1.
# 'Semester' and 'Gender' each get their own encoder so that both fitted
# mappings remain available afterwards (classes_ / inverse_transform).

semester_le = LabelEncoder()
gender_le = LabelEncoder()

df['Semester'] = semester_le.fit_transform(df['Semester'])
df['Gender'] = gender_le.fit_transform(df['Gender'])

# Backward-compatible alias: `le` used to end up fitted on 'Gender'.
le = gender_le
df

In [None]:
# One-Hot Encoding: expand the listed columns into one indicator column per
# category, store the result in a new DataFrame df1, and display it.

one_hot_columns = ['Semester', 'Gender']
df1 = pd.get_dummies(data=df, columns=one_hot_columns)
df1

In [None]:
# Descriptive Statistics: summary statistics of df -
# count, mean, std, min, max and the 25% / 50% / 75% percentiles.

summary_stats = df.describe()
summary_stats

In [None]:
# Descriptive Statistics: describe() only reports numeric columns by default,
# so alternatively we can build a numeric DataFrame by hand.
# 'Semester' and 'Gender' were label-encoded earlier, so they are numeric now.

numeric_columns = ['Student ID', 'CGPA', 'Age', 'Semester', 'Gender']
df_numeric = df[numeric_columns]
df_numeric

In [None]:
# Correlation: pairwise Pearson correlation between the numeric columns
# (correlation is only defined for numeric values).

corr = df_numeric.corr(method='pearson')
corr

In [None]:
# Correlation: the matrix can also be computed for a chosen subset of columns.

selected_columns = ['CGPA', 'Age', 'Gender']
corr_CGPA_vs_Gender = df_numeric[selected_columns].corr()
corr_CGPA_vs_Gender

In [None]:
# Heatmap of the correlation matrix, with each coefficient written in its cell.
heatmap_size = (8, 5)
plt.figure(figsize=heatmap_size)
sns.heatmap(data=corr, annot=True)
plt.show()

In [None]:
# Mean CGPA for each distinct Age value.
rows_by_age = df_numeric.groupby(by='Age')
group_cg = rows_by_age['CGPA'].mean()
group_cg

In [None]:
# Distribution plot of the numeric DataFrame (passed as wide-form data).
sns.displot(data=df_numeric)
plt.show()

In [None]:
# Rows ordered by 'Student ID', smallest first; df_numeric itself is unchanged.
sort_id = df_numeric.sort_values('Student ID', ascending=True)
sort_id

In [None]:
# Rows ordered by 'Age', youngest first; df_numeric itself is unchanged.
sort_age = df_numeric.sort_values('Age', ascending=True)
sort_age

In [None]:
# Rows ordered by 'Age', oldest first; df_numeric itself is unchanged.
sort_age_decs = df_numeric.sort_values('Age', ascending=False)
sort_age_decs

#### Visualization

In [None]:
# One box per column of the numeric DataFrame (wide-form data).
sns.boxplot(data=df_numeric)
plt.title('Boxplot: df_numeric')
plt.show()

In [None]:
# One box per selected column. The title now names the columns that are
# actually plotted (it previously said 'Age' although 'Semester' is drawn).
sns.boxplot(df_numeric[['CGPA', 'Semester', 'Gender']])
plt.title('Boxplot: CGPA, Semester and Gender')
plt.show()

In [None]:
# Grid of pairwise plots across all columns of the numeric DataFrame.
sns.pairplot(data=df_numeric)
plt.show()

In [None]:
# Pairwise plots restricted to a chosen subset of columns.
pairplot_columns = ['CGPA', 'Semester', 'Gender']
sns.pairplot(data=df_numeric[pairplot_columns])
plt.show()

In [None]:
# Scatter plot of the numeric DataFrame passed as wide-form data.
sns.scatterplot(data=df_numeric)
plt.show()

In [None]:
# Scatter plot restricted to a chosen subset of columns (wide-form data).
scatter_columns = ['CGPA', 'Semester', 'Gender']
sns.scatterplot(data=df_numeric[scatter_columns])
plt.show()

In [None]:
# One histogram per column of the numeric DataFrame (10 bins, the pandas default).
df_numeric.hist(bins=10)

## Summary of Feature Engineering Steps:

✔ Loading Data: Reads data from a CSV file.<br>
✔ Understanding Data Shape: Checks the number of rows and columns.<br>
✔ Identifying Missing Values: Detects missing values in the dataset.<br>
✔ Imputing Missing Values: Fills missing values with a specified value (0 in this case).<br>
✔ Dropping Missing Values: Removes rows with any missing values.<br>
✔ Descriptive Statistics: Provides summary statistics to understand data distribution.<br>
✔ Encoding Categorical Variables: Converts categorical data into numerical format using label encoding.<br>
✔ One-Hot Encoding: Converts categorical data into binary columns using one-hot encoding.<br>
✔ Identifying Duplicates: Detects duplicate rows in the dataset.<br>
✔ Dropping Duplicates: Deletes duplicate rows from the dataset.<br>

##### Each of these steps is a part of the feature engineering process, which aims to prepare and transform raw data into a suitable format for modeling.