# Feature Engineering

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

%matplotlib inline
sns.set_style('whitegrid')

In [10]:
DATA_PATH = Path('../../data/raw/')
PROCESSED_PATH = Path('../../data/processed/')
PROCESSED_PATH.mkdir(exist_ok=True)

train_df = pd.read_csv(DATA_PATH / 'train.csv')
test_df = pd.read_csv(DATA_PATH / 'test.csv')

# Store test IDs for submission
test_ids = test_df['id']

# Concatenate for consistent processing
combined_df = pd.concat([train_df.drop('exam_score', axis=1), test_df], ignore_index=True)

print("Combined DataFrame shape:", combined_df.shape)
combined_df.head()

Combined DataFrame shape: (900000, 12)


Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy


## 1. Handling Missing Values

In [11]:
combined_df.isnull().sum()

id                  0
age                 0
gender              0
course              0
study_hours         0
class_attendance    0
internet_access     0
sleep_hours         0
sleep_quality       0
study_method        0
facility_rating     0
exam_difficulty     0
dtype: int64

As identified in the initial EDA, there are no missing values in the dataset. No imputation is necessary.

## 2. Categorical Feature Encoding

### Ordinal Encoding

Features like `sleep_quality`, `facility_rating`, and `exam_difficulty` have a clear intrinsic order. We will map them to numerical values to preserve this relationship.

In [12]:
ordinal_mappings = {
    'sleep_quality': {'poor': 0, 'average': 1, 'good': 2},
    'facility_rating': {'low': 0, 'medium': 1, 'high': 2},
    'exam_difficulty': {'easy': 0, 'moderate': 1, 'hard': 2}
}

for col, mapping in ordinal_mappings.items():
    combined_df[col] = combined_df[col].map(mapping)

combined_df[['sleep_quality', 'facility_rating', 'exam_difficulty']].head()

Unnamed: 0,sleep_quality,facility_rating,exam_difficulty
0,1,0,0
1,0,1,1
2,0,2,1
3,1,2,1
4,2,2,0


### One-Hot Encoding

Nominal features without a clear order (`gender`, `course`, `study_method`, `internet_access`) will be converted into binary columns using one-hot encoding.

In [13]:
nominal_features = ['gender', 'course', 'study_method', 'internet_access']

combined_df = pd.get_dummies(combined_df, columns=nominal_features, drop_first=True)

print("DataFrame shape after OHE:", combined_df.shape)
combined_df.head()

DataFrame shape after OHE: (900000, 21)


Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,sleep_quality,facility_rating,exam_difficulty,gender_male,gender_other,...,course_b.tech,course_ba,course_bba,course_bca,course_diploma,study_method_group study,study_method_mixed,study_method_online videos,study_method_self-study,internet_access_yes
0,0,21,7.91,98.8,4.9,1,0,0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,1,18,4.95,94.8,4.7,0,1,1,False,True,...,False,False,False,False,True,False,False,False,True,True
2,2,20,4.68,92.6,5.8,0,2,1,False,False,...,False,False,False,False,False,False,False,False,False,True
3,3,19,2.0,49.5,8.3,1,2,1,True,False,...,False,False,False,False,False,True,False,False,False,True
4,4,23,7.65,86.9,9.6,2,2,0,True,False,...,False,False,False,True,False,False,False,False,True,True


## 3. Creating New Features

Based on the EDA, we can create interaction features that might capture more complex relationships.

In [14]:
# Interaction between study and sleep
combined_df['study_sleep_ratio'] = combined_df['study_hours'] / (combined_df['sleep_hours'] + 1e-6)

# Total time for key activities
combined_df['time_management_index'] = combined_df['study_hours'] + combined_df['sleep_hours']

# A simple interaction between attendance and study hours
combined_df['attendance_study_interaction'] = combined_df['class_attendance'] * combined_df['study_hours']

combined_df[['study_sleep_ratio', 'time_management_index', 'attendance_study_interaction']].head()

Unnamed: 0,study_sleep_ratio,time_management_index,attendance_study_interaction
0,1.614285,12.81,781.508
1,1.053191,9.65,469.26
2,0.806896,10.48,433.368
3,0.240964,10.3,99.0
4,0.796875,17.25,664.785


## 3.5. Advanced Feature Creation

Let's create the more complex features as discussed to potentially capture non-linear relationships and more detailed interactions.

In [15]:
# Polynomial Features to capture non-linear effects
combined_df['study_hours_sq'] = combined_df['study_hours'] ** 2
combined_df['class_attendance_sq'] = combined_df['class_attendance'] ** 2

# Aggregate 'Wellbeing' Feature
# We add 1 to the quality/rating scores to avoid multiplying by zero for the 'low' or 'poor' categories.
combined_df['wellbeing_index'] = combined_df['sleep_hours'] * (combined_df['sleep_quality'] + 1)

# Complex Interaction Features
combined_df['quality_study_hours'] = combined_df['study_hours'] * (combined_df['sleep_quality'] + 1)
combined_df['study_environment_score'] = combined_df['internet_access_yes'].astype(int) * (combined_df['facility_rating'] + 1)

print("New advanced features created.")
combined_df[['study_hours_sq', 'wellbeing_index', 'quality_study_hours', 'study_environment_score']].head()

New advanced features created.


Unnamed: 0,study_hours_sq,wellbeing_index,quality_study_hours,study_environment_score
0,62.5681,9.8,15.82,0
1,24.5025,4.7,4.95,2
2,21.9024,5.8,4.68,3
3,4.0,16.6,4.0,3
4,58.5225,28.8,22.95,3


In [16]:
# Binning Age
age_bins = [17, 20, 23, 26] # Bins for 18-20, 21-23, 24-26
age_labels = [0, 1, 2] # Using numerical labels directly
combined_df['age_group'] = pd.cut(combined_df['age'], bins=age_bins, labels=age_labels, right=True)

# Binning Attendance
attendance_bins = [-1, 50, 80, 101] # Bins for [0-49], [50-79], [80-100]
attendance_labels = [0, 1, 2] # low, medium, high
combined_df['attendance_level'] = pd.cut(combined_df['class_attendance'], bins=attendance_bins, labels=attendance_labels, right=True)

# The result of pd.cut with numerical labels is already usable. Let's check the new columns.
combined_df[['age', 'age_group', 'class_attendance', 'attendance_level']].head()

Unnamed: 0,age,age_group,class_attendance,attendance_level
0,21,1,98.8,2
1,18,0,94.8,2
2,20,0,92.6,2
3,19,0,49.5,0
4,23,1,86.9,2


## 4. Finalizing and Saving Processed Data

In [17]:
# Drop original id column as it's not a feature
combined_df = combined_df.drop('id', axis=1)

# Separate back into train and test sets
train_processed = combined_df.iloc[:len(train_df)]
test_processed = combined_df.iloc[len(train_df):]

# Add the target variable back to the training set
train_processed['exam_score'] = train_df['exam_score']

print("Processed Train DataFrame shape:", train_processed.shape)
print("Processed Test DataFrame shape:", test_processed.shape)

Processed Train DataFrame shape: (630000, 31)
Processed Test DataFrame shape: (270000, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_processed['exam_score'] = train_df['exam_score']


In [18]:
# Save the processed dataframes
train_processed.to_csv(PROCESSED_PATH / 'train_processed.csv', index=False)
test_processed.to_csv(PROCESSED_PATH / 'test_processed.csv', index=False)

print(f"Processed files saved to {PROCESSED_PATH}")

Processed files saved to ../../data/processed
