In [1]:
# Part 4 – Feature Engineering
# Let's create some new features:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# Load the cleaned dataset
df = pd.read_csv('cleaned_students.csv')

# 1. Create Programming Average feature
df['Programming_Avg'] = (df['Python'] + df['DB']) / 2

# 2. Create isAdult binary feature
df['isAdult'] = np.where(df['Age'] >= 25, 1, 0)

# 3. Transform studyHOURS into categories
df['studyHOURS_Category'] = pd.cut(df['studyHOURS'], 
                                  bins=[0, 130, 150, 200], 
                                  labels=['Low', 'Medium', 'High'])

# Display the new features
print("New features created:")
print(df[['Python', 'DB', 'Programming_Avg', 'Age', 'isAdult', 'studyHOURS', 'studyHOURS_Category']].head(10))

# Check correlation of new features with target variables
correlation_with_python = df[['Programming_Avg', 'isAdult', 'studyHOURS']].corrwith(df['Python'])
correlation_with_db = df[['Programming_Avg', 'isAdult', 'studyHOURS']].corrwith(df['DB'])

print("\nCorrelation with Python scores:")
print(correlation_with_python)
print("\nCorrelation with DB scores:")
print(correlation_with_db)

New features created:
   Python  DB  Programming_Avg  Age  isAdult  studyHOURS studyHOURS_Category
0    74.0  50             62.0   25        1         130                 Low
1    88.0  59             73.5   25        1         130                 Low
2    80.0  91             85.5   27        1         156                High
3    85.0  60             72.5   29        1         160                High
4    80.0  89             84.5   31        1         156                High
5    83.0  90             86.5   33        1         160                High
6    79.0  58             68.5   34        1         156                High
7    80.0  55             67.5   36        1         156                High
8    85.0  99             92.0   38        1         156                High
9    70.0  76             73.0   40        1         144              Medium

Correlation with Python scores:
Programming_Avg    0.669220
isAdult            0.176173
studyHOURS         0.738956
dtype: float64

In [2]:
# Question: Which engineered feature do you think would add the most predictive power to the model?

# Based on the correlation analysis, the Programming_Avg feature would likely add the most predictive power since it directly combines both programming scores. However, for predicting individual subject scores, studyHOURS shows reasonable correlation and could be valuable.

In [3]:
# Part 5 – Feature Scaling
# Detect Numeric Columns:
# Identify numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", numeric_columns)

# Remove non-feature numeric columns if needed
features_to_scale = ['Age', 'entryEXAM', 'studyHOURS', 'Python', 'DB', 'Programming_Avg']
print("Features selected for scaling:", features_to_scale)

Numeric columns: ['Age', 'entryEXAM', 'studyHOURS', 'Python', 'DB', 'Programming_Avg', 'isAdult']
Features selected for scaling: ['Age', 'entryEXAM', 'studyHOURS', 'Python', 'DB', 'Programming_Avg']


In [4]:
# Apply Scaling:
# Option 1: StandardScaler (mean=0, std=1)
standard_scaler = StandardScaler()
df_standard = df.copy()
df_standard[features_to_scale] = standard_scaler.fit_transform(df[features_to_scale])

print("StandardScaler results (first 5 rows):")
print(df_standard[features_to_scale].head())

# Option 2: MinMaxScaler (range 0-1)
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[features_to_scale] = minmax_scaler.fit_transform(df[features_to_scale])

print("\nMinMaxScaler results (first 5 rows):")
print(df_minmax[features_to_scale].head())

# Verify scaling results
print("\nStandardScaler statistics:")
print(df_standard[features_to_scale].describe().loc[['mean', 'std']])

print("\nMinMaxScaler statistics:")
print(df_minmax[features_to_scale].describe().loc[['min', 'max']])

StandardScaler results (first 5 rows):
        Age  entryEXAM  studyHOURS    Python        DB  Programming_Avg
0 -1.135224  -2.115336   -3.309518 -1.059057 -1.333821        -1.439059
1 -1.135224  -1.227324   -3.309518  1.059057 -0.783476        -0.290309
2 -0.914725   0.710158    0.289853 -0.151294  1.173304         0.908386
3 -0.694226   0.629429    0.843603  0.605176 -0.722327        -0.390200
4 -0.473728   0.548701    0.289853 -0.151294  1.051005         0.808495

MinMaxScaler results (first 5 rows):
        Age  entryEXAM  studyHOURS    Python        DB  Programming_Avg
0  0.083333   0.317460    0.000000  0.448276  0.253731         0.261905
1  0.083333   0.492063    0.000000  0.931034  0.388060         0.535714
2  0.125000   0.873016    0.866667  0.655172  0.865672         0.821429
3  0.166667   0.857143    1.000000  0.827586  0.402985         0.511905
4  0.208333   0.841270    0.866667  0.655172  0.835821         0.797619

StandardScaler statistics:
               Age     entryEXA

In [5]:
# Part 6 – Encoding Categorical Data
# Detect Categorical Columns:
# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical columns:", categorical_columns)

# Remove name columns from encoding (they are identifiers, not features)
categorical_features = ['gender', 'country', 'residence', 'prevEducation', 'studyHOURS_Category']
print("Categorical features for encoding:", categorical_features)

Categorical columns: ['fNAME', 'lNAME', 'gender', 'country', 'residence', 'prevEducation', 'studyHOURS_Category']
Categorical features for encoding: ['gender', 'country', 'residence', 'prevEducation', 'studyHOURS_Category']


In [6]:
# Handle Encoding:
# Method 1: Label Encoding (for ordinal data or tree-based models)
label_encoder = LabelEncoder()
df_label_encoded = df.copy()

for col in categorical_features:
    if col in df_label_encoded.columns:
        df_label_encoded[col + '_LabelEncoded'] = label_encoder.fit_transform(df_label_encoded[col])

print("Label Encoded features (first 5 rows):")
print(df_label_encoded[[col for col in df_label_encoded.columns if 'LabelEncoded' in col]].head())

# Method 2: One-Hot Encoding (for nominal data and linear models)
df_onehot = df.copy()

# One-hot encode categorical features
onehot_encoded = pd.get_dummies(df_onehot[categorical_features], prefix=categorical_features)

# Drop original categorical columns and concatenate one-hot encoded ones
df_onehot = df_onehot.drop(columns=categorical_features)
df_onehot = pd.concat([df_onehot, onehot_encoded], axis=1)

print("\nOne-Hot Encoded features (first 5 rows, showing only encoded columns):")
onehot_columns = [col for col in df_onehot.columns if any(feat in col for feat in categorical_features)]
print(df_onehot[onehot_columns].head())

# Check the shape after one-hot encoding
print(f"\nOriginal shape: {df.shape}")
print(f"After one-hot encoding: {df_onehot.shape}")
print(f"Number of new columns: {df_onehot.shape[1] - df.shape[1] + len(categorical_features)}")

Label Encoded features (first 5 rows):
   gender_LabelEncoded  country_LabelEncoded  residence_LabelEncoded  \
0                    1                    11                       2   
1                    0                     7                       0   
2                    1                     7                       0   
3                    1                     7                       0   
4                    1                     7                       0   

   prevEducation_LabelEncoded  studyHOURS_Category_LabelEncoded  
0                           3                                 1  
1                           3                                 1  
2                           0                                 0  
3                           0                                 0  
4                           0                                 0  

One-Hot Encoded features (first 5 rows, showing only encoded columns):
   gender_female  gender_male  country_Denmark  country_Fran

In [7]:
# Complete Pipeline and Final Export
# Create a final processed dataset with recommended approaches
def create_final_dataset(df):
    # Create engineered features
    df_final = df.copy()
    df_final['Programming_Avg'] = (df_final['Python'] + df_final['DB']) / 2
    df_final['isAdult'] = np.where(df_final['Age'] >= 25, 1, 0)
    
    # Scale numerical features (using StandardScaler as default)
    numeric_features = ['Age', 'entryEXAM', 'studyHOURS', 'Python', 'DB', 'Programming_Avg']
    scaler = StandardScaler()
    df_final[numeric_features] = scaler.fit_transform(df_final[numeric_features])
    
    # One-hot encode categorical features
    categorical_features = ['gender', 'country', 'residence', 'prevEducation']
    df_final = pd.get_dummies(df_final, columns=categorical_features, drop_first=True)
    
    return df_final

# Create the final processed dataset
df_final_processed = create_final_dataset(df)

print("Final processed dataset shape:", df_final_processed.shape)
print("\nFinal dataset columns:")
print(df_final_processed.columns.tolist())

# Export the fully processed dataset
df_final_processed.to_csv('fully_processed_students.csv', index=False)
print("\nFully processed dataset saved as 'fully_processed_students.csv'")

# Also export the dataset with feature engineering only (for comparison)
df_feature_engineered = df.copy()
df_feature_engineered['Programming_Avg'] = (df_feature_engineered['Python'] + df_feature_engineered['DB']) / 2
df_feature_engineered['isAdult'] = np.where(df_feature_engineered['Age'] >= 25, 1, 0)
df_feature_engineered['studyHOURS_Category'] = pd.cut(df_feature_engineered['studyHOURS'], 
                                                     bins=[0, 130, 150, 200], 
                                                     labels=['Low', 'Medium', 'High'])
df_feature_engineered.to_csv('feature_engineered_students.csv', index=False)
print("Feature engineered dataset saved as 'feature_engineered_students.csv'")

Final processed dataset shape: (64, 28)

Final dataset columns:
['fNAME', 'lNAME', 'Age', 'entryEXAM', 'studyHOURS', 'Python', 'DB', 'Programming_Avg', 'isAdult', 'studyHOURS_Category', 'gender_male', 'country_France', 'country_Germany', 'country_Italy', 'country_Kenya', 'country_Netherlands', 'country_Nigeria', 'country_Norway', 'country_Somalia', 'country_Spain', 'country_UK', 'country_Uganda', 'residence_Private', 'residence_Sognsvann', 'prevEducation_Diploma', 'prevEducation_Doctorate', 'prevEducation_High School', 'prevEducation_Masters']

Fully processed dataset saved as 'fully_processed_students.csv'
Feature engineered dataset saved as 'feature_engineered_students.csv'
