In [1]:
# apply() and Lambda Functions

import pandas as pd
import numpy as np


# Load the dataset and set up the working DataFrame
df = pd.read_csv(filepath_or_buffer='data/StudentPerformanceFactors.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [None]:
# apply()

# Define a function to classify scores

def classify_score(score):
    """Map a numeric exam score to a performance label.

    Returns 'Excellent' for scores of 80 or more, 'Good' for scores of at
    least 65 but below 80, and 'Needs Improvement' for everything else.
    """
    # Cutoffs are checked from highest to lowest; the first one met wins.
    for cutoff, label in ((80, 'Excellent'), (65, 'Good')):
        if score >= cutoff:
            return label
    return 'Needs Improvement'
    
# Run classify_score on every Exam_Score value and store the resulting labels
df['Performance'] = df['Exam_Score'].apply(classify_score)

# Show the first 30 rows, restricted to the score and its label
df.head(30)[['Exam_Score', 'Performance']]

Unnamed: 0,Exam_Score,Performance
0,67,Good
1,61,Needs Improvement
2,74,Good
3,71,Good
4,70,Good
5,71,Good
6,67,Good
7,66,Good
8,69,Good
9,72,Good


In [6]:
# Rows with an exam score of 80 or more, showing the score and its performance label
df.loc[df['Exam_Score'] >= 80, ['Exam_Score', 'Performance']].head(30)

Unnamed: 0,Exam_Score,Performance
94,100,Excellent
217,89,Excellent
404,86,Excellent
529,97,Excellent
558,83,Excellent
560,84,Excellent
637,80,Excellent
770,94,Excellent
836,94,Excellent
919,97,Excellent


In [7]:
# apply() with lambda functions
# A lambda is an anonymous one-line function; this one divides each
# Hours_Studied value by 7 and rounds the result to one decimal place.
df['Study_Hours_Per_Day'] = df['Hours_Studied'].apply(
    lambda hours: round(hours / 7, 1)
)
df.head()[['Hours_Studied', 'Study_Hours_Per_Day']]

Unnamed: 0,Hours_Studied,Study_Hours_Per_Day
0,23,3.3
1,19,2.7
2,24,3.4
3,29,4.1
4,19,2.7


In [9]:
# map() - Dictionary mapping

# Encode Motivation_Level as numbers
motivation_map = {
    'Low': 1,
    'Medium': 2,
    'High': 3,
}
df['Motivation_Score'] = df['Motivation_Level'].map(motivation_map)

# Spot-check the encoding on rows whose level is 'High'
df.loc[df['Motivation_Level'] == 'High', ['Motivation_Level', 'Motivation_Score']].head()

Unnamed: 0,Motivation_Level,Motivation_Score
8,High,3
20,High,3
23,High,3
25,High,3
57,High,3


In [10]:
# Boolean encoding
# Both Yes/No columns use the same 1/0 mapping, so it is written once and
# applied to each source column in turn.
df['Has_Internet'], df['Does_Extra'] = (
    df[source_col].map({'Yes': 1, 'No': 0})
    for source_col in ('Internet_Access', 'Extracurricular_Activities')
)
df.head()[['Internet_Access', 'Has_Internet', 'Extracurricular_Activities', 'Does_Extra']]

Unnamed: 0,Internet_Access,Has_Internet,Extracurricular_Activities,Does_Extra
0,Yes,1,No,0
1,Yes,1,No,0
2,Yes,1,Yes,1
3,Yes,1,Yes,1
4,Yes,1,Yes,1


In [11]:
# np.where and np.select
# np.where -> two-way choice (if/else): 'Pass' at 65 or above, otherwise 'Fail'
df['Pass_Fail'] = np.where(df['Exam_Score'].ge(65), 'Pass', 'Fail')
df.head()[['Exam_Score', 'Pass_Fail']]

Unnamed: 0,Exam_Score,Pass_Fail
0,67,Pass
1,61,Fail
2,74,Pass
3,71,Pass
4,70,Pass


In [14]:
# np.select - multiple conditions -> if/elif/elif.../else
# The thresholds are listed from highest to lowest; a row takes the choice
# paired with the first condition it satisfies (a score of 100 gets 'A', not 'B').
conditions = [
    df['Exam_Score'].ge(90),
    df['Exam_Score'].ge(80),
    df['Exam_Score'].ge(65),
    df['Exam_Score'].lt(65),
]
choices = ['A', 'B', 'C', 'F']

# Rows matching none of the conditions receive the default label.
df['Grade'] = np.select(condlist=conditions, choicelist=choices, default='Unknown')
df.head()[['Exam_Score', 'Grade']]


Unnamed: 0,Exam_Score,Grade
0,67,C
1,61,F
2,74,C
3,71,C
4,70,C


In [15]:
# Rows with an exam score of 80 or more, showing the score and its letter grade
df.loc[df['Exam_Score'] >= 80, ['Exam_Score', 'Grade']].head(30)

Unnamed: 0,Exam_Score,Grade
94,100,A
217,89,B
404,86,B
529,97,A
558,83,B
560,84,B
637,80,B
770,94,A
836,94,A
919,97,A


In [16]:
# Binning - pd.cut and pd.qcut

# pd.cut -> fixed bin edges
# Intervals are closed on the right, e.g. (60, 70], so a score of exactly 70
# is labelled 'Low' rather than 'Medium'.
bins = [0, 60, 70, 80, 90, 100]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# include_lowest=True makes the first interval [0, 60] instead of (0, 60], so a
# score of exactly 0 is labelled 'Very Low' instead of silently becoming NaN.
df['Score_Band'] = pd.cut(
    df['Exam_Score'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
df[['Exam_Score', 'Score_Band']].head(30)

Unnamed: 0,Exam_Score,Score_Band
0,67,Low
1,61,Low
2,74,Medium
3,71,Medium
4,70,Low
5,71,Medium
6,67,Low
7,66,Low
8,69,Low
9,72,Medium


In [17]:
# pd.qcut - quantile-based groups
# Bin edges come from the quartiles of Exam_Score rather than fixed values.
# The group sizes shown by value_counts() are not equal for this column.
df['Score_Quartile'] = pd.qcut(
    x=df['Exam_Score'],
    q=4,
    labels=['Q1', 'Q2', 'Q3', 'Q4'],
)
df['Score_Quartile'].value_counts()

Score_Quartile
Q1    2131
Q4    1625
Q2    1468
Q3    1383
Name: count, dtype: int64

In [18]:
# assign() - Chainable feature engineering

# Create multiple columns in a single chain.
# Each column is given as a callable that receives the frame being assigned to,
# so this step does not refer back to the outer `df` variable and can sit in
# the middle of a longer method chain.
df_enhanced = df.assign(
    # Zero study hours would make the ratio inf; swap 0 for NaN so the result
    # is marked missing instead.
    Study_Efficiency=lambda d: d['Exam_Score'] / d['Hours_Studied'].replace(0, np.nan),
    # The comparison already yields booleans, so no np.where(..., True, False) wrapper is needed.
    High_Performer=lambda d: d['Exam_Score'] >= 80,
    # Right-closed bins: [0, 5] -> 'Low', (5, 7] -> 'Normal', (7, 10] -> 'High'.
    # include_lowest=True keeps a value of exactly 0 in 'Low' instead of NaN.
    Sleep_Category=lambda d: pd.cut(
        d['Sleep_Hours'],
        bins=[0, 5, 7, 10],
        labels=['Low', 'Normal', 'High'],
        include_lowest=True,
    ),
)

df_enhanced[['Exam_Score', 'Hours_Studied', 'Study_Efficiency', 'High_Performer', 'Sleep_Hours', 'Sleep_Category']].head()

Unnamed: 0,Exam_Score,Hours_Studied,Study_Efficiency,High_Performer,Sleep_Hours,Sleep_Category
0,67,23,2.913043,False,7,Normal
1,61,19,3.210526,False,8,High
2,74,24,3.083333,False,7,Normal
3,71,29,2.448276,False,8,High
4,70,19,3.684211,False,6,Normal
