## **1) Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## **2) Data Load and Initial Inspection**

In [4]:
# Read the raw dataset (path is relative to the working directory) and preview the first rows.
df = pd.read_csv('StudentsPerformance.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Column dtypes, non-null counts, and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [6]:
# Summary statistics for the numeric (score) columns.
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## **3) Data Cleaning**

In [8]:
# Normalize headers: trim surrounding whitespace, lowercase, spaces -> underscores.
# Other characters (e.g. the '/' in 'race/ethnicity') are left untouched.
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]

In [17]:
# Count fully duplicated rows (0 means every row is unique).
df.duplicated().sum()

np.int64(0)

In [9]:
# Missing-value count per column
df.isna().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental_level_of_education,0
lunch,0
test_preparation_course,0
math_score,0
reading_score,0
writing_score,0


In [11]:
# List the distinct values of each categorical column (one column per cell), starting with gender.
df['gender'].unique()

array(['female', 'male'], dtype=object)

In [12]:
# Distinct race/ethnicity groups.
df['race/ethnicity'].unique()

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [13]:
# Distinct parental education levels.
df['parental_level_of_education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [14]:
# Distinct lunch types.
df['lunch'].unique()

array(['standard', 'free/reduced'], dtype=object)

In [15]:
# Distinct test-preparation statuses.
df['test_preparation_course'].unique()

array(['none', 'completed'], dtype=object)

In [20]:
# Validate value ranges: every score must lie within [0, 100].
# Both bounds are combined into a single mask so the displayed counts cover
# values above 100 as well as below 0. (A bare expression that is not the last
# line of a cell is evaluated but never displayed, so two separate checks in
# one cell would only report the second.)
score_cols = ['math_score', 'reading_score', 'writing_score']
out_of_range = (df[score_cols] < 0) | (df[score_cols] > 100)
out_of_range.sum()

Unnamed: 0,0
math_score,0
reading_score,0
writing_score,0


In [23]:
# Derive each student's total and mean score across the three subjects
score_cols = ['math_score', 'reading_score', 'writing_score']
df['total_score'] = df[score_cols].sum(axis=1)
# Mean of the three subjects, kept to 2 decimal places
df['average_score'] = (df['total_score'] / 3).round(2)

In [24]:
# Export the cleaned frame to CSV in the working directory; index=False omits the row index column.
df.to_csv('cleaned_student_performance.csv', index=False)

## **4) Exploratory Data Analysis (EDA)**

In [27]:
# 1. Which parental education level is linked with the highest average math score?
# Mean math score per education level, highest first.
math_by_parent_edu = df.groupby('parental_level_of_education')['math_score'].mean()
math_by_parent_edu.sort_values(ascending=False)

Unnamed: 0_level_0,math_score
parental_level_of_education,Unnamed: 1_level_1
master's degree,69.745763
bachelor's degree,69.389831
associate's degree,67.882883
some college,67.128319
some high school,63.497207
high school,62.137755


In [28]:
# 2. Is there a significant score difference between males and females across all subjects?
# Compares per-gender subject means only; no statistical significance test is run here.
score_cols = ['math_score', 'reading_score', 'writing_score']
df.groupby('gender')[score_cols].mean()

Unnamed: 0_level_0,math_score,reading_score,writing_score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,72.608108,72.467181
male,68.728216,65.473029,63.311203


In [29]:
# 3. How much does completing the test preparation course improve performance in each subject?
# Subject means for 'completed' vs 'none'; the gap between the two rows is the observed difference.
score_cols = ['math_score', 'reading_score', 'writing_score']
df.groupby('test_preparation_course')[score_cols].mean()

Unnamed: 0_level_0,math_score,reading_score,writing_score
test_preparation_course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
completed,69.695531,73.893855,74.418994
none,64.077882,66.534268,64.504673


In [35]:
# 4. Which combination of gender, lunch type, and test preparation status produces the top 10% of scores?
score_cols = ['math_score', 'reading_score', 'writing_score']
df['total_score'] = df[score_cols].sum(axis=1)

# 90th-percentile cutoff; ties at the cutoff are kept, so the subset can exceed 10% of rows
threshold = df['total_score'].quantile(0.90)
top_10 = df.loc[df['total_score'].ge(threshold)]

# Head-count of top-decile students for each (gender, lunch, test prep) combination
combo_counts = top_10.groupby(['gender', 'lunch', 'test_preparation_course']).size()
combo_counts.reset_index(name='count')

Unnamed: 0,gender,lunch,test_preparation_course,count
0,female,free/reduced,completed,6
1,female,free/reduced,none,2
2,female,standard,completed,29
3,female,standard,none,31
4,male,free/reduced,completed,3
5,male,free/reduced,none,2
6,male,standard,completed,20
7,male,standard,none,9


In [38]:
# Number of rows in the top-10% subset
len(top_10)

102

In [39]:
# 5. Does lunch type have a uniform impact across all race/ethnicity groups, or does its effect vary?
# Mean scores (rounded to 2 d.p.) for every (race/ethnicity, lunch) pair. Compare the
# 'standard' and 'free/reduced' rows within each group to see how the gap varies.
# reset_index() is chained rather than applied with inplace=True, so the frame is
# built in a single expression and never mutated after creation.
grouped_lunch_race = (
    df.groupby(['race/ethnicity', 'lunch'])[['math_score', 'reading_score', 'writing_score', 'total_score']]
    .mean()
    .round(2)
    .reset_index()
)
grouped_lunch_race

Unnamed: 0,race/ethnicity,lunch,math_score,reading_score,writing_score,total_score
0,group A,free/reduced,55.22,60.56,57.19,172.97
1,group A,standard,65.98,67.47,66.4,199.85
2,group B,free/reduced,57.43,63.97,61.52,182.93
3,group B,standard,66.88,69.28,67.93,204.09
4,group C,free/reduced,56.41,63.41,61.41,181.24
5,group C,standard,68.94,72.27,71.4,212.6
6,group D,free/reduced,61.12,66.43,66.45,194.0
7,group D,standard,70.92,72.08,72.25,215.24
8,group E,free/reduced,66.56,68.73,67.2,202.49
9,group E,standard,76.83,74.81,73.15,224.79


In [41]:
# 6. What is the correlation between reading and writing scores? Is it stronger than math and writing?
# Pairwise correlation matrix of the three subject scores (pandas default method: Pearson).
score_cols = ['math_score', 'reading_score', 'writing_score']
df[score_cols].corr()

Unnamed: 0,math_score,reading_score,writing_score
math_score,1.0,0.81758,0.802642
reading_score,0.81758,1.0,0.954598
writing_score,0.802642,0.954598,1.0


In [48]:
# 7. Identify the top 5% performing students and analyze their demographic profiles. What patterns emerge?

# Cutoff: 95th percentile of total_score
top_5_threshold = df['total_score'].quantile(0.95)
# Keep students at or above the cutoff
top_5_df = df.loc[df['total_score'].ge(top_5_threshold)]
# Count students per demographic combination
profile_cols = ['gender', 'race/ethnicity', 'parental_level_of_education', 'test_preparation_course']
demographic_summary = top_5_df.groupby(profile_cols).size().reset_index(name='count')

# Most common combinations first
demographic_summary.sort_values(by='count', ascending=False)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,test_preparation_course,count
21,female,group E,associate's degree,none,3
20,female,group E,associate's degree,completed,2
1,female,group B,associate's degree,completed,2
9,female,group C,some college,completed,2
15,female,group D,master's degree,none,2
7,female,group C,bachelor's degree,completed,2
22,female,group E,bachelor's degree,completed,2
34,male,group C,bachelor's degree,completed,2
25,female,group E,master's degree,completed,2
32,male,group C,associate's degree,completed,2


In [51]:
# 8. Can we cluster students into performance categories (e.g., low, medium, high performers) using just Pandas logic? If yes, how?
# Create performance categories based on total_score
def performance_category(score, low_cutoff=None, high_cutoff=None):
    """Label a total score as 'High', 'Medium', or 'Low'.

    Parameters
    ----------
    score : number
        A student's total score.
    low_cutoff, high_cutoff : number, optional
        Boundaries between Low/Medium and Medium/High. When omitted, they fall
        back to the 25th and 75th percentiles of df['total_score'], so the
        one-argument call keeps working.

    Returns
    -------
    str
        'High' if score >= high_cutoff, 'Medium' if score >= low_cutoff,
        otherwise 'Low'.
    """
    if high_cutoff is None:
        high_cutoff = df['total_score'].quantile(0.75)
    if score >= high_cutoff:
        return 'High'

    if low_cutoff is None:
        low_cutoff = df['total_score'].quantile(0.25)
    if score >= low_cutoff:
        return 'Medium'
    return 'Low'

# Compute the quartile cutoffs once and pass them in, instead of re-running
# quantile() over the whole column for every row.
q1_cutoff = df['total_score'].quantile(0.25)
q3_cutoff = df['total_score'].quantile(0.75)
df['performance_category'] = df['total_score'].apply(
    performance_category, low_cutoff=q1_cutoff, high_cutoff=q3_cutoff
)
df['performance_category']


Unnamed: 0,performance_category
0,Medium
1,High
2,High
3,Low
4,Medium
...,...
995,High
996,Low
997,Medium
998,Medium


In [50]:
# Number of students in each performance category.
df['performance_category'].value_counts()

Unnamed: 0_level_0,count
performance_category,Unnamed: 1_level_1
Medium,499
High,254
Low,247


In [None]:
# Optional: mount Google Drive when running on Google Colab.
# The import is guarded so that running this cell outside Colab skips the mount
# instead of raising ModuleNotFoundError and aborting a Run All.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')