In [1]:
import pandas as pd
import numpy as np

# Load the student-performance dataset (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer='data/StudentPerformanceFactors.csv')

# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [2]:
# Count the missing values in each column
# (isna() is the alias of isnull(); sum() counts the True flags per column).
print(df.isna().sum())

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64


In [3]:
# Inspect the actual rows where Teacher_Quality is null:
# build a boolean mask and use it to select the matching rows.
df.loc[df['Teacher_Quality'].isna()]

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
127,17,97,Medium,Medium,No,8,89,Medium,Yes,1,Low,,Public,Neutral,4,No,High School,Far,Male,69
396,10,80,High,Medium,Yes,6,93,High,Yes,4,Medium,,Public,Negative,3,No,College,Moderate,Female,67
457,14,86,Medium,Low,Yes,8,90,Medium,No,3,Medium,,Public,Negative,3,Yes,Postgraduate,Near,Male,65
570,20,71,High,Medium,No,8,77,Medium,Yes,0,High,,Private,Neutral,4,No,High School,Moderate,Male,66
593,22,82,High,Medium,Yes,7,71,Low,Yes,1,High,,Private,Positive,3,No,High School,Near,Female,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6414,22,63,Low,Low,Yes,7,94,Medium,Yes,0,Medium,,Public,Negative,3,Yes,High School,Far,Male,61
6427,29,75,Low,Medium,Yes,7,96,Low,Yes,2,Low,,Public,Positive,3,No,High School,Moderate,Male,68
6461,12,89,Medium,High,Yes,8,78,Low,Yes,4,Low,,Public,Neutral,5,No,Postgraduate,Near,Female,69
6579,9,84,Medium,Medium,No,6,74,Medium,Yes,5,High,,Public,Neutral,2,No,High School,Near,Male,67


In [4]:
# Share of missing values per column, as a percentage rounded to 2 decimals
null_pct = df.isna().sum().div(len(df)).mul(100).round(2)

# Show only the columns that actually contain nulls
print(null_pct[null_pct.gt(0)])

Teacher_Quality             1.18
Parental_Education_Level    1.36
Distance_from_Home          1.01
dtype: float64


In [5]:
# Two approaches to missing data: **drop** the rows, or **fill** them.

# Dropping rows with dropna()
# Spelled-out defaults: remove a row (axis=0) if ANY of its values is null.
df_droped = df.dropna(axis=0, how='any')

# Report how many rows the drop cost us.
print(f"Before: {len(df)}, After: {len(df_droped)}")
print(f"Lost {len(df) - len(df_droped)} rows.")

Before: 6607, After: 6378
Lost 229 rows.


In [6]:
# Drop ONLY the rows where specific columns are null.
# `subset` limits the null check to the listed columns; nulls elsewhere are kept.
df_droped_specific = df.dropna(
    axis=0,
    subset=['Teacher_Quality'],
)
print(f"Dropped only Teacher_Quality nulls: {len(df)} → {len(df_droped_specific)} rows")

Dropped only Teacher_Quality nulls: 6607 → 6529 rows


In [8]:
# Filling values with fillna()

# Alternative to dropping: fill the nulls with a value.
# For categorical columns → use the MODE (most common value).
# mode() returns a Series (there can be ties); take its first entry.
mode_value = df['Teacher_Quality'].mode().iloc[0]
print(f"Most common Teacher_Quality: {mode_value}")

# Fill on a copy so the original `df` is left untouched.
df_filled = df.copy()
df_filled['Teacher_Quality'] = df_filled['Teacher_Quality'].fillna(value=mode_value)
print(f"Nulls after fill: {df_filled['Teacher_Quality'].isnull().sum()}")

Most common Teacher_Quality: Medium
Nulls after fill: 0


In [9]:
# Fill all categorical nulls with their mode.
# These are the three columns the null check reported as incomplete;
# Teacher_Quality was already filled earlier, so fillna() is a no-op for it.
for col in ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']:
    df_filled[col] = df_filled[col].fillna(df_filled[col].mode()[0])

# Verify - no more nulls.
# Use an f-string: print(count, " text") joins its arguments with a space,
# which together with the leading space in the literal printed "0  total ...".
print(f"{df_filled.isnull().sum().sum()} total nulls left after filling.")

0  total nulls left after filling.


In [10]:
# Handling duplicates

# Count the rows of the dataset that are exact repeats of an earlier row.
# duplicated() returns one boolean per row; sum() counts the True values.
print(f"Duplicates: {df.duplicated().sum()}")

Duplicates: 0


In [11]:
# The dataset has no duplicates, so build a practice frame that does:
# the first 100 rows followed by a second copy of the first 20.
# ignore_index=True renumbers the result 0..119 instead of repeating labels.
df_with_duplicates = pd.concat(
    objs=[df.head(n=100), df.head(n=20)],
    ignore_index=True,
)
print(f"Shape with duplicates: {df_with_duplicates.shape}")
print(f"Duplicates: {df_with_duplicates.duplicated().sum()}")

Shape with duplicates: (120, 20)
Duplicates: 20


In [12]:
# Preview the first 40 rows of the practice frame.
df_with_duplicates.head(n=40)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
5,19,88,Medium,Medium,Yes,8,89,Medium,Yes,3,Medium,Medium,Public,Positive,3,No,Postgraduate,Near,Male,71
6,29,84,Medium,Low,Yes,7,68,Low,Yes,1,Low,Medium,Private,Neutral,2,No,High School,Moderate,Male,67
7,25,78,Low,High,Yes,6,50,Medium,Yes,1,High,High,Public,Negative,2,No,High School,Far,Male,66
8,17,94,Medium,High,No,6,80,High,Yes,0,Medium,Low,Private,Neutral,1,No,College,Near,Male,69
9,23,98,Medium,Medium,Yes,8,71,Medium,Yes,0,High,High,Public,Positive,5,No,High School,Moderate,Male,72


In [19]:
# Boolean flag per row: True when the row repeats an earlier one.
# keep='first' (the default) leaves the first occurrence marked False,
# so only the appended copies at the end show up as True.
df_with_duplicates.duplicated(keep='first').head(n=110)

0      False
1      False
2      False
3      False
4      False
       ...  
105     True
106     True
107     True
108     True
109     True
Length: 110, dtype: bool

In [21]:
# Remove duplicates: keep the first occurrence of each row, drop the repeats.
# drop_duplicates() returns a new frame; df_with_duplicates is not modified.
df_clean = df_with_duplicates.drop_duplicates(keep='first')
print(f"Before: {len(df_with_duplicates)}, After: {len(df_clean)}")
print(f"Removed {len(df_with_duplicates) - len(df_clean)} duplicate rows.")

Before: 120, After: 100
Removed 20 duplicate rows.


In [22]:
# Check duplicates on specific columns only: a row counts as a duplicate
# when an earlier row has the same Hours_Studied AND Exam_Score,
# whatever the other columns contain.
(
    df_with_duplicates
    .duplicated(subset=['Hours_Studied', 'Exam_Score'])
    .sum()
)

np.int64(38)

In [29]:
# Cleaning Text Data

# Build a small, deliberately messy DataFrame to practice on:
#   Name  - stray leading/trailing spaces and inconsistent casing
#   City  - the same city written with different casing and padding
#   Score - numbers stored as strings, plus an 'N/A' placeholder
messy = pd.DataFrame(
    data=[
        ['  Alice ', 'new york', '85'],
        ['BOB', 'NEW YORK', '90'],
        [' charlie', 'New York ', 'N/A'],
        ['DIANA  ', ' new york', '78'],
        ['Eve', 'NEW YORK  ', '92'],
    ],
    columns=['Name', 'City', 'Score'],
)
messy

Unnamed: 0,Name,City,Score
0,Alice,new york,85.0
1,BOB,NEW YORK,90.0
2,charlie,New York,
3,DIANA,new york,78.0
4,Eve,NEW YORK,92.0


In [30]:
# Normalise names: trim surrounding whitespace, then Title-Case each word.
messy['Name'] = (
    messy['Name']
    .str.strip()
    .str.title()
)
messy

Unnamed: 0,Name,City,Score
0,Alice,new york,85.0
1,Bob,NEW YORK,90.0
2,Charlie,New York,
3,Diana,new york,78.0
4,Eve,NEW YORK,92.0


In [33]:
# Normalise cities: trim surrounding whitespace and lower-case everything,
# so every spelling variant collapses to the same value.
messy['City'] = (
    messy['City']
    .str.strip()
    .str.lower()
)
messy

Unnamed: 0,Name,City,Score
0,Alice,new york,85.0
1,Bob,new york,90.0
2,Charlie,new york,
3,Diana,new york,78.0
4,Eve,new york,92.0


In [34]:
# Swap the 'N/A' placeholder string for a real NaN so pandas treats it as missing.
messy['Score'] = messy['Score'].replace(to_replace='N/A', value=np.nan)
messy

Unnamed: 0,Name,City,Score
0,Alice,new york,85.0
1,Bob,new york,90.0
2,Charlie,new york,
3,Diana,new york,78.0
4,Eve,new york,92.0


In [35]:
# Convert the column type with astype(). This only works once the data is
# clean: every remaining value must be parseable as a number (NaN is fine).
messy['Score'] = messy['Score'].astype('float64')

# Show the cleaned frame and confirm Score is now numeric.
print(messy)
print(messy.dtypes)

      Name      City  Score
0    Alice  new york   85.0
1      Bob  new york   90.0
2  Charlie  new york    NaN
3    Diana  new york   78.0
4      Eve  new york   92.0
Name      object
City      object
Score    float64
dtype: object
