In [55]:
# Importing libraries 
# Importing essential libraries for data analysis and visualization
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import matplotlib.pyplot as plt  # For creating visualizations

In [56]:
# Load the academic dataset from disk into a DataFrame named `df`.
# The path is relative, so the CSV must sit in the notebook's working directory.
# NOTE(review): the file name is spelled "acdemic" (not "academic") — confirm it
# matches the file on disk before renaming anything.
df = pd.read_csv(filepath_or_buffer="acdemic_data.csv")

In [None]:
# Preview the top of the frame (5 rows, which is also head()'s default)
# as a quick sanity check of the columns and values that were just loaded.
print(df.head(n=5))

In [58]:
# Extra spellings of "missing" that should be parsed as NaN when the CSV is read:
#   "n.a." / "NA" / "n/a" / "na"  - variants of "not available"
#   "?"                           - unknown value
#   "--"                          - dash placeholder
missing_value_formats = ["n.a.", "?", "NA", "n/a", "na", "--"]
# Reload the CSV, this time treating every marker in `missing_value_formats`
# as NaN. With pandas' defaults, na_values adds to the built-in NA strings
# rather than replacing them.
# This rebinds `df`, discarding the frame loaded by the earlier read.
df = pd.read_csv(
    "acdemic_data.csv",
    na_values=missing_value_formats,
)

In [None]:
# Peek at the first ten Gender entries to see how missing entries were parsed
# after the reload with custom NA markers.
print(df["Gender"].iloc[:10])

In [None]:
# Boolean view of the first ten Gender rows: True where the value is missing,
# False where it is present. isna() is the same check as isnull().
print(df["Gender"].isna().head(n=10))

In [None]:
# Inverse of the previous check: True where Gender holds a value, False where
# it is missing. notna() is the same check as notnull().
print(df["Gender"].notna().head(n=10))

In [62]:
# Build a boolean mask over Gender: True for rows that hold a value, False for
# rows where it is missing. Despite its name, `null_filter` marks the rows to
# KEEP when it is used to index the frame.
null_filter = df["Gender"].notna()

In [None]:
# Select only the rows whose Gender is present by indexing with the boolean
# mask `null_filter`. Other columns may still contain missing values.
# Note: this prints the whole filtered frame, not just a preview.
print(df.loc[null_filter])

In [None]:
# Single yes/no answer: does the frame contain a missing value anywhere?
#   isna()     -> boolean frame, True at every missing cell
#   to_numpy() -> the same flags as a NumPy array
#   any()      -> True if at least one flag is set
print(df.isna().to_numpy().any())

In [65]:
# Discard every row that has at least one missing value and keep the result
# under the same name `df` (how="any" is dropna()'s default, spelled out here).
# NOTE(review): from this point `df` contains no nulls, so the dropna / fillna /
# ffill / bfill cells that follow have nothing left to act on — confirm that
# this is the intended order.
df = df.dropna(axis="index", how="any")


In [66]:
# Row-wise drop, loose condition: build `new_df` without any row that has one
# or more missing values. dropna() returns a new frame, so `df` is untouched.
new_df = df.dropna(axis="index", how="any")

In [67]:
# Row-wise drop, strict condition: a row is removed only when every one of its
# values is missing. The result replaces the previous `new_df`.
new_df = df.dropna(axis="index", how="all")

In [68]:
# Column-wise drop, loose condition: remove every column that has at least one
# missing value, keeping only fully populated columns. The result replaces the
# previous `new_df`; `df` itself is not modified.
new_df = df.dropna(axis="columns", how="any")


In [69]:
# Column-wise drop, strict condition: remove a column only when all of its
# values are missing, i.e. discard completely empty columns. The result
# replaces the previous `new_df`; `df` itself is not modified.
new_df = df.dropna(axis="columns", how="all")

In [None]:
# Replace missing SPOS values with the constant 0.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['SPOS']: the in-place form acts on an
# intermediate Series (chained assignment), which raises a FutureWarning in
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
# This also matches the assignment style used by the ffill()/bfill() cells.
df['SPOS'] = df['SPOS'].fillna(0)



In [None]:
# Verify the fill: show the first ten SPOS values, where any previously
# missing entries should now read 0.
print(df["SPOS"].iloc[:10])


In [None]:
# Forward fill DSBDA: each missing value takes the last valid value above it.
# A gap at the very start stays NaN because nothing precedes it.
# ffill() returns a new Series, so the result is written back to the column.
# Typical use: ordered data where carrying the previous value forward is sensible.
df["DSBDA"] = df["DSBDA"].ffill()

# Inspect the first ten DSBDA values after the fill.
print(df["DSBDA"].head(n=10))



In [None]:
# Backward fill SPOS: each missing value takes the next valid value below it.
# A gap at the very end stays NaN because nothing follows it.
# NOTE(review): SPOS was already filled with 0 by the earlier fillna cell, so
# when the notebook runs top to bottom there should be nothing left to fill here.
df["SPOS"] = df["SPOS"].bfill()

# Inspect the first ten SPOS values after the fill.
print(df["SPOS"].head(n=10))


In [None]:
# Show the first five rows (head()'s default count, made explicit here) as the
# cell's output, to check the frame after the cleaning steps above.
# Column names and values are displayed; data types are not.
df.head(n=5)

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
# Shown as the cell's output; useful for seeing how many records remain
# after the cleaning steps above.
df.shape

In [None]:
# Box plot of the SPOS column, to inspect its spread and spot outliers.
# How to read it (matplotlib defaults):
#   * line inside the box - median
#   * box edges           - first and third quartiles
#   * whiskers            - farthest data points within 1.5 x IQR of the box
#   * individual points   - values beyond the whiskers (potential outliers)
# An explicit Figure/Axes pair is used so the plot can carry a title and an
# axis label, and plt.show() keeps the artist-dict repr out of the cell output.
fig, ax = plt.subplots()
ax.boxplot(x=df["SPOS"])
ax.set_title("SPOS distribution")
ax.set_ylabel("SPOS")
plt.show()