In [50]:
# Import required libraries
# pandas: for data manipulation and analysis
# numpy: for numerical operations
import pandas as pd
import numpy as np

In [51]:
# Load 'data.csv' (path relative to the working directory) into the
# DataFrame `df`, which the cells below operate on.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Show the dtype pandas assigned to each column of `df`.
# The result is a Series indexed by column name.
df.dtypes

In [None]:
# Summary statistics per column: count, mean, standard deviation, min,
# the requested percentiles and max. The percentiles listed here are the
# pandas defaults (25th, 50th, 75th), spelled out for clarity.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Frequency table of the Age column: one entry per distinct Age value,
# holding the number of rows that carry that value.
df.groupby('Age').size()

In [None]:
# Number of missing (NaN) entries in every column.
# isnull() flags each missing cell; summing down the rows counts them.
df.isnull().sum(axis=0)

In [None]:
# Display only the rows that contain at least one missing (null) value.
# df.isnull() on its own returns a same-shaped boolean mask of the whole
# frame rather than the affected rows, so the mask is reduced per row with
# any(axis=1) and used to filter `df`. An empty result means no row has
# missing data.
df[df.isnull().any(axis=1)]

In [None]:
# Report the dtypes of the Age and BMI columns.
# type("Age") / type("BMI") only inspect the string literals themselves
# (always `str`) and, not being the last expression of the cell, were never
# displayed; the column dtypes are what is actually of interest here.
# reindex() yields NaN instead of raising if one of the columns is absent.
print(df.dtypes.reindex(['Age', 'BMI']))
# Preview the Age column cast to float. The result is not assigned, so `df`
# itself is left unchanged by this cell.
df['Age'].astype(float)

In [58]:
# Store the Age column as float.
# Bracket indexing is used instead of attribute access (df.Age = ...):
# attribute assignment only updates a column when one with that name already
# exists, otherwise it silently creates a plain Python attribute on the
# DataFrame object instead of a column.
df['Age'] = df['Age'].astype(float)

In [None]:
# Rows ordered by Age, smallest value first.
# sort_values returns a new DataFrame; `df` keeps its current order.
df.sort_values(by='Age', ascending=True)

In [None]:
# Rows ordered by Age, largest value first.
# sort_values returns a new DataFrame; `df` keeps its current order.
df.sort_values(by='Age', ascending=False)

In [None]:
# View of the data with the 'Age' column relabelled as 'Year Old'.
# The renamed frame is only displayed, not assigned back, so `df` still
# has a column called 'Age' afterwards.
df.rename({'Age': 'Year Old'}, axis='columns')

In [None]:
# Rows ordered by their index labels, ascending.
# Returns a new DataFrame; `df` is not modified.
df.sort_index(axis=0, ascending=True)

In [None]:
# Replace the index with a fresh 0..n-1 range. drop=False (the default)
# keeps the previous index as a regular column, named 'index' when the
# index has no name of its own. The result is displayed, not assigned.
df.reset_index(drop=False)

In [None]:
# View of the data without the 'Age' column; every other column is kept.
# drop returns a new DataFrame, so `df` still contains 'Age' afterwards.
df.drop('Age', axis='columns')

In [None]:
# Unpivot the frame from wide to long layout: every cell becomes its own
# row, identified by the originating column name and holding that cell's
# value.
df.melt()

In [None]:
# Rows with fully identical values across all columns collapsed to their
# first occurrence. Returns a new DataFrame; `df` is not modified.
df.drop_duplicates(keep='first')

In [None]:
# Show the first 5 and the last 5 rows of the DataFrame.
# A notebook cell only renders the value of its final expression, so a bare
# df.head(5) followed by df.tail(5) would silently discard the head.
# display() (provided by IPython/Jupyter) renders both tables explicitly.
display(df.head(5), df.tail(5))

In [None]:
# Per-column tally of non-null entries (axis=0 counts down the rows).
df.count(axis=0)

In [None]:
# List the distinct values found in every column, one line per column.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f"Unique values in {column_name}:{distinct_values}")

In [None]:
# Quartiles (25th, 50th and 75th percentiles) of every numeric column.
# quantile() defaults to q=0.5 and would return only the median, so the
# three quantile levels are requested explicitly; the result has one row
# per level. numeric_only=True leaves non-numeric columns out.
df.quantile([0.25, 0.5, 0.75], numeric_only=True)

In [None]:
# Median of every numeric column, obtained as the 0.5 quantile.
# numeric_only=True leaves non-numeric columns out of the computation.
df.quantile(q=0.5, numeric_only=True)

In [None]:
# Smallest value found in each column (reduction runs down the rows).
df.min(axis=0)

In [None]:
# Largest value found in each column (reduction runs down the rows).
df.max(axis=0)

In [None]:
# Walk over the columns and print the distinct values of each one.
# Handy for spotting unexpected categories or malformed entries.
for label in list(df.columns):
    uniques = df[label].unique()
    print(f"Unique values in {label}:{uniques}")

In [None]:
# Arithmetic mean of every numeric column.
# numeric_only=True keeps non-numeric columns out of the computation.
df.mean(axis=0, numeric_only=True)

In [None]:
# Average value per numeric column (reduction runs along the index).
# Non-numeric columns are skipped via numeric_only=True.
df.mean(axis='index', numeric_only=True)

In [77]:
# Force every column to a numeric dtype, column by column.
# errors='coerce' turns any value that cannot be parsed as a number into
# NaN instead of raising. Note that `df` is overwritten here, so all later
# cells see the coerced data.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# Sample standard deviation of each column (ddof=1 is the pandas default),
# i.e. how widely the values spread around the column mean.
df.std(axis=0, ddof=1)

In [None]:
# One-hot encode the Age column: the result has one indicator column per
# distinct Age value.
df['Age'].pipe(pd.get_dummies)

In [None]:
# One-hot encode the Glucose column: the result has one indicator column
# per distinct Glucose value.
pd.get_dummies(data=df['Glucose'])


In [None]:
# Plain-text printout of the one-hot encoding of the Glucose column
# (one indicator column per distinct Glucose value).
print(df['Glucose'].pipe(pd.get_dummies))

In [None]:
# Positional slice of the rows at positions 2 up to and including 49
# (the stop value 50 is exclusive), with all columns kept.
# That is at most 48 rows, fewer if the frame is shorter.
df.iloc[2:50, :]