In [1]:
# Import required libraries for data analysis
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations and array handling
import statistics as st  # For statistical functions

In [2]:
# Read the loan dataset from CSV file into a pandas DataFrame
# This will load the data for further analysis and processing
df = pd.read_csv("loan.csv")

In [3]:
# Print the dimensions of the DataFrame (rows, columns)
# This shows the total number of rows and columns in the dataset
print(df.shape)

(500, 15)


In [4]:
# Display detailed information about the DataFrame including:
# - Data types of each column
# - Number of non-null values
# - Memory usage
# - Index information
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         500 non-null    int64  
 1   Loan_ID            500 non-null    object 
 2   Gender             491 non-null    object 
 3   Married            497 non-null    object 
 4   Dependents         488 non-null    object 
 5   Education          500 non-null    object 
 6   Self_Employed      473 non-null    object 
 7   ApplicantIncome    500 non-null    int64  
 8   CoapplicantIncome  500 non-null    float64
 9   LoanAmount         482 non-null    float64
 10  Loan_Amount_Term   486 non-null    float64
 11  Credit_History     459 non-null    float64
 12  Property_Area      500 non-null    object 
 13  Loan_Status        500 non-null    object 
 14  Total_Income       500 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 58.7+ KB
None


In [5]:
# Convert all columns to numeric type, replacing non-numeric values with NaN
# This ensures consistent data types for statistical calculations
# The 'coerce' parameter handles invalid values by converting them to NaN
df = df.apply(pd.to_numeric, errors='coerce')
df.mean()

Unnamed: 0            249.500000
Loan_ID                      NaN
Gender                       NaN
Married                      NaN
Dependents              0.533482
Education                    NaN
Self_Employed                NaN
ApplicantIncome      5493.644000
CoapplicantIncome    1506.307840
LoanAmount            144.020747
Loan_Amount_Term      342.543210
Credit_History          0.843137
Property_Area                NaN
Loan_Status                  NaN
Total_Income                 NaN
dtype: float64

In [6]:
# Calculate the maximum value for each numeric column in the DataFrame
# This returns a Series containing the highest value found in each column
df.max()

Unnamed: 0             499.0
Loan_ID                  NaN
Gender                   NaN
Married                  NaN
Dependents               2.0
Education                NaN
Self_Employed            NaN
ApplicantIncome      81000.0
CoapplicantIncome    20000.0
LoanAmount             700.0
Loan_Amount_Term       480.0
Credit_History           1.0
Property_Area            NaN
Loan_Status              NaN
Total_Income             NaN
dtype: float64

In [7]:
# Calculate the minimum value for each numeric column in the DataFrame
# This returns a Series containing the lowest value found in each column
df.min()

Unnamed: 0             0.0
Loan_ID                NaN
Gender                 NaN
Married                NaN
Dependents             0.0
Education              NaN
Self_Employed          NaN
ApplicantIncome      150.0
CoapplicantIncome      0.0
LoanAmount            17.0
Loan_Amount_Term      12.0
Credit_History         0.0
Property_Area          NaN
Loan_Status            NaN
Total_Income           NaN
dtype: float64

In [8]:
# Calculate the median value for each numeric column in the DataFrame
# This returns a Series containing the middle value of each column when sorted
# The median is less sensitive to outliers than the mean
df.median()

Unnamed: 0            249.5
Loan_ID                 NaN
Gender                  NaN
Married                 NaN
Dependents              0.0
Education               NaN
Self_Employed           NaN
ApplicantIncome      3854.0
CoapplicantIncome    1125.5
LoanAmount            126.5
Loan_Amount_Term      360.0
Credit_History          1.0
Property_Area           NaN
Loan_Status             NaN
Total_Income            NaN
dtype: float64

In [9]:
# Calculate the standard deviation for each numeric column in the DataFrame
# This returns a Series containing the measure of spread/dispersion for each column
# Standard deviation indicates how much the values deviate from the mean
df.std()

Unnamed: 0            144.481833
Loan_ID                      NaN
Gender                       NaN
Married                      NaN
Dependents              0.776468
Education                    NaN
Self_Employed                NaN
ApplicantIncome      6515.668972
CoapplicantIncome    2134.432188
LoanAmount             82.344919
Loan_Amount_Term       63.834977
Credit_History          0.364068
Property_Area                NaN
Loan_Status                  NaN
Total_Income                 NaN
dtype: float64

In [10]:
# Calculate the mean value of the 'LoanAmount' column
# Using loc accessor to select the specific column
# This provides a measure of central tendency for loan amounts
print(df.loc[:,'LoanAmount'].mean())

144.0207468879668


In [11]:
# Calculate the mean value for each row (axis=1) and display the first 5 rows
# This provides the average value across all numeric columns for each individual record
# The [0:5] slice limits the output to the first 5 rows for better readability
df.mean(axis = 1)[0:5]

0    1035.000000
1     940.285714
2     489.857143
3     775.000000
4     929.428571
dtype: float64

In [12]:
# Calculate the median value of the 'LoanAmount' column
# Using loc accessor to select the specific column
# The median represents the middle value when data is ordered
# This is more robust to outliers than the mean
print(df.loc[:,'LoanAmount'].median())

126.5


In [13]:
# Calculate the median value for each row (axis=1) and display the first 5 rows
# This provides the middle value across all numeric columns for each individual record
# The [0:5] slice limits the output to the first 5 rows for better readability
df.median(axis = 1)[0:5]

0      0.5
1    128.0
2      2.0
3    120.0
4      4.0
dtype: float64

In [14]:
# Calculate the variance for each numeric column in the DataFrame
# This returns a Series containing the measure of spread/dispersion for each column
# Variance indicates how far numbers are spread out from their average value
# It's the square of the standard deviation
df.var()

Unnamed: 0           2.087500e+04
Loan_ID                       NaN
Gender                        NaN
Married                       NaN
Dependents           6.029033e-01
Education                     NaN
Self_Employed                 NaN
ApplicantIncome      4.245394e+07
CoapplicantIncome    4.555801e+06
LoanAmount           6.780686e+03
Loan_Amount_Term     4.074904e+03
Credit_History       1.325456e-01
Property_Area                 NaN
Loan_Status                   NaN
Total_Income                  NaN
dtype: float64

In [15]:
# Group the DataFrame by 'Gender' and calculate the mean 'LoanAmount' for each gender
# This provides insight into average loan amounts by gender category
# The result shows how loan amounts are distributed between different genders
df[["Gender","LoanAmount"]].groupby("Gender").mean()

Unnamed: 0_level_0,LoanAmount
Gender,Unnamed: 1_level_1


In [16]:
# Group the DataFrame by 'Gender' and calculate the median 'LoanAmount' for each gender
# This provides insight into the middle loan amount by gender category
# The median is more robust to outliers than the mean, giving a better central tendency measure
# The result shows how loan amounts are distributed between different genders
df[["Gender","LoanAmount"]].groupby("Gender").median()

Unnamed: 0_level_0,LoanAmount
Gender,Unnamed: 1_level_1


In [17]:
# Group the DataFrame by 'Self_Employed' and calculate the mean 'LoanAmount' for each employment status
# This provides insight into average loan amounts by self-employment category
# The result shows how loan amounts are distributed between self-employed and non-self-employed individuals
# This analysis helps understand if self-employment status influences loan amounts
df[["Self_Employed","LoanAmount"]].groupby("Self_Employed").mean()

Unnamed: 0_level_0,LoanAmount
Self_Employed,Unnamed: 1_level_1
