In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the placement records from a CSV in the working directory.
dataset = pd.read_csv("Placement.csv")
# Preview the first rows to sanity-check the parsed columns.
dataset.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [3]:
def quanQual(dataset):
    """Split the columns of ``dataset`` into quantitative and qualitative names.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are classified by dtype.

    Returns
    -------
    tuple of (list, list)
        ``(quan, qual)``: names of the numeric columns and names of the
        text columns, each in the frame's original column order.
    """
    # "number" matches every numeric width (int32, float32, ...), not only
    # the 64-bit dtypes, so the split does not depend on platform defaults.
    quan = dataset.select_dtypes(include="number").columns.tolist()
    # Text columns are object dtype on older pandas and a dedicated string
    # dtype on newer ones; requesting both keeps the result stable and
    # avoids the migration warning raised for include=["object"] alone.
    qual = dataset.select_dtypes(include=["object", "string"]).columns.tolist()
    return quan, qual

# Classify the loaded columns once; `quan` is reused by the later cells.
quan, qual = quanQual(dataset)

# NOTE(review): `sl_no` is picked up as quantitative but looks like a row
# identifier rather than a measurement -- confirm whether it should be excluded.
print("Quantitative:", quan)
print("Qualitative:", qual)

Quantitative: ['sl_no', 'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary']
Qualitative: ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  qual = dataset.select_dtypes(include=["object"]).columns.tolist()


In [4]:
# Force every quantitative column to a numeric dtype; values that cannot be
# parsed become NaN. `quan` was selected by numeric dtype, so this acts as
# a safeguard rather than a real conversion.
for name, values in dataset[quan].items():
    dataset[name] = pd.to_numeric(values, errors='coerce')

In [5]:
def descriptive_analysis(dataset, quan):
    """Build a table of descriptive statistics and IQR outlier fences.

    Missing values are dropped per column before any statistic is computed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source data.
    quan : list
        Names of the columns in ``dataset`` to summarise.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``quan`` and one row per statistic:
        Mean, Median, Mode, Min, 25%, 50%, 75%, Max, IQR, 1.5IQR,
        LesserRange (Q1 - 1.5*IQR) and GreaterRange (Q3 + 1.5*IQR).
        A column with no non-missing values yields NaN in every row.
    """
    stat_names = ["Mean", "Median", "Mode", "Min", "25%", "50%", "75%", "Max",
                  "IQR", "1.5IQR", "LesserRange", "GreaterRange"]

    # Collect the statistics first and build the frame once at the end, so
    # pandas infers a numeric dtype per column instead of the all-object
    # frame produced by filling a pre-allocated empty DataFrame cell by cell.
    stats_by_column = {}

    for col in quan:
        series = dataset[col].dropna()

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        whisker = 1.5 * iqr

        # mode() can return several values (ties); keep the first one and
        # use NaN, not None, when the column is empty so the row stays numeric.
        modes = series.mode()
        mode_value = modes.iloc[0] if not modes.empty else float("nan")

        # Order must match ``stat_names``.
        stats_by_column[col] = [
            series.mean(),
            series.median(),
            mode_value,
            series.min(),
            q1,
            series.quantile(0.50),
            q3,
            series.max(),
            iqr,
            whisker,
            q1 - whisker,
            q3 + whisker,
        ]

    return pd.DataFrame(stats_by_column, index=stat_names, columns=list(quan))

# Summarise every quantitative column; `descriptive` supplies the fences
# that the outlier cells further down read.
descriptive = descriptive_analysis(dataset, quan)
descriptive

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
Mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
Median,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
Mode,1.0,62.0,63.0,65.0,60.0,56.7,300000.0
Min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
Max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0
IQR,107.0,15.1,12.1,11.0,23.5,8.31,60000.0
1.5IQR,160.5,22.65,18.15,16.5,35.25,12.465,90000.0


In [6]:
# A column has low-side outliers when its minimum falls below the lower
# IQR fence, and high-side outliers when its maximum exceeds the upper fence.
lesser = [
    name for name in quan
    if descriptive.loc["LesserRange", name] > descriptive.loc["Min", name]
]
greater = [
    name for name in quan
    if descriptive.loc["GreaterRange", name] < descriptive.loc["Max", name]
]

print("Columns with Lesser Outliers:", lesser)
print("Columns with Greater Outliers:", greater)

Columns with Lesser Outliers: ['hsc_p']
Columns with Greater Outliers: ['hsc_p', 'degree_p', 'salary']


In [7]:
# Cap outliers at the IQR fences stored in `descriptive`.
#
# Series.clip is used instead of boolean-mask assignment: it caps the same
# values (NaN stays NaN) but returns a correctly-typed column, so writing a
# fractional fence into an integer column upcasts cleanly rather than
# tripping pandas' incompatible-dtype setitem deprecation.

for col in lesser:
    LR = descriptive.loc["LesserRange", col]
    dataset[col] = dataset[col].clip(lower=LR)

for col in greater:
    GR = descriptive.loc["GreaterRange", col]
    dataset[col] = dataset[col].clip(upper=GR)

print("Outliers capped successfully.")

Outliers capped successfully.


In [8]:
# Cross-check the 50th percentile of `ssc_p` with NumPy.
np.percentile(dataset["ssc_p"].to_numpy(), q=50)

67.0