# Data with Outliers

In [1]:
import pandas as pd
import numpy as np
# Data collection: read the placement records into a DataFrame.
dataset = pd.read_csv("Placement.csv")

# Split the columns into quantitative (numeric) and qualitative (all others).
# Selecting on "number" instead of comparing the dtype with `object` keeps
# boolean, datetime or string-typed columns out of the numeric statistics.
# `sl_no` is left out of the statistics (presumably a row serial number -
# confirm); filtering it out also avoids an error when the column is absent.
quan = [col for col in dataset.select_dtypes(include="number").columns if col != "sl_no"]
qual = list(dataset.select_dtypes(exclude="number").columns)

# describe() is computed once up front rather than once per lookup in the loop.
summary = dataset.describe()

# Table of central tendency and descriptive measures: one row per measure,
# one column per quantitative variable.
descriptive = pd.DataFrame(
    index=["Mean", "Median", "Mode",
           "Q0-Min-0%", "Q1-25%", "Q2-50%", "Q3-75%", "Q4-Max-100%",
           "IQR", "1.5-Rule", "Lower-Bound", "Upper-Bound"],
    columns=quan,
)
for col in quan:
    values = dataset[col]
    q1 = summary.loc["25%", col]
    q3 = summary.loc["75%", col]
    iqr = q3 - q1
    fence = 1.5 * iqr  # distance of the outlier fences from the quartiles

    descriptive.loc["Mean", col] = round(values.mean(), 2)
    descriptive.loc["Median", col] = values.median()
    # mode() can return several values; the first one is reported.
    descriptive.loc["Mode", col] = round(values.mode()[0], 2)
    descriptive.loc["Q0-Min-0%", col] = summary.loc["min", col]
    descriptive.loc["Q1-25%", col] = q1
    descriptive.loc["Q2-50%", col] = summary.loc["50%", col]
    descriptive.loc["Q3-75%", col] = q3
    descriptive.loc["Q4-Max-100%", col] = summary.loc["max", col]
    descriptive.loc["IQR", col] = iqr
    descriptive.loc["1.5-Rule", col] = fence
    descriptive.loc["Lower-Bound", col] = q1 - fence
    descriptive.loc["Upper-Bound", col] = q3 + fence
# Bare expression: the notebook renders it only when it is the last
# statement of its cell.
descriptive

# Flag every quantitative column whose extreme value lies outside its
# 1.5 * IQR fences, using the figures already stored in `descriptive`.
lesser = []   # columns whose minimum is below the lower bound
greater = []  # columns whose maximum is above the upper bound
for col in quan:
    lowest = descriptive.at["Q0-Min-0%", col]
    highest = descriptive.at["Q4-Max-100%", col]

    if lowest < descriptive.at["Lower-Bound", col]:
        lesser.append(col)
        print(f"Lower outlier in :'{col}' value :", lowest)

    if highest > descriptive.at["Upper-Bound", col]:
        greater.append(col)
        print(f"Upper outlier in :'{col}' value :", highest)
print(lesser, greater)

Lower outlier in :'hsc_p' value : 37.0
Upper outlier in :'hsc_p' value : 97.7
Upper outlier in :'degree_p' value : 91.0
Upper outlier in :'salary' value : 940000.0
['hsc_p'] ['hsc_p', 'degree_p', 'salary']


# Replace outliers with the lower and upper bounds

In [3]:
import pandas as pd
import numpy as np

# Load dataset: read Placement.csv (path relative to the working directory)
# into a DataFrame and bind it to `dataset`.
dataset = pd.read_csv("Placement.csv")

# Function to replace outliers using np.percentile
def replace_outliers(df):
    """Cap IQR outliers in every numeric column of ``df``.

    For each numeric column the quartiles Q1 and Q3 are taken with
    ``np.percentile`` over the non-missing values, and the bounds are
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Values below the lower bound
    are replaced by the lower bound and values above the upper bound by the
    upper bound. Missing values are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place; pass a copy to keep the
        original values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after capping.

    Notes
    -----
    One summary line per processed column is printed. Non-numeric columns
    (text, boolean, datetime, ...) are skipped, as are numeric columns that
    contain no non-missing value.
    """
    # "number" selects only numeric dtypes, so quartiles are never requested
    # for columns where they are undefined.
    for col in df.select_dtypes(include="number").columns:
        observed = df[col].dropna()
        if observed.empty:
            # No data to derive quartiles from; nothing to cap.
            continue

        # Q1 (25th percentile) and Q3 (75th percentile);
        # df[col].quantile([0.25, 0.75]) is the pandas way to get the same.
        Q1, Q3 = np.percentile(observed, [25, 75])
        IQR = Q3 - Q1

        # Bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # clip() caps both tails at once and keeps NaN untouched; assigning
        # the whole column lets an integer column become float when a
        # fractional bound has to be stored.
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

        print(f"{col}: Outliers replaced → lower < {lower_bound:.2f}, upper > {upper_bound:.2f}")

    return df

# Apply function: take a copy so `dataset` keeps its raw values, discard the
# unwanted `sl_no` column, then cap the outliers of what remains.
cleaned_dataset = replace_outliers(dataset.copy().drop(columns=["sl_no"]))

# Print cleaned dataset
print(cleaned_dataset)

ssc_p: Outliers replaced → lower < 37.95, upper > 98.35
hsc_p: Outliers replaced → lower < 42.75, upper > 91.15
degree_p: Outliers replaced → lower < 44.50, upper > 88.50
etest_p: Outliers replaced → lower < 24.75, upper > 118.75
mba_p: Outliers replaced → lower < 45.48, upper > 78.72
salary: Outliers replaced → lower < 150000.00, upper > 390000.00
    gender  ssc_p    ssc_b  hsc_p    hsc_b     hsc_s  degree_p   degree_t  \
0        M  67.00   Others  91.00   Others  Commerce     58.00   Sci&Tech   
1        M  79.33  Central  78.33   Others   Science     77.48   Sci&Tech   
2        M  65.00  Central  68.00  Central      Arts     64.00  Comm&Mgmt   
3        M  56.00  Central  52.00  Central   Science     52.00   Sci&Tech   
4        M  85.80  Central  73.60  Central  Commerce     73.30  Comm&Mgmt   
..     ...    ...      ...    ...      ...       ...       ...        ...   
210      M  80.60   Others  82.00   Others  Commerce     77.60  Comm&Mgmt   
211      M  58.00   Others  60.00

# Cleaned data without outliers

In [4]:
# Collect quantitative (numeric) and qualitative (all other) columns.
# Selecting on "number" instead of comparing the dtype with `object` keeps
# boolean, datetime or string-typed columns out of the numeric statistics.
quan = list(cleaned_dataset.select_dtypes(include="number").columns)
qual = list(cleaned_dataset.select_dtypes(exclude="number").columns)

# describe() is computed once up front rather than once per lookup in the loop.
summary = cleaned_dataset.describe()

# Table of central tendency and descriptive measures: one row per measure,
# one column per quantitative variable.
descriptive = pd.DataFrame(
    index=["Mean", "Median", "Mode",
           "Min", "Q1-25%", "Q2-50%", "Q3-75%", "Max",
           "IQR", "1.5-Rule", "Lower-Bound", "Upper-Bound"],
    columns=quan,
)
for col in quan:
    values = cleaned_dataset[col]
    q1 = summary.loc["25%", col]
    q3 = summary.loc["75%", col]
    iqr = q3 - q1
    fence = 1.5 * iqr  # distance of the outlier fences from the quartiles

    descriptive.loc["Mean", col] = round(values.mean(), 2)
    descriptive.loc["Median", col] = values.median()
    # mode() can return several values; the first one is reported.
    descriptive.loc["Mode", col] = round(values.mode()[0], 2)
    descriptive.loc["Min", col] = summary.loc["min", col]
    descriptive.loc["Q1-25%", col] = q1
    descriptive.loc["Q2-50%", col] = summary.loc["50%", col]
    descriptive.loc["Q3-75%", col] = q3
    descriptive.loc["Max", col] = summary.loc["max", col]
    descriptive.loc["IQR", col] = iqr
    descriptive.loc["1.5-Rule", col] = fence
    descriptive.loc["Lower-Bound", col] = q1 - fence
    descriptive.loc["Upper-Bound", col] = q3 + fence
print(descriptive)
# Re-run the fence check on the cleaned table: a column is reported when its
# minimum is below the lower bound or its maximum is above the upper bound.
lesser, greater = [], []
for name in quan:
    col_min = descriptive.at["Min", name]
    col_max = descriptive.at["Max", name]
    below_fence = col_min < descriptive.at["Lower-Bound", name]
    above_fence = col_max > descriptive.at["Upper-Bound", name]

    if below_fence:
        lesser.append(name)
        print(f"Lower outlier in :'{name}' value :", col_min)
    if above_fence:
        greater.append(name)
        print(f"Upper outlier in :'{name}' value :", col_max)
print(lesser, greater)

             ssc_p  hsc_p degree_p etest_p   mba_p     salary
Mean          67.3  66.33    66.36    72.1   62.28  277648.65
Median        67.0   65.0     66.0    71.0    62.0   265000.0
Mode          62.0   63.0     65.0    60.0    56.7   300000.0
Min          40.89  42.75     50.0    50.0   51.21   200000.0
Q1-25%        60.6   60.9     61.0    60.0  57.945   240000.0
Q2-50%        67.0   65.0     66.0    71.0    62.0   265000.0
Q3-75%        75.7   73.0     72.0    83.5  66.255   300000.0
Max           89.4  91.15     88.5    98.0   77.89   390000.0
IQR           15.1   12.1     11.0    23.5    8.31    60000.0
1.5-Rule     22.65  18.15     16.5   35.25  12.465    90000.0
Lower-Bound  37.95  42.75     44.5   24.75   45.48   150000.0
Upper-Bound  98.35  91.15     88.5  118.75   78.72   390000.0
[] []
