# Data preprocessing for the IIIT placement CSV

In [151]:
from pathlib import Path

import numpy as np
import pandas as pd

In [152]:
# Load the raw IIIT placement table.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
df_iiit = pd.read_csv("/Users/anuragchaubey/smart-college-recommender/data/iiit_placement.csv")
# Preview a few random rows; the fixed seed keeps the preview identical across re-runs.
df_iiit.sample(5, random_state=42)

Unnamed: 0,Year,IIIT Name,Overall Avg (LPA),CSE Avg (LPA),ECE Avg (LPA)
86,,,,,
121,2024.0,IIIT Kurnool,8.2,8.2,8.2
70,2022.0,IIIT Dharwad,8.0,8.7,8.0
48,2021.0,IIIT Bhagalpur,10.35,10.35,10.35
111,2023.0,IIIT Bangalore,29.6,35.0,25.55


In [153]:
# discard records in which every column is missing
df_iiit.dropna(axis=0, how="all", inplace=True)


In [154]:
# replace the CSV headers with short snake_case names (assigned by position, so order matters)
clean_column_names = ['year', 'institute_name', 'overall_avg_ctc', 'cse_avg_ctc', 'ece_avg_ctc']
df_iiit.columns = clean_column_names

In [155]:
# first five rows, to confirm the new column names
df_iiit.head(n=5)

Unnamed: 0,year,institute_name,overall_avg_ctc,cse_avg_ctc,ece_avg_ctc
0,2020.0,IIIT Allahabad,20.83,20.83,16.0
1,2020.0,ABV-IIITM Gwalior,16.85,27.23,16.0
2,2020.0,IIITDM Jabalpur,6.42,8.5,21.49
3,2020.0,IIITDM Kancheepuram,9.8,9.6,11.09
4,2020.0,IIIT Kurnool,7.64,7.64,8.3


In [156]:
# drop rows that exactly repeat an earlier row, keeping the first occurrence
df_iiit.drop_duplicates(keep="first", inplace=True)

In [157]:
# index range, per-column non-null counts and dtypes after dropping empty/duplicate rows
df_iiit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, 0 to 144
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             140 non-null    float64
 1   institute_name   140 non-null    object 
 2   overall_avg_ctc  136 non-null    float64
 3   cse_avg_ctc      124 non-null    float64
 4   ece_avg_ctc      132 non-null    float64
dtypes: float64(4), object(1)
memory usage: 6.6+ KB


In [158]:
# years are parsed as floats (e.g. 2020.0); store them as integers
df_iiit['year'] = df_iiit['year'].astype(int)
# remove leading/trailing whitespace from institute names
df_iiit['institute_name'] = df_iiit['institute_name'].str.strip()
# The earlier de-duplication ran on the un-stripped names, so rows that differed only by
# stray whitespace were not recognised as duplicates; repeat it on the normalised values.
df_iiit.drop_duplicates(inplace=True)

In [159]:
# number of NaN entries in each column
df_iiit.isna().sum()

year                0
institute_name      0
overall_avg_ctc     4
cse_avg_ctc        16
ece_avg_ctc         8
dtype: int64

## Filling missing `overall_avg_ctc` values

In [160]:
# overall figure missing but both branch figures present: use the mean of CSE and ECE
has_both_branches = df_iiit['cse_avg_ctc'].notna() & df_iiit['ece_avg_ctc'].notna()
mask = df_iiit['overall_avg_ctc'].isna() & has_both_branches
branch_mean = df_iiit.loc[mask, 'cse_avg_ctc'].add(df_iiit.loc[mask, 'ece_avg_ctc']).div(2)
df_iiit.loc[mask, 'overall_avg_ctc'] = branch_mean

In [161]:
# overall figure still missing while a CSE figure exists (the ECE one is absent): copy the CSE average
mask_cse = df_iiit['cse_avg_ctc'].notna() & df_iiit['overall_avg_ctc'].isna()
cse_fallback = df_iiit.loc[mask_cse, 'cse_avg_ctc']
df_iiit.loc[mask_cse, 'overall_avg_ctc'] = cse_fallback

In [162]:
# overall figure still missing while an ECE figure exists (the CSE one is absent): copy the ECE average
mask_ece = df_iiit['ece_avg_ctc'].notna() & df_iiit['overall_avg_ctc'].isna()
ece_fallback = df_iiit.loc[mask_ece, 'ece_avg_ctc']
df_iiit.loc[mask_ece, 'overall_avg_ctc'] = ece_fallback

In [163]:
# re-check NaN counts after filling overall_avg_ctc
df_iiit.isna().sum()

year                0
institute_name      0
overall_avg_ctc     0
cse_avg_ctc        16
ece_avg_ctc         8
dtype: int64

## Filling missing `cse_avg_ctc` values

In [164]:
# impute a missing CSE average with the median CSE average of the same institute
# (the median is taken over all of that institute's available rows, not only earlier years)
cse_median_by_institute = df_iiit.groupby('institute_name')['cse_avg_ctc'].transform('median')
df_iiit['cse_avg_ctc'] = df_iiit['cse_avg_ctc'].fillna(cse_median_by_institute)


In [165]:
# any CSE value that is still missing falls back to the row's overall average
mask = df_iiit['overall_avg_ctc'].notna() & df_iiit['cse_avg_ctc'].isna()
overall_fallback = df_iiit.loc[mask, 'overall_avg_ctc']
df_iiit.loc[mask, 'cse_avg_ctc'] = overall_fallback

In [166]:
# re-check NaN counts after filling cse_avg_ctc
df_iiit.isna().sum()

year               0
institute_name     0
overall_avg_ctc    0
cse_avg_ctc        0
ece_avg_ctc        8
dtype: int64

## Filling missing `ece_avg_ctc` values

In [167]:
# impute a missing ECE average with the median ECE average of the same institute
# (the median is taken over all of that institute's available rows, not only earlier years)
ece_median_by_institute = df_iiit.groupby('institute_name')['ece_avg_ctc'].transform('median')
df_iiit['ece_avg_ctc'] = df_iiit['ece_avg_ctc'].fillna(ece_median_by_institute)

In [168]:
# any ECE value that is still missing falls back to the row's overall average
mask = df_iiit['overall_avg_ctc'].notna() & df_iiit['ece_avg_ctc'].isna()
overall_fallback = df_iiit.loc[mask, 'overall_avg_ctc']
df_iiit.loc[mask, 'ece_avg_ctc'] = overall_fallback

In [169]:
# re-check NaN counts after filling ece_avg_ctc
df_iiit.isna().sum()

year               0
institute_name     0
overall_avg_ctc    0
cse_avg_ctc        0
ece_avg_ctc        0
dtype: int64

In [None]:
# tag every row with the same institute category
INSTITUTE_TYPE = 'IIIT'
df_iiit['institute_type'] = INSTITUTE_TYPE

In [171]:
# CSE average minus ECE average; positive values mean the CSE figure is higher
df_iiit['cse_vs_ece_gap'] = df_iiit['cse_avg_ctc'].sub(df_iiit['ece_avg_ctc'])

In [None]:
# order rows by year, then institute name, and renumber the index from 0
df_iiit = df_iiit.sort_values(by=["year", "institute_name"], ignore_index=True)

In [173]:
# final structure check: index range, per-column non-null counts and dtypes
df_iiit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             140 non-null    int64  
 1   institute_name   140 non-null    object 
 2   overall_avg_ctc  140 non-null    float64
 3   cse_avg_ctc      140 non-null    float64
 4   ece_avg_ctc      140 non-null    float64
 5   institute_type   140 non-null    object 
 6   cse_vs_ece_gap   140 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 7.8+ KB


In [174]:
# summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df_iiit.describe()

Unnamed: 0,year,overall_avg_ctc,cse_avg_ctc,ece_avg_ctc,cse_vs_ece_gap
count,140.0,140.0,140.0,140.0,140.0
mean,2022.0,14.844143,15.5205,14.41875,1.10175
std,1.419292,6.293603,6.834817,5.867378,4.506705
min,2020.0,6.42,6.2,6.26,-12.99
25%,2021.0,10.1825,10.7825,10.35,-0.5325
50%,2022.0,13.1,13.395,12.78,0.475
75%,2023.0,18.105,18.7175,17.01,3.6975
max,2024.0,34.5,35.0,36.2,15.3


In [175]:
# final NaN count per column before exporting
df_iiit.isna().sum()

year               0
institute_name     0
overall_avg_ctc    0
cse_avg_ctc        0
ece_avg_ctc        0
institute_type     0
cse_vs_ece_gap     0
dtype: int64

In [None]:
# Write the cleaned table to disk without the index column.
# NOTE(review): this is an absolute, machine-specific path.
cleaned_csv_path = Path("/Users/anuragchaubey/smart-college-recommender/data/cleaned/iiit_placement_cleaned.csv")
# to_csv does not create missing folders, so make sure the target directory exists first
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_iiit.to_csv(cleaned_csv_path, index=False)