# Data preprocessing for the NIT placement CSV

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Load the raw NIT placement data.
# NOTE(review): this is an absolute, machine-specific path; it only resolves
# where this exact folder layout exists. The space in "nit_ placement.csv" is
# kept exactly as written -- confirm it matches the file name on disk.
NIT_RAW_CSV = "/Users/anuragchaubey/smart-college-recommender/data/nit_ placement.csv"
df_nit = pd.read_csv(NIT_RAW_CSV)

# Preview five random rows; the fixed seed keeps the preview identical
# every time the notebook is re-run.
df_nit.sample(5, random_state=42)

Unnamed: 0,Year,NIT Name,Overall Avg CTC (LPA),CSE Avg (LPA),ECE Avg (LPA),EE Avg (LPA),ME Avg (LPA),Chemical Avg (LPA),Civil Avg (LPA)
136,2024.0,NIT Silchar,13.34,14.9,13.34,11.84,13.34,,13.34
122,2023.0,IIEST Shibpur,8.5,8.5,8.5,8.5,8.5,,
56,2021.0,NIT Goa,7.61,9.95,8.5,7.56,7.0,,5.83
116,2023.0,NIT Sikkim,8.5,11.98,11.56,8.22,7.6,,6.98
36,2021.0,NIT Durgapur,,21.45,17.49,14.61,9.72,8.87,7.39


In [4]:
# Map the raw CSV headers to short snake_case names used by the rest of the
# notebook (the "(LPA)" unit suffix is dropped from the column names).
column_aliases = {
    'Year': 'year',
    'NIT Name': 'institute_name',
    'Overall Avg CTC (LPA)': 'overall_avg_ctc',
    'CSE Avg (LPA)': 'cse_avg_ctc',
    'ECE Avg (LPA)': 'ece_avg_ctc',
    'EE Avg (LPA)': 'ee_avg_ctc',
    'ME Avg (LPA)': 'me_avg_ctc',
    'Chemical Avg (LPA)': 'chemical_avg_ctc',
    'Civil Avg (LPA)': 'civil_avg_ctc',
}

# Rename in place so df_nit keeps referring to the same frame.
df_nit.rename(columns=column_aliases, inplace=True)

In [5]:
# show the first five rows (head's default) to check the renamed columns
df_nit.head()

Unnamed: 0,year,institute_name,overall_avg_ctc,cse_avg_ctc,ece_avg_ctc,ee_avg_ctc,me_avg_ctc,chemical_avg_ctc,civil_avg_ctc
0,2020.0,NIT Trichy,12.0,12.93,12.93,9.5,9.16,12.93,
1,2020.0,NIT Surathkal,19.66,19.66,16.25,16.25,,16.25,16.25
2,2020.0,NIT Rourkela,8.0,13.8,10.6,13.62,,9.36,10.5
3,2020.0,NIT Warangal,14.35,18.55,9.4,22.0,,,
4,2020.0,NIT Calicut,17.68,17.68,12.11,12.11,8.0,10.68,12.0


In [6]:
# print each column's non-null count and dtype to see where values are missing
df_nit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              150 non-null    float64
 1   institute_name    150 non-null    object 
 2   overall_avg_ctc   141 non-null    float64
 3   cse_avg_ctc       142 non-null    float64
 4   ece_avg_ctc       142 non-null    float64
 5   ee_avg_ctc        140 non-null    float64
 6   me_avg_ctc        115 non-null    float64
 7   chemical_avg_ctc  63 non-null     float64
 8   civil_avg_ctc     140 non-null    float64
dtypes: float64(8), object(1)
memory usage: 11.0+ KB


In [7]:
# Store year as a nullable integer: the column was read as float64 and still
# contains missing entries at this point, which the nullable Int64 dtype allows.
nullable_int = pd.Int64Dtype()
df_nit['year'] = df_nit['year'].astype(nullable_int)

In [8]:
# remove leading/trailing whitespace from institute_name
stripped_names = df_nit['institute_name'].str.strip()
df_nit['institute_name'] = stripped_names

In [9]:
# drop rows in which every column is missing (rows with only some gaps are kept)
df_nit.dropna(axis='index', how='all', inplace=True)

In [10]:
# count nulls per column, then report only the columns that still have gaps
missing_values = df_nit.isna().sum()
has_gaps = missing_values > 0
print(missing_values[has_gaps])

overall_avg_ctc      9
cse_avg_ctc          8
ece_avg_ctc          8
ee_avg_ctc          10
me_avg_ctc          35
chemical_avg_ctc    87
civil_avg_ctc       10
dtype: int64


In [41]:
# fill missing values in overall_avg_ctc

# Step 1: borrow the nearest known value from the same institute.
# ffill/bfill walk the rows in order, so the rows are ordered by year first
# (stable sort, ties keep file order); otherwise "previous"/"next" would mean
# position in the file rather than the previous/next year.
# The transformed Series keeps the original index labels, so the assignment
# aligns back onto df_nit without changing its row order.
df_nit['overall_avg_ctc'] = (
    df_nit.sort_values('year', kind='mergesort')
          .groupby('institute_name')['overall_avg_ctc']
          .transform(lambda x: x.ffill().bfill())
)

# Step 2: institutes with no value in any year fall back to the mean of the
# whole column.
mean_overall = df_nit['overall_avg_ctc'].mean()
df_nit['overall_avg_ctc'] = df_nit['overall_avg_ctc'].fillna(mean_overall)


In [14]:
# confirm no gaps remain in overall_avg_ctc
overall_nulls_left = df_nit['overall_avg_ctc'].isna().sum()
print("Missing values (overall_avg_ctc):", overall_nulls_left)

Missing values (overall_avg_ctc): 0


In [None]:
# filling missing values (cse_avg_ctc)

# Step 1: reuse the nearest known CSE value from the same college.
# ffill/bfill follow row order, so the rows are ordered by year first (stable
# sort, ties keep file order) to make "nearest" mean the nearest year rather
# than the nearest row in the file. The result keeps the original index
# labels, so the assignment aligns back onto df_nit without reordering it.
df_nit['cse_avg_ctc'] = (
    df_nit.sort_values('year', kind='mergesort')
          .groupby('institute_name')['cse_avg_ctc']
          .transform(lambda x: x.ffill().bfill())
)

# Step 2: colleges with no CSE value in any year are estimated as
# overall_avg_ctc * (mean CSE-to-overall ratio over the rows that have both).
cse_ratio = (df_nit['cse_avg_ctc'] / df_nit['overall_avg_ctc']).mean()
print("Average CSE to Overall ratio:", cse_ratio)

df_nit['cse_avg_ctc'] = df_nit['cse_avg_ctc'].fillna(
    df_nit['overall_avg_ctc'] * cse_ratio
)


Average CSE to Overall ratio: 1.2661728018330707


In [17]:
# confirm no gaps remain in cse_avg_ctc
cse_nulls_left = df_nit['cse_avg_ctc'].isna().sum()
print("Remaining nulls in cse_avg_ctc:", cse_nulls_left)


Remaining nulls in cse_avg_ctc: 0


In [None]:
# filling missing values  (ece_avg_ctc)

# Step 1: forward/backward fill within each institute.
# ffill/bfill follow row order, so the rows are ordered by year first (stable
# sort, ties keep file order) so that the fill uses the previous/next year
# rather than the previous/next row in the file. The result keeps the original
# index labels, so the assignment aligns back onto df_nit without reordering it.
df_nit['ece_avg_ctc'] = (
    df_nit.sort_values('year', kind='mergesort')
          .groupby('institute_name')['ece_avg_ctc']
          .transform(lambda x: x.ffill().bfill())
)

# Step 2: institutes with no ECE value in any year are estimated as
# overall_avg_ctc * (mean ECE-to-overall ratio over the rows that have both).
ece_ratio = (df_nit['ece_avg_ctc'] / df_nit['overall_avg_ctc']).mean()
print("Avg ECE to Overall Ratio:", ece_ratio)

df_nit['ece_avg_ctc'] = df_nit['ece_avg_ctc'].fillna(
    df_nit['overall_avg_ctc'] * ece_ratio
)

Avg ECE to Overall Ratio: 1.1030218881161595


In [21]:
# number of gaps left in ece_avg_ctc (expected to be 0 after the fill above)
df_nit['ece_avg_ctc'].isna().sum()

np.int64(0)

In [22]:
# filling missing values  (ee_avg_ctc)

# Step 1: forward/backward fill within each institute.
# ffill/bfill follow row order, so the rows are ordered by year first (stable
# sort, ties keep file order) so that the fill uses the previous/next year
# rather than the previous/next row in the file. The result keeps the original
# index labels, so the assignment aligns back onto df_nit without reordering it.
df_nit['ee_avg_ctc'] = (
    df_nit.sort_values('year', kind='mergesort')
          .groupby('institute_name')['ee_avg_ctc']
          .transform(lambda x: x.ffill().bfill())
)

# Step 2: institutes with no EE value in any year are estimated as
# overall_avg_ctc * (mean EE-to-overall ratio over the rows that have both).
ee_ratio = (df_nit['ee_avg_ctc'] / df_nit['overall_avg_ctc']).mean()
print("EE to Overall Ratio:", ee_ratio)

df_nit['ee_avg_ctc'] = df_nit['ee_avg_ctc'].fillna(
    df_nit['overall_avg_ctc'] * ee_ratio
)


EE to Overall Ratio: 1.0063835764051374


In [23]:
# number of gaps left in ee_avg_ctc (expected to be 0 after the fill above)
df_nit["ee_avg_ctc"].isna().sum()

np.int64(0)

In [None]:
# filling missing values (me_avg_ctc)

# Step 1: forward/backward fill within each institute.
# ffill/bfill follow row order, so the rows are ordered by year first (stable
# sort, ties keep file order) so that the fill uses the previous/next year
# rather than the previous/next row in the file. The result keeps the original
# index labels, so the assignment aligns back onto df_nit without reordering it.
df_nit['me_avg_ctc'] = (
    df_nit.sort_values('year', kind='mergesort')
          .groupby('institute_name')['me_avg_ctc']
          .transform(lambda x: x.ffill().bfill())
)

# Step 2: institutes with no ME value in any year are estimated as
# overall_avg_ctc * (mean ME-to-overall ratio over the rows that have both).
me_ratio = (df_nit['me_avg_ctc'] / df_nit['overall_avg_ctc']).mean()
print("ME to Overall Ratio:", me_ratio)

df_nit['me_avg_ctc'] = df_nit['me_avg_ctc'].fillna(
    df_nit['overall_avg_ctc'] * me_ratio
)


ME to Overall Ratio: 0.911323006964742


In [25]:
# number of gaps left in me_avg_ctc (expected to be 0 after the fill above)
df_nit['me_avg_ctc'].isna().sum()

np.int64(0)

In [None]:
# filling missing value (chemical_avg_ctc )

# Step 1: forward/backward fill within each institute.
# ffill/bfill follow row order, so the rows are ordered by year first (stable
# sort, ties keep file order) so that the fill uses the previous/next year
# rather than the previous/next row in the file. The result keeps the original
# index labels, so the assignment aligns back onto df_nit without reordering it.
df_nit['chemical_avg_ctc'] = (
    df_nit.sort_values('year', kind='mergesort')
          .groupby('institute_name')['chemical_avg_ctc']
          .transform(lambda x: x.ffill().bfill())
)

# Step 2: institutes with no Chemical value in any year are estimated as
# overall_avg_ctc * (mean Chemical-to-overall ratio over the rows that have both).
chem_ratio = (df_nit['chemical_avg_ctc'] / df_nit['overall_avg_ctc']).mean()
print("Chemical to Overall Ratio:", chem_ratio)

df_nit['chemical_avg_ctc'] = df_nit['chemical_avg_ctc'].fillna(
    df_nit['overall_avg_ctc'] * chem_ratio
)

Chemical to Overall Ratio: 0.8791434481379429


In [35]:
# number of gaps left in chemical_avg_ctc (expected to be 0 after the fill above)
df_nit['chemical_avg_ctc'].isna().sum()

np.int64(0)

In [32]:
# filling missing values (civil_avg_ctc)

# Step 1: forward/backward fill within each institute.
# ffill/bfill follow row order, so the rows are ordered by year first (stable
# sort, ties keep file order) so that the fill uses the previous/next year
# rather than the previous/next row in the file. The result keeps the original
# index labels, so the assignment aligns back onto df_nit without reordering it.
df_nit['civil_avg_ctc'] = (
    df_nit.sort_values('year', kind='mergesort')
          .groupby('institute_name')['civil_avg_ctc']
          .transform(lambda x: x.ffill().bfill())
)

# Step 2: institutes with no Civil value in any year are estimated as
# overall_avg_ctc * (mean Civil-to-overall ratio over the rows that have both).
civil_ratio = (df_nit['civil_avg_ctc'] / df_nit['overall_avg_ctc']).mean()
print("Civil to Overall Ratio:", civil_ratio)

df_nit['civil_avg_ctc'] = df_nit['civil_avg_ctc'].fillna(
    df_nit['overall_avg_ctc'] * civil_ratio
)


Civil to Overall Ratio: 0.8363961814842825


In [34]:
# per-column count of missing values across the whole frame
df_nit.isna().sum()

year                0
institute_name      0
overall_avg_ctc     0
cse_avg_ctc         0
ece_avg_ctc         0
ee_avg_ctc          0
me_avg_ctc          0
chemical_avg_ctc    0
civil_avg_ctc       0
dtype: int64

In [36]:
# order rows by year, then by institute name, and renumber the index from 0
sort_keys = ['year', 'institute_name']
df_sorted = df_nit.sort_values(by=sort_keys)
df_nit = df_sorted.reset_index(drop=True)

In [37]:
# show the first ten rows of the frame
df_nit.head(10)

Unnamed: 0,year,institute_name,overall_avg_ctc,cse_avg_ctc,ece_avg_ctc,ee_avg_ctc,me_avg_ctc,chemical_avg_ctc,civil_avg_ctc
0,2020,IIEST Shibpur,7.3,9.5,16.5,8.0,7.3,6.417747,5.75
1,2020,MNIT Jaipur,9.1,18.39,9.1,9.94,13.2,10.35,6.0
2,2020,MNNIT Allahabad,17.19,17.68,11.0,17.68,17.68,9.0,6.54
3,2020,NIT Agartala,6.95,14.71,13.83,15.0,6.5,7.0,6.95
4,2020,NIT Andhra Pradesh,6.54,10.7,6.29,6.54,6.0,8.13,6.54
5,2020,NIT Arunachal Pradesh,6.2,7.2,7.2,7.2,7.2,8.77,7.2
6,2020,NIT Calicut,17.68,17.68,12.11,12.11,8.0,10.68,12.0
7,2020,NIT Delhi,8.8,11.2,9.0,6.4,8.8,7.736462,15.59
8,2020,NIT Durgapur,8.0,21.45,17.49,14.61,9.72,8.87,8.0
9,2020,NIT Goa,10.87,9.95,8.5,7.56,6.63,9.556289,5.83


In [38]:
# (row count, column count) of the frame
df_nit.shape

(150, 9)

In [40]:
# export the cleaned nit placement csv
# NOTE(review): absolute, machine-specific path; it only resolves where this
# exact folder layout exists.
cleaned_csv_path = Path('/Users/anuragchaubey/smart-college-recommender/data/cleaned/nit_placement_cleaned.csv')

# to_csv does not create missing folders, so make sure the "cleaned" folder
# exists first. Only that last folder is created; a missing parent data folder
# still raises instead of silently building a whole directory tree.
cleaned_csv_path.parent.mkdir(exist_ok=True)

# index=False leaves the row index out of the file
df_nit.to_csv(cleaned_csv_path, index=False)