# data preprocessing (meta data of colleges)

In [160]:
import numpy as np
import pandas as pd

In [161]:
df = pd.read_csv("/Users/anuragchaubey/smart-college-recommender/data/college_metadata.csv")
print("shape of data : ",df.shape)
print("columns of data : ",df.columns)
df.sample(5)

shape of data :  (83, 10)
columns of data :  Index(['institute_name', 'institute_type', 'city', 'state', 'city_tier',
       'total_fee_4yr', 'total_fee_5yr', 'hostel_available', 'campus_area',
       'infra_rating'],
      dtype='object')


Unnamed: 0,institute_name,institute_type,city,state,city_tier,total_fee_4yr,total_fee_5yr,hostel_available,campus_area,infra_rating
79,IIIT Bhagalpur,IIIT,Bhagalpur,Bihar,3.0,1345000.0,,Y,50,4.3
16,IIT Palakkad,IIT,Palakkad,Kerala,3.0,905960.0,1000000.0,Y,500.0,4.0
63,IIITV,IIIT,Vadodara,Gujarat,2.0,1580000.0,,N,under construction,4.4
61,IIITS,IIIT,Sri City,Andhra Pradesh,3.0,1775000.0,,N,80,4.4
11,IIT Patna,IIT,Patna,Bihar,2.0,999250.0,1000000.0,Y,501.0,4.2


In [162]:
# column name cleaning
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [163]:
df.columns

Index(['institute_name', 'institute_type', 'city', 'state', 'city_tier',
       'total_fee_4yr', 'total_fee_5yr', 'hostel_available', 'campus_area',
       'infra_rating'],
      dtype='object')

In [164]:
# missing values & "under construction" handling in campus_area column
df.replace(['under construction', '-', '–', '—', ' '], pd.NA, inplace=True)

In [165]:
df.isnull().sum()

institute_name       2
institute_type       2
city                 2
state                2
city_tier            2
total_fee_4yr        2
total_fee_5yr       27
hostel_available     2
campus_area         11
infra_rating         2
dtype: int64

In [166]:
# delete the rows which are totally empty
df.dropna(how='all', inplace=True)


In [167]:
df.isnull().sum()

institute_name       0
institute_type       0
city                 0
state                0
city_tier            0
total_fee_4yr        0
total_fee_5yr       25
hostel_available     0
campus_area          9
infra_rating         0
dtype: int64

In [168]:
df.shape

(81, 10)

In [169]:
# strip extraspaces from text columns
text_cols = ['institute_name', 'institute_type', 'city', 'state']
for col in text_cols:
    df[col] = df[col].str.strip()


## handling missing values 

In [170]:
# campus_area column

# create a new column campus_status
df['campus_status'] = df['campus_area'].apply(lambda x: 'Under Construction' if pd.isna(x) else 'Available')
df['campus_status'] = df['campus_status'].astype('category')

# fill campus_area values with 0
df['campus_area'].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['campus_area'].fillna(0,inplace=True)


In [171]:
# total_fee_5yr column

# add a new column to check availability of 5 yr program
df['offers_5yr_program'] = df['total_fee_5yr'].notna()
df['offers_5yr_program'] = df['offers_5yr_program'].astype(bool)

# fill total_fee_5yr values with 0
df['total_fee_5yr'].fillna(0,inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_fee_5yr'].fillna(0,inplace=True)


In [172]:
# convert given columns to their respective data types

# float type
df['campus_area'] = pd.to_numeric(df['campus_area'], errors='coerce')

# bool type
df['hostel_available'] = df['hostel_available'].map({'Y': True, 'N': False})

# int type
df['total_fee_4yr'] = df['total_fee_4yr'].astype(int)
df['city_tier'] = df['city_tier'].astype(int)

# category type
df['institute_type'] = df['institute_type'].astype('category')

In [173]:
df.isnull().sum()

institute_name        0
institute_type        0
city                  0
state                 0
city_tier             0
total_fee_4yr         0
total_fee_5yr         0
hostel_available      0
campus_area           0
infra_rating          0
campus_status         0
offers_5yr_program    0
dtype: int64

In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 0 to 82
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   institute_name      81 non-null     object  
 1   institute_type      81 non-null     category
 2   city                81 non-null     object  
 3   state               81 non-null     object  
 4   city_tier           81 non-null     int64   
 5   total_fee_4yr       81 non-null     int64   
 6   total_fee_5yr       81 non-null     float64 
 7   hostel_available    81 non-null     bool    
 8   campus_area         81 non-null     float64 
 9   infra_rating        81 non-null     float64 
 10  campus_status       81 non-null     category
 11  offers_5yr_program  81 non-null     bool    
dtypes: bool(2), category(2), float64(3), int64(2), object(3)
memory usage: 6.3+ KB


In [176]:
df.to_csv('/Users/anuragchaubey/smart-college-recommender/data/cleaned/metadata_cleaned.csv', index=False)
