In [1]:
# Import required libraries
import numpy as np
import pandas as pd

In [2]:
# Read the university rankings CSV into a DataFrame
df = pd.read_csv('World University Ranking.csv')

# Preview the first five rows
df.head(5)

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [3]:
# Preview the last five rows
df.tail(5)

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
2195,996,University of the Algarve,Portugal,7,367,567,218,926,845,812,969.0,816,44.03,2015
2196,997,Alexandria University,Egypt,4,236,566,218,997,908,645,981.0,871,44.03,2015
2197,998,Federal University of Ceará,Brazil,18,367,549,218,830,823,812,975.0,824,44.03,2015
2198,999,University of A Coruña,Spain,40,367,567,218,886,974,812,975.0,651,44.02,2015
2199,1000,China Pharmaceutical University,China,83,367,567,218,861,991,812,981.0,547,44.02,2015


In [4]:
# Report the dataset dimensions; df.shape is (row count, column count)

print("Total rows in dataset are", df.shape[0])
print("Total columns in dataset are", df.shape[1])

Total rows in dataset are 2200
Total columns in dataset are 14


In [5]:
# Initial data exploration: print the index range, each column's non-null
# count and dtype, and the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB


In [6]:
# Summary statistics for the numeric columns: count, mean, std, min,
# the 25% / 50% / 75% quantiles and max (eight statistics per column)
df.describe()

Unnamed: 0,world_rank,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2000.0,2200.0,2200.0,2200.0
mean,459.590909,40.278182,275.100455,357.116818,178.888182,459.908636,459.797727,413.417273,496.6995,433.346364,47.798395,2014.318182
std,304.320363,51.74087,121.9351,186.779252,64.050885,303.760352,303.331822,264.366549,286.919755,273.996525,7.760806,0.76213
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,43.36,2012.0
25%,175.75,6.0,175.75,175.75,175.75,175.75,175.75,161.0,250.5,170.75,44.46,2014.0
50%,450.5,21.0,355.0,450.5,210.0,450.5,450.5,406.0,496.0,426.0,45.1,2014.0
75%,725.25,49.0,367.0,478.0,218.0,725.0,725.25,645.0,741.0,714.25,47.545,2015.0
max,1000.0,229.0,367.0,567.0,218.0,1000.0,991.0,812.0,1000.0,871.0,100.0,2015.0


In [7]:
# Count the missing (NaN) entries in each column
df.isna().sum()

world_rank                0
institution               0
country                   0
national_rank             0
quality_of_education      0
alumni_employment         0
quality_of_faculty        0
publications              0
influence                 0
citations                 0
broad_impact            200
patents                   0
score                     0
year                      0
dtype: int64

In [8]:
# Count fully duplicated rows (a result of 0 means every row is distinct)
df.duplicated().sum()

0

In [9]:
# Standardize column names in three steps:
#   1. strip()           - drop leading/trailing spaces, if any
#   2. title()           - convert the names to Title Case
#   3. replace('_', ' ') - turn underscores into spaces
df.columns = (
    df.columns
    .str.strip()
    .str.title()
    .str.replace('_', ' ')
)

df.columns

Index(['World Rank', 'Institution', 'Country', 'National Rank',
       'Quality Of Education', 'Alumni Employment', 'Quality Of Faculty',
       'Publications', 'Influence', 'Citations', 'Broad Impact', 'Patents',
       'Score', 'Year'],
      dtype='object')

In [10]:
# List the distinct country names in alphabetical order to spot inconsistent
# spellings (de-duplicate first so only the distinct names get sorted)
df['Country'].drop_duplicates().sort_values().to_numpy()

array(['Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil',
       'Bulgaria', 'Canada', 'Chile', 'China', 'Colombia', 'Croatia',
       'Cyprus', 'Czech Republic', 'Denmark', 'Egypt', 'Estonia',
       'Finland', 'France', 'Germany', 'Greece', 'Hong Kong', 'Hungary',
       'Iceland', 'India', 'Iran', 'Ireland', 'Israel', 'Italy', 'Japan',
       'Lebanon', 'Lithuania', 'Malaysia', 'Mexico', 'Netherlands',
       'New Zealand', 'Norway', 'Poland', 'Portugal', 'Puerto Rico',
       'Romania', 'Russia', 'Saudi Arabia', 'Serbia', 'Singapore',
       'Slovak Republic', 'Slovenia', 'South Africa', 'South Korea',
       'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey',
       'USA', 'Uganda', 'United Arab Emirates', 'United Kingdom',
       'Uruguay'], dtype=object)

In [11]:
# Handle inconsistent country names: spell out 'USA' in full.
# A dict mapping is used so further corrections can be added as extra
# key/value pairs; the two-argument scalar form
# .replace('USA', 'United States of America') is equivalent here.
country_fixes = {'USA': 'United States of America'}
df['Country'] = df['Country'].replace(country_fixes)

# Confirm that 'USA' is gone and the full name is present
df['Country'].sort_values().unique()

array(['Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil',
       'Bulgaria', 'Canada', 'Chile', 'China', 'Colombia', 'Croatia',
       'Cyprus', 'Czech Republic', 'Denmark', 'Egypt', 'Estonia',
       'Finland', 'France', 'Germany', 'Greece', 'Hong Kong', 'Hungary',
       'Iceland', 'India', 'Iran', 'Ireland', 'Israel', 'Italy', 'Japan',
       'Lebanon', 'Lithuania', 'Malaysia', 'Mexico', 'Netherlands',
       'New Zealand', 'Norway', 'Poland', 'Portugal', 'Puerto Rico',
       'Romania', 'Russia', 'Saudi Arabia', 'Serbia', 'Singapore',
       'Slovak Republic', 'Slovenia', 'South Africa', 'South Korea',
       'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey',
       'Uganda', 'United Arab Emirates', 'United Kingdom',
       'United States of America', 'Uruguay'], dtype=object)

In [12]:
# Count the distinct countries represented in the data
country_count = df['Country'].nunique()
print("Total unique countries available in dataset are", country_count)

Total unique countries available in dataset are 59


In [13]:
# Check imbalance in categorical data (Country column): rows per country.
# value_counts() already returns the counts in descending order, so no
# additional sort is needed (a second, non-stable sort could only reshuffle
# countries that share the same count).
df['Country'].value_counts()

Country
United States of America    573
China                       167
Japan                       159
United Kingdom              144
Germany                     115
France                      109
Italy                        96
Spain                        81
Canada                       72
South Korea                  72
Australia                    58
Taiwan                       46
Brazil                       36
India                        31
Netherlands                  29
Switzerland                  26
Sweden                       24
Austria                      24
Israel                       22
Finland                      20
Turkey                       20
Belgium                      20
Poland                       18
Iran                         16
Ireland                      16
Portugal                     14
Greece                       14
Denmark                      12
Hungary                      12
New Zealand                  12
Norway                       12


In [14]:
# Handle missing values

# Count nulls per column once and show only the columns that have any
# (Broad Impact is the only one)
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])

# Mean imputation: fill every missing Broad Impact with the column average.
# NOTE(review): the 2012 rows shown by df.head() are among those missing
# Broad Impact, so the gaps may be tied to specific years rather than random,
# and a single mean assigns top-ranked universities a mid-table value -
# confirm this is acceptable for the analysis.
broad_impact_mean = df['Broad Impact'].mean()
df['Broad Impact'] = df['Broad Impact'].fillna(broad_impact_mean)

# Recheck: every column should now report zero missing values
df.isnull().sum()

Broad Impact    200
dtype: int64


World Rank              0
Institution             0
Country                 0
National Rank           0
Quality Of Education    0
Alumni Employment       0
Quality Of Faculty      0
Publications            0
Influence               0
Citations               0
Broad Impact            0
Patents                 0
Score                   0
Year                    0
dtype: int64

In [15]:
# Summarise the World Rank column: number of distinct values and numeric range
world_rank = df['World Rank']
print("Total unique world ranks are", world_rank.nunique(dropna=False))

print("Lowest rank is", world_rank.min())  # numerically smallest rank: 1
print("Highest rank is", world_rank.max())  # numerically largest rank: 1000

Total unique world ranks are 1000
Lowest rank is 1
Highest rank is 1000


In [16]:
# Create new feature called 'Rank Group' based on world rank

def new_rank_group(rank):
    """Map a world rank to a coarse rank-group label.

    Parameters
    ----------
    rank : number
        World rank to classify.

    Returns
    -------
    str
        'Top 100' when rank <= 100, 'Good' when 100 < rank <= 500,
        'Need Improvement' when 500 < rank <= 700, and 'Bottom 300'
        for anything else.
    """
    # Upper bounds in ascending order; the first bound that contains the
    # rank decides the label.
    bands = (
        (100, 'Top 100'),
        (500, 'Good'),
        (700, 'Need Improvement'),
    )
    for upper_bound, label in bands:
        if rank <= upper_bound:
            return label
    # Nothing matched, i.e. the rank is not <= 700.
    return 'Bottom 300'

# Label every row with its rank group, then display the resulting frame
df['Rank Group'] = df['World Rank'].map(new_rank_group)
df

Unnamed: 0,World Rank,Institution,Country,National Rank,Quality Of Education,Alumni Employment,Quality Of Faculty,Publications,Influence,Citations,Broad Impact,Patents,Score,Year,Rank Group
0,1,Harvard University,United States of America,1,7,9,1,1,1,1,496.6995,5,100.00,2012,Top 100
1,2,Massachusetts Institute of Technology,United States of America,2,9,17,3,12,4,4,496.6995,1,91.67,2012,Top 100
2,3,Stanford University,United States of America,3,17,11,5,4,2,2,496.6995,15,89.50,2012,Top 100
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,496.6995,50,86.17,2012,Top 100
4,5,California Institute of Technology,United States of America,4,2,29,7,37,22,22,496.6995,18,85.21,2012,Top 100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,996,University of the Algarve,Portugal,7,367,567,218,926,845,812,969.0000,816,44.03,2015,Bottom 300
2196,997,Alexandria University,Egypt,4,236,566,218,997,908,645,981.0000,871,44.03,2015,Bottom 300
2197,998,Federal University of Ceará,Brazil,18,367,549,218,830,823,812,975.0000,824,44.03,2015,Bottom 300
2198,999,University of A Coruña,Spain,40,367,567,218,886,974,812,975.0000,651,44.02,2015,Bottom 300


In [17]:
# Sanity check: the frame should now list 15 columns, none with missing
# values, including the new Rank Group column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   World Rank            2200 non-null   int64  
 1   Institution           2200 non-null   object 
 2   Country               2200 non-null   object 
 3   National Rank         2200 non-null   int64  
 4   Quality Of Education  2200 non-null   int64  
 5   Alumni Employment     2200 non-null   int64  
 6   Quality Of Faculty    2200 non-null   int64  
 7   Publications          2200 non-null   int64  
 8   Influence             2200 non-null   int64  
 9   Citations             2200 non-null   int64  
 10  Broad Impact          2200 non-null   float64
 11  Patents               2200 non-null   int64  
 12  Score                 2200 non-null   float64
 13  Year                  2200 non-null   int64  
 14  Rank Group            2200 non-null   object 
dtypes: float64(2), int64(