In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [32]:
# Read the raw scholarship records from the CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='scholarshipdatabase.csv')

# Preview the leading rows (head() defaults to five) as a quick sanity check.
df.head(n=5)

Unnamed: 0,Name,Area of specialisation,Link,Image/Pdf,End Date,End Date 2,Country,Level needed,Link2,Action item,Deadline month
0,University of Newcastle scholarship,"All disciplines, Computer science, Management,...",,,,,Australia,"BSc, MSC, PhD",,,
1,Texas A &M,"3d printing, Concrete, civil engineering",,Scholarship%20Database%200f820c6d3ec745ac96004...,,,USA,,,,
2,Warwick university,"Agricultural science, Agriculture, Bioscience,...",https://warwick.ac.uk/fac/cross_fac/mibtp/pgst...,,20-Jan,,UK,PhD,,,Jan
3,Warwick university,Plant science,https://warwick.ac.uk/fac/cross_fac/mibtp/pgst...,,20-Jan,,UK,PhD,,,Jan
4,University of Gronigen,"Environmental science, Social sciences",https://www.rug.nl/about-ug/work-with-us/job-o...,,16-Jan,,Netherlands,PhD,,,Jan


In [33]:
## Explore the data

# Summarise the schema: column names, non-null counts, dtypes, memory usage.
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

# Count missing values per column to see which fields need cleaning.
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    699 non-null    object
 1   Area of specialisation  690 non-null    object
 2   Link                    541 non-null    object
 3   Image/Pdf               278 non-null    object
 4   End Date                443 non-null    object
 5   End Date 2              55 non-null     object
 6   Country                 650 non-null    object
 7   Level needed            653 non-null    object
 8   Link2                   120 non-null    object
 9   Action item             42 non-null     object
 10  Deadline month          443 non-null    object
dtypes: object(11)
memory usage: 60.6+ KB
None
Name                        5
Area of specialisation     14
Link                      163
Image/Pdf                 426
End Date                  261
End Date 2                649
Country                    54
Level needed               51
Link2                     584
Action item               662
Deadline month            261
dtype: int64

In [34]:
# Keep only the descriptive columns used in the rest of the notebook.
features = [
    'Name',
    'Area of specialisation',
    'Country',
    'Level needed',
]
data = df.loc[:, features]

data.head(n=5)

Unnamed: 0,Name,Area of specialisation,Country,Level needed
0,University of Newcastle scholarship,"All disciplines, Computer science, Management,...",Australia,"BSc, MSC, PhD"
1,Texas A &M,"3d printing, Concrete, civil engineering",USA,
2,Warwick university,"Agricultural science, Agriculture, Bioscience,...",UK,PhD
3,Warwick university,Plant science,UK,PhD
4,University of Gronigen,"Environmental science, Social sciences",Netherlands,PhD


In [35]:
# Expand every multi-valued column so each row carries exactly one value per
# column. The columns are exploded one after another, so a single source
# record becomes one row per (area, country, level) combination.
MULTI_VALUE_COLUMNS = ['Area of specialisation', 'Country', 'Level needed']

for column in MULTI_VALUE_COLUMNS:
    # Cells are split on the literal ', ' separator; NaN cells pass through
    # split/explode unchanged and are still NaN afterwards.
    data = data.assign(**{column: data[column].str.split(', ')}).explode(column)

# explode() repeats the source row's index label for every generated row, so
# rebuild a unique 0..n-1 index.
data = data.reset_index(drop=True)

# Only the last expression of a cell is rendered: report the exploded size.
data.shape

(3829, 4)

In [36]:
# Missing-value audit: count the NaN cells in each column of `data`.
data.isna().sum()

Name                        5
Area of specialisation     17
Country                   329
Level needed              281
dtype: int64

In [37]:
# Discard rows that have no scholarship name, then confirm the new row count.
data = data.dropna(subset=['Name'])
data.shape

(3824, 4)

In [38]:
# Re-count the missing values per column after removing the unnamed rows.
data.isna().sum()

Name                        0
Area of specialisation     12
Country                   324
Level needed              276
dtype: int64

In [39]:
# Replace the remaining missing values with empty strings rather than
# dropping the rows, so records with a blank field are kept in the table.
data = data.fillna('')

data.head()

Unnamed: 0,Name,Area of specialisation,Country,Level needed
0,University of Newcastle scholarship,All disciplines,Australia,BSc
1,University of Newcastle scholarship,All disciplines,Australia,MSC
2,University of Newcastle scholarship,All disciplines,Australia,PhD
3,University of Newcastle scholarship,Computer science,Australia,BSc
4,University of Newcastle scholarship,Computer science,Australia,MSC


In [40]:
# Final check: every column should now report zero missing values.
data.isna().sum()

Name                      0
Area of specialisation    0
Country                   0
Level needed              0
dtype: int64

In [41]:
# Persist the cleaned table; index=False leaves the row index out of the file.
data.to_csv(path_or_buf='scholarship_df.csv', index=False)