In [None]:
import pandas as pd

# 1.1 Load the first dataset: university rankings (cwur.csv)
CWUR_URL = "https://raw.githubusercontent.com/caifand/DMRL_THA/master/FinalPaper/0_data_preprocessing/0_raw/cwur.csv"
urank = pd.read_csv(CWUR_URL)

# Sanity-check the import: first and last ten rows, column names, (rows, columns)
for preview in (urank.head(10), urank.tail(10), list(urank.columns), urank.shape):
    print(preview)

In [None]:
# 1.2 Keep only the institution name and the citation indicator, with
# lowercase column names.
# rename() is chained onto the column selection (instead of inplace=True on
# the selected subset) so the result is an independent frame and pandas does
# not emit SettingWithCopyWarning.
cite_rank = urank[['Institution', 'Citations']].rename(
    columns={'Institution': 'institution', 'Citations': 'citation'}
)

print(cite_rank.shape)
print(cite_rank.head(5))

In [None]:
# 1.3 Check null values and data types
# Only the last expression of a cell is displayed, so the per-column null
# counts and dtypes are combined into a single summary table rather than
# written as two bare expressions (the first of which would be discarded).
pd.DataFrame({'n_missing': cite_rank.isna().sum(), 'dtype': cite_rank.dtypes})

In [None]:
# Transform data types
# Casting 'citation' straight to int64 fails with
#   "ValueError: invalid literal for int() with base 10: '> 1000'"
# because some observations store the citation indicator as the string
# "> 1000", which cannot be converted to an integer.

# Normalize those values to '1000' so the column can be cast afterwards
cite_rank = cite_rank.replace(to_replace='> 1000', value='1000')

In [None]:
# Cast the citation indicator to a 64-bit integer, then show the column types
cite_rank = cite_rank.astype({'citation': 'int64'})
cite_rank.dtypes

In [None]:
# 2.1 Load the second dataset: open access repositories (doar.csv)
DOAR_URL = 'https://raw.githubusercontent.com/caifand/DMRL_THA/master/FinalPaper/0_data_preprocessing/0_raw/doar.csv'
oa_repo = pd.read_csv(DOAR_URL)

# Preview the first five rows and the (rows, columns) shape
for preview in (oa_repo.head(5), oa_repo.shape):
    print(preview)

In [None]:
# 2.2 Truncate needless columns

# Inspect the columns and how many values are missing in each of them
oa_repo_missing = oa_repo.isna().sum()
oa_repo_missing

In [None]:
# Restrict the repository table to the needed columns
repo_columns = ['preferred_name', 'year_established', 'preferred_org_name', 'date_created']
oa_repo = oa_repo[repo_columns]
print(oa_repo.head(5))

In [None]:
# 2.3 Deal with missing values

# Parse 'date_created' as datetimes and extract its calendar year into 'date'.
# The dtype is spelled 'datetime64[ns]' because pandas >= 2.0 rejects the
# unit-less 'datetime64' with a TypeError.
# assign() builds a new frame, so nothing is written into the column subset
# selected earlier (which would emit SettingWithCopyWarning).
oa_repo = oa_repo.assign(
    date_created=oa_repo['date_created'].astype('datetime64[ns]'),
    date=lambda frame: frame['date_created'].dt.year,
)

# Show the resulting column types; a bare expression is only displayed when
# it is the last statement of the cell.
oa_repo.dtypes

In [None]:
# Replace missing values in 'year_established' with the corresponding values
# from the 'date' column.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column pulled out of the frame modifies a temporary object and leaves
# the frame unchanged under pandas Copy-on-Write (the default from pandas 3.0).
oa_repo = oa_repo.assign(
    year_established=oa_repo['year_established'].fillna(oa_repo['date'])
)

print(oa_repo.head(5))

In [None]:
# 2.4 Add new values

# Age of each OA repository: years between 'year_established' and 2019
REPO_REFERENCE_YEAR = 2019
oa_repo['age'] = REPO_REFERENCE_YEAR - oa_repo['year_established']
print(oa_repo.head(5))

In [None]:
# Keep only the organisation name and the repository age, renamed to
# 'institution' and 'repo_age'.
# rename() is chained (no inplace=True) so the renamed frame is a new object
# rather than an in-place edit of a column subset, which triggers
# SettingWithCopyWarning.
oa_repo = oa_repo[['preferred_org_name', 'age']].rename(
    columns={'preferred_org_name': 'institution', 'age': 'repo_age'}
)
print(oa_repo.head(5))

In [None]:
# 3.1 Load the third dataset: open access mandates and policies (roarmap.csv)
ROARMAP_URL = "https://raw.githubusercontent.com/caifand/DMRL_THA/master/FinalPaper/0_data_preprocessing/0_raw/roarmap.csv"
oap = pd.read_csv(ROARMAP_URL)

print(oap.shape)
# Number of missing values per column
oap.isna().sum()

In [None]:
# 3.2 Drop needless columns
# NOTE(review): 'policy_effecive' is spelled exactly as in the original code;
# presumably it matches the CSV header -- confirm before "correcting" it.
policy_columns = ['policymaker_name', 'policy_adoption', 'policy_effecive']
oap = oap[policy_columns]
print(oap.head(5))

In [None]:
# 3.3 Deal with missing values
# Convert both policy date columns to datetimes.
# The dtype is spelled 'datetime64[ns]' because pandas >= 2.0 rejects the
# unit-less 'datetime64' with a TypeError. The dict form of astype() returns
# a new frame, so no column is written into the subset selected earlier.
oap = oap.astype({
    'policy_effecive': 'datetime64[ns]',
    'policy_adoption': 'datetime64[ns]',
})
oap.dtypes

In [None]:
# Replace missing 'policy_effecive' dates with the 'policy_adoption' date,
# then extract the calendar year into 'year_effective'.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column pulled out of the frame modifies a temporary object and leaves
# the frame unchanged under pandas Copy-on-Write (the default from pandas 3.0).
oap = oap.assign(
    policy_effecive=oap['policy_effecive'].fillna(oap['policy_adoption']),
    # Evaluated after the line above, so it sees the filled dates.
    year_effective=lambda frame: frame['policy_effecive'].dt.year,
)
print(oap.head(5))

In [None]:
# 3.4 Calculate new variable values

# Years between the policy's effective year and 2019
POLICY_REFERENCE_YEAR = 2019
oap['effective_year'] = POLICY_REFERENCE_YEAR - oap['year_effective']
print(oap.head(5))

In [None]:
# Keep only the policymaker name and the policy age, renamed to
# 'institution' and 'policy_year'.
# rename() is chained (no inplace=True) so the renamed frame is a new object
# rather than an in-place edit of a column subset, which triggers
# SettingWithCopyWarning.
oap = oap[['policymaker_name', 'effective_year']].rename(
    columns={'policymaker_name': 'institution', 'effective_year': 'policy_year'}
)
print(oap.head(5))

In [None]:
# 4.1 Merge datasets
# Inner-join the citation table with the repository table on 'institution'
cite_repo = pd.merge(cite_rank, oa_repo, how='inner', on='institution')
cite_repo

In [None]:
# Group by institution and keep the first entry within each group, leaving
# 'institution' as a regular column (as_index=False).
# NOTE(review): GroupBy.first() takes the first non-null value per column by
# default, which is not necessarily one intact source row -- confirm intended.
cite_repo = cite_repo.groupby('institution', as_index=False).first()

In [None]:
# Inner-join the third dataset (oap) onto the merged frame on 'institution'
repo_p = pd.merge(cite_repo, oap, how='inner', on='institution')
repo_p


In [None]:
# Drop rows that still contain missing values.
# dropna() returns a new frame, so the result is assigned back; without the
# assignment the rows with NaN stay in repo_p and are written to the export.
repo_p = repo_p.dropna()
repo_p

In [None]:
# 5.1 Export the intermediate data to CSV, leaving out the row index
OUTPUT_CSV = "repo_p.csv"
repo_p.to_csv(OUTPUT_CSV, index=False)