# Day 2 - Familiarise with scholarly data

## Import the libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## Load ROR dataset

In [None]:
# Load the ROR data dump (file v1.25, dated 2023-05-11) into a DataFrame.
# The path is relative, so it resolves against the current working directory.
ror_csv_path = '../data/raw/v1/v1.25-2023-05-11-ror-data.csv'
ror = pd.read_csv(ror_csv_path)

In [None]:
# Preview the first five rows to get a feel for the available columns.
ror.head(n=5)

In [None]:
# Print a summary of the table: column names, non-null counts and dtypes.
ror.info()

In [None]:
# Count entries per country name (most frequent first) and show the top 20.
entries_per_country = ror['country.country_name'].value_counts()
entries_per_country.head(20)

In [None]:
# Bar chart of the 20 country names with the most entries.
top_20_countries = ror['country.country_name'].value_counts().head(20)
top_20_countries.plot.bar()

In [None]:
# Count entries per city name (most frequent first) and show the top 20.
entries_per_city = ror['addresses[0].geonames_city.name'].value_counts()
entries_per_city.head(20)

In [None]:
# Bar chart of the 40 city names with the most entries.
top_40_cities = ror['addresses[0].geonames_city.name'].value_counts().head(40)
top_40_cities.plot.bar()

In [None]:
# Keep only the rows whose city name is exactly 'Pisa'.
is_in_pisa = ror['addresses[0].geonames_city.name'] == 'Pisa'
ror[is_in_pisa]

In [None]:
# List the distinct values that occur in the 'types' column.
type_column = ror['types']
type_column.unique()

In [None]:
# Pie chart of how the entries are split across the 'types' values.
entries_per_type = ror['types'].value_counts()
entries_per_type.plot.pie()

In [None]:
# Compare the mix of 'types' values across six countries, one pie chart each,
# laid out on a 2x3 grid.
fig, axs = plt.subplots(figsize=(16, 8), ncols=3, nrows=2)

# (grid position, ISO country code, label shown next to the pie)
pie_panels = [
    ((0, 0), 'IT', 'Italy'),
    ((0, 1), 'GB', 'United Kingdom'),
    ((0, 2), 'US', 'United States'),
    ((1, 0), 'IN', 'India'),
    ((1, 1), 'JP', 'Japan'),
    ((1, 2), 'CN', 'China'),  # was misspelled 'Çhina'
]

for grid_position, country_code, country_label in pie_panels:
    entries_in_country = ror[ror['country.country_code'] == country_code]
    entries_in_country['types'].value_counts().plot.pie(
        ax=axs[grid_position], label=country_label
    )

In [None]:
# Number of entries (non-null 'id' values) per (country code, type) pair.
# 'id' is selected before counting so that only that column is counted,
# instead of counting every column and discarding all but one.
ror.groupby(['country.country_code', 'types'])['id'].count()

In [None]:
# Build a country x type table of entry counts: one row per country code,
# one column per 'types' value. Combinations that never occur become NaN.
# 'id' is selected before counting so that only that column is counted.
entries_by_country_and_type = ror.groupby(['country.country_code', 'types'])['id'].count()
country_composition = entries_by_country_and_type.unstack()
country_composition

In [None]:
# Add a per-country total across all type columns.
# An already-present 'Total' column is excluded from the sum, so re-running
# this cell does not add the previous total to itself.
type_columns_only = country_composition.drop(columns='Total', errors='ignore')
country_composition['Total'] = type_columns_only.sum(axis=1)
country_composition

In [None]:
# Order the countries from the largest 'Total' to the smallest.
country_composition = country_composition.sort_values(by='Total', ascending=False)
country_composition

In [None]:
# Stacked bar chart of the type breakdown for the first 50 rows of
# country_composition. 'Total' is dropped so it is not stacked on top
# of its own components.
fig, axs = plt.subplots(figsize=(16, 8))
first_50_countries = country_composition.drop(columns='Total').head(50)
first_50_countries.plot.bar(stacked=True, ax=axs)

In [None]:
# Stacked bar chart of the type breakdown for a hand-picked set of countries,
# drawn in the order listed here.
selected_country_codes = ['IT', 'ES', 'DE', 'FR', 'GB', 'RU', 'IN', 'CN', 'JP']

fig, axs = plt.subplots(figsize=(16, 8))
selected_composition = country_composition.loc[selected_country_codes].drop(columns='Total')
selected_composition.plot.bar(stacked=True, ax=axs)

In [None]:
import geopandas

# Build a GeoDataFrame of point geometries from the coordinate columns.
lat_column = 'addresses[0].lat'
lng_column = 'addresses[0].lng'

# Rows without a latitude or a longitude cannot be placed on a map.
df2 = ror.dropna(subset=[lat_column, lng_column])
# Rows with latitude exactly 0 are discarded as well.
# NOTE(review): only latitude is checked, not longitude — presumably 0 is a
# placeholder for "unknown"; confirm this is the intended filter.
df2 = df2[df2[lat_column] != 0]

# points_from_xy expects x (longitude) first, then y (latitude).
# EPSG:4326 declares the coordinates as WGS84 latitude/longitude.
entry_points = geopandas.points_from_xy(df2[lng_column], df2[lat_column])
gdf = geopandas.GeoDataFrame(df2, geometry=entry_points, crs='EPSG:4326')
gdf.head()

In [None]:
# Scatter every ROR entry on a blank canvas. The points are semi-transparent
# so that dense areas show up darker.
point_style = dict(color="black", alpha=0.3)

f, ax = plt.subplots(figsize=(16, 16))
gdf.plot(ax=ax, **point_style)
plt.title("ROR entries locations")
# The axes are deliberately left visible here (no plt.axis("off")).
plt.show()

In [None]:
import contextily as cx

# Same scatter of ROR entries, this time drawn over a contextily basemap.
point_style = dict(color="black", alpha=0.3)

f, ax = plt.subplots(figsize=(16, 16))
gdf.plot(ax=ax, **point_style)

# Passing the GeoDataFrame's CRS tells contextily which CRS the axes use.
cx.add_basemap(ax, crs=gdf.crs)

plt.title("ROR entries locations")
plt.axis("off")
plt.show()

In [None]:
# Map of ROR entries with one colour per 'types' value, over a basemap.
f, ax = plt.subplots(figsize=(16, 16))

type_colours = ['red', 'yellow', 'green', 'teal', 'blue', 'magenta', 'pink', 'brown', 'orange']

# The loop variable is deliberately not called `type`: that would shadow the
# builtin for every later cell of the notebook.
# zip stops at the shorter sequence, so at most nine types are drawn.
for org_type, colour in zip(gdf['types'].unique(), type_colours):
    gdf[gdf['types'] == org_type].plot(
        ax=ax, color=colour, alpha=0.5, label=org_type
    )

# add a basemap of the region using contextily
cx.add_basemap(ax, crs=gdf.crs)
plt.title('ROR types')
plt.legend()
plt.axis('off')
plt.show()

In [None]:
# Smallest value found in the 'established' column.
ror['established'].min()

In [None]:
# Show the entries whose 'established' value is 712.
# NOTE(review): 712 is hard-coded — presumably the column minimum for this
# data file; re-check it if the dataset version changes.
established_is_712 = ror['established'] == 712
ror[established_is_712]

In [None]:
# Largest value found in the 'established' column.
ror['established'].max()

In [None]:
# Number of entries (non-null 'id' values) for each 'established' value.
grouped_by_established = ror.groupby('established')
grouped_by_established['id'].count()

In [None]:
# Number of entries (non-null 'id' values) per ('established', 'types') pair.
grouped_by_established_and_type = ror.groupby(['established', 'types'])
grouped_by_established_and_type['id'].count()

In [None]:
# One bar chart per 'types' value, showing entry counts for the last 200
# 'established' values (rows are the group keys in ascending order).
counts_by_established_and_type = ror.groupby(['established', 'types'])['id'].count()
established_by_type = counts_by_established_and_type.unstack()
established_by_type.tail(200).plot.bar(subplots=True, figsize=(50, 25))