In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 

In [2]:
#reading the ajman university scopus data 
df = pd.read_csv("ajman_university_scopus_data.csv") 
display(df)

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,Page count,Cited by,DOI,Link,Document Type,Publication Stage,Open Access,Source,EID
0,"Elhwuegi, A.S.",6507526363,Central monoamines and their role in major dep...,2004,Progress in Neuro-Psychopharmacology and Biolo...,28,3,,435,451,,394.0,10.1016/j.pnpbp.2003.11.018,https://www.scopus.com/inward/record.url?eid=2...,Review,Final,,Scopus,2-s2.0-1842861600
1,"Abualigah, L., Elaziz, M.A., Sumari, P., Geem,...",57190984712; 57195591068; 6602619198; 65066852...,Reptile Search Algorithm (RSA): A nature-inspi...,2022,Expert Systems with Applications,191,,116158,,,,364.0,10.1016/j.eswa.2021.116158,https://www.scopus.com/inward/record.url?eid=2...,Article,Final,,Scopus,2-s2.0-85120889442
2,"Kocarnik, J.M., Compton, K., Dean, F.E., Fu, W...",56041211800; 57205330541; 57219860968; 5721888...,"Cancer Incidence, Mortality, Years of Life Los...",2022,JAMA Oncology,8,3,,420,444,,255.0,10.1001/jamaoncol.2021.6987,https://www.scopus.com/inward/record.url?eid=2...,Article,Final,"All Open Access, Hybrid Gold, Green",Scopus,2-s2.0-85122535890
3,"Ahmed, H.E.A., Jaber, M.A., Abu Fanas, S.H., K...",24484523300; 7006492097; 56619823700; 24484509900,The pattern of maxillofacial fractures in Shar...,2004,"Oral Surgery, Oral Medicine, Oral Pathology, O...",98,2,,166,170,,186.0,10.1016/j.tripleo.2004.01.020,https://www.scopus.com/inward/record.url?eid=2...,Article,Final,,Scopus,2-s2.0-4043130359
4,"Aboelmaged, M.G.",57202057160,Predicting e-procurement adoption in a develop...,2010,Industrial Management and Data Systems,110,3,,392,414,,152.0,10.1108/02635571011030042,https://www.scopus.com/inward/record.url?eid=2...,Article,Final,,Scopus,2-s2.0-77949846015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,"Abuazab, H.H., El-Horbaty, E.-S.M.",6506410515; 16419408000,An approach on database index schemes,2003,Proceedings of the International Conference on...,2,,,581,587,,,,https://www.scopus.com/inward/record.url?eid=2...,Conference Paper,Final,,Scopus,2-s2.0-1642296526
3034,"Abou-Elnour, A., Abo-Elnor, O.",6701553440; 6602164471,Determination of the influence of device struc...,2003,Proceedings of SPIE - The International Societ...,5113,,,463,474,,,10.1117/12.497035,https://www.scopus.com/inward/record.url?eid=2...,Conference Paper,Final,,Scopus,2-s2.0-0041322846
3035,"Al-Hussain, S.M.",56127704800,Some neuronal species differences: The granule...,2002,Journal of the Bahrain Medical Society,14,3,,91,95,,,,https://www.scopus.com/inward/record.url?eid=2...,Article,Final,,Scopus,2-s2.0-0036665949
3036,"Abou-Elnour, A., Abo-Elnor, O.",6701553440; 6602164471,Impact of alloy composition on the noise behav...,2001,Proceedings of SPIE - The International Societ...,4490,,,152,159,,,10.1117/12.455420,https://www.scopus.com/inward/record.url?eid=2...,Conference Paper,Final,,Scopus,2-s2.0-0035767561


#### Visualizations we can implement 
1. total citations per year 
2. search by DOI feature, fetch the information for a provided DOI 
3. breakdown by 'Document Type' (Pie chart) 
4. Breakdown by 'Publication Stage'

In [3]:
#fetching the list of columns 
df.columns.tolist()

['Authors',
 'Author(s) ID',
 'Title',
 'Year',
 'Source title',
 'Volume',
 'Issue',
 'Art. No.',
 'Page start',
 'Page end',
 'Page count',
 'Cited by',
 'DOI',
 'Link',
 'Document Type',
 'Publication Stage',
 'Open Access',
 'Source',
 'EID']

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3038 entries, 0 to 3037
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Authors            3038 non-null   object 
 1   Author(s) ID       3038 non-null   object 
 2   Title              3038 non-null   object 
 3   Year               3038 non-null   int64  
 4   Source title       3038 non-null   object 
 5   Volume             2673 non-null   object 
 6   Issue              1988 non-null   object 
 7   Art. No.           1271 non-null   object 
 8   Page start         1750 non-null   object 
 9   Page end           1739 non-null   object 
 10  Page count         1 non-null      float64
 11  Cited by           2036 non-null   float64
 12  DOI                2808 non-null   object 
 13  Link               3038 non-null   object 
 14  Document Type      3038 non-null   object 
 15  Publication Stage  3038 non-null   object 
 16  Open Access        1520 

In [5]:
#replacing the null values under Cited By with zero
df['Cited by'].replace(np.nan,0,inplace=True)

In [6]:
#converting the float64 Cited by attribute to an integer attribute 
df['Cited by'] = df['Cited by'].astype("int64")


In [7]:
#finding the total citations 
df['Cited by'].sum()

20603

In [29]:
def citation_range(startyear=None, endyear=None, df=None):
    new_df = df.copy() 
    min_year = new_df['Year'].min()
    print(min_year) 
    max_year = new_df['Year'].max() 
    print(max_year)
    if startyear < min_year: 
        startyear = min_year
    if endyear > max_year: 
        endyear = max_year
    if startyear is not None and endyear is not None: 
        query = new_df[(new_df.Year >= startyear) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None: 
        query = new_df[(new_df.Year >= min_year) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None:
        query = new_df[(new_df.Year >= start_year) & (new_df.Year <= max_year)]
    grouping = query.groupby('Year').sum()
    grouping.drop(['Page count'],axis=1,inplace=True)
    grouping = grouping.reset_index()
    fig = px.line(grouping,x='Year',y='Cited by', title='# of Citations per Year', markers = True)
    fig.update_layout(
        xaxis = dict(
            tickmode = 'linear',
            dtick = 1
        )
    )
    return fig

In [30]:
#let's test the function 
fig = citation_range(startyear=1996,endyear=2022,df=df)
fig.show()

1996
2024



The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [65]:
#Breakdown by document type 
def document_type_range(startyear=None,endyear=None,df=None):
    new_df = df.copy() 
    min_year = new_df['Year'].min()
    print(min_year) 
    max_year = new_df['Year'].max() 
    print(max_year)
    if startyear < min_year: 
        startyear = min_year
    elif endyear > max_year: 
        endyear = max_year
    if startyear is not None and endyear is not None: 
        query = new_df[(new_df.Year >= startyear) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None: 
        query = new_df[(new_df.Year >= min_year) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None:
        query = new_df[(new_df.Year >= start_year) & (new_df.Year <= max_year)]
    #pivoting 
    pivoted = query.pivot_table(index=['Document Type'],aggfunc='size')
    reset_pivoted = pivoted.reset_index()
    reset_pivoted.columns = reset_pivoted.columns.map(str)
    reset_pivoted = reset_pivoted.rename(columns={'Document Type':'Document Type','0':'count'})
    reset_pivoted  
    fig = px.pie(reset_pivoted,values='count',names='Document Type',title='Breakdown by Document type: '+str(startyear)+' - '+str(endyear)) 
    return fig 

In [66]:
fig = document_type_range(startyear=2023,endyear=2024,df=df)
fig.show()

1996
2024


In [100]:
#Breakdown by document type by citations 
def document_type_citations_range(startyear=None,endyear=None,df=None):
    new_df = df.copy() 
    min_year = new_df['Year'].min()
    print(min_year) 
    max_year = new_df['Year'].max() 
    print(max_year)
    if startyear < min_year: 
        startyear = min_year
    if endyear > max_year: 
        endyear = max_year
    if startyear is not None and endyear is not None: 
        query = new_df[(new_df.Year >= startyear) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None: 
        query = new_df[(new_df.Year >= min_year) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None:
        query = new_df[(new_df.Year >= start_year) & (new_df.Year <= max_year)]
    #pivoting 
    fig = px.pie(query,values='Cited by',names='Document Type',title='Citations by Document Type: '+str(startyear)+' - '+str(endyear)) 
    return fig 

In [101]:
fig = document_type_citations_range(startyear=2005,endyear=2015,df=df)
fig.show()

1996
2024


In [98]:
#Breakdown by document type BAR
def document_type_range_BAR(startyear=None,endyear=None,df=None):
    new_df = df.copy() 
    min_year = new_df['Year'].min()
    print(min_year) 
    max_year = new_df['Year'].max() 
    print(max_year)
    if startyear < min_year: 
        startyear = min_year
    if endyear > max_year: 
        endyear = max_year
    if startyear is not None and endyear is not None: 
        query = new_df[(new_df.Year >= startyear) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None: 
        query = new_df[(new_df.Year >= min_year) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None:
        query = new_df[(new_df.Year >= start_year) & (new_df.Year <= max_year)]
    #pivoting 
    pivoted = query.pivot_table(index=['Document Type'],aggfunc='size')
    reset_pivoted = pivoted.reset_index()
    reset_pivoted.columns = reset_pivoted.columns.map(str)
    reset_pivoted = reset_pivoted.rename(columns={'Document Type':'Document Type','0':'count'})
    reset_pivoted = reset_pivoted.sort_values('count',ascending=False) 
    fig = px.bar(reset_pivoted,y='count',x='Document Type',title='Breakdown by Document type: '+str(startyear)+' - '+str(endyear)) 
    return fig 

In [99]:
fig = document_type_range_BAR(startyear=2010,endyear=2012,df=df)
fig.show()

1996
2024


In [96]:
#Breakdown by document type by citations 
def document_type_citations_range_BAR(startyear=None,endyear=None,df=None):
    new_df = df.copy() 
    min_year = new_df['Year'].min()
    print(min_year) 
    max_year = new_df['Year'].max() 
    print(max_year)
    if startyear < min_year: 
        startyear = min_year
    if endyear > max_year: 
        endyear = max_year
    if startyear is not None and endyear is not None: 
        query = new_df[(new_df.Year >= startyear) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None: 
        query = new_df[(new_df.Year >= min_year) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None:
        query = new_df[(new_df.Year >= start_year) & (new_df.Year <= max_year)]
    query = query.sort_values('Cited by',ascending=False) 
    fig = px.histogram(query,y='Cited by',x='Document Type',title='Citations by Document Type: '+str(startyear)+' - '+str(endyear)) 
    return fig 

In [97]:
fig = document_type_citations_range_BAR(startyear=1990,endyear=2050,df=df)
fig.show()

1996
2024


In [94]:
#Breakdown by document type 
def publication_stage_range(startyear=None,endyear=None,df=None):
    new_df = df.copy() 
    min_year = new_df['Year'].min()
    print(min_year) 
    max_year = new_df['Year'].max() 
    print(max_year)
    if startyear < min_year: 
        startyear = min_year
    if endyear > max_year: 
        endyear = max_year
    if startyear is not None and endyear is not None: 
        query = new_df[(new_df.Year >= startyear) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None: 
        query = new_df[(new_df.Year >= min_year) & (new_df.Year <= endyear)]
    elif startyear is None and endyear is not None:
        query = new_df[(new_df.Year >= start_year) & (new_df.Year <= max_year)]
    #pivoting 
    pivoted = query.pivot_table(index=['Publication Stage'],aggfunc='size')
    reset_pivoted = pivoted.reset_index()
    reset_pivoted.columns = reset_pivoted.columns.map(str)
    reset_pivoted = reset_pivoted.rename(columns={'Publication Stage':'Publication Stage','0':'count'})
    reset_pivoted  
    fig = px.pie(reset_pivoted,values='count',names='Publication Stage',title='Breakdown by Publication Stage: '+str(startyear)+' - '+str(endyear)) 
    return fig 

In [110]:
fig = publication_stage_range(startyear=2023,endyear=2024,df=df)
fig.show()

1996
2024
