**Obesity research in specialty journals from 2000 to 2023: A bibliometric analysis**

In [None]:
import os
import pandas as pd
import numpy as np
import logging
from collections import defaultdict
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive
drive.flush_and_unmount()  # Drop any existing mount first, then mount again below
drive.mount('/content/drive')

# Drive folder that holds the input CSV; the export step later in this notebook writes here too
data_path = '/content/drive/My Drive/DATASETS/OBESITY.JOURNALS/'

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
# Read the merged, filtered records from the Drive folder and preview the first rows
file_name = os.path.join(data_path, 'merged_results_filtered.csv')
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,id,funders,abstract,category_bra,category_for,category_hra,category_hrcs_hc,category_rcdc,category_sdg,category_uoa,...,pages,type,year,journal.id,journal.title,volume,issue,authors_count,concepts_scores,issn
0,pub.1000391299,,IntroductionIrisin is a myokine secreted from ...,"[{'id': '4001', 'name': 'Clinical Medicine and...","[{'id': '80003', 'name': '32 Biomedical and Cl...","[{'id': '3901', 'name': 'Clinical'}]","[{'id': '906', 'name': 'Metabolic and endocrin...","[{'id': '612', 'name': 'Physical Activity'}, {...",,"[{'id': '30024', 'name': 'C24 Sport and Exerci...",...,15-20,article,2016.0,jour.1155510,Obesity Medicine,1.0,,2,"[{'concept': 'sedentary young women', 'relevan...",24518476
1,pub.1007273132,"[{'acronym': 'ESE', 'city_name': 'Bristol', 'c...","Hormones encoded by the ghrelin gene, GHRL, re...","[{'id': '4000', 'name': 'Basic Science'}]","[{'id': '80051', 'name': '3208 Medical Physiol...",,"[{'id': '894', 'name': 'Cardiovascular'}, {'id...","[{'id': '507', 'name': 'Clinical Research'}, {...",,"[{'id': '30001', 'name': 'A01 Clinical Medicin...",...,1-3,article,2017.0,jour.1155510,Obesity Medicine,5.0,,5,"[{'concept': 'ghrelin gene expression', 'relev...",24518476
2,pub.1007962492,,PurposeThe aim of this study was to clarify th...,"[{'id': '4001', 'name': 'Clinical Medicine and...","[{'id': '80003', 'name': '32 Biomedical and Cl...","[{'id': '3901', 'name': 'Clinical'}]","[{'id': '906', 'name': 'Metabolic and endocrin...","[{'id': '438', 'name': 'Diabetes'}, {'id': '38...",,"[{'id': '30002', 'name': 'A02 Public Health, H...",...,1-5,article,2016.0,jour.1155510,Obesity Medicine,1.0,,6,"[{'concept': 'type 2 diabetic patients', 'rele...",24518476
3,pub.1009717273,"[{'acronym': 'CNPq', 'city_name': 'Brasília', ...",AimsConsidering the protective role of adipone...,"[{'id': '4001', 'name': 'Clinical Medicine and...","[{'id': '80056', 'name': '3213 Paediatrics'}, ...",,"[{'id': '906', 'name': 'Metabolic and endocrin...","[{'id': '389', 'name': 'Obesity'}, {'id': '308...",,"[{'id': '30003', 'name': 'A03 Allied Health Pr...",...,4-10,article,2017.0,jour.1155510,Obesity Medicine,5.0,,13,"[{'concept': 'biomarkers of inflammation', 're...",24518476
4,pub.1012242667,"[{'acronym': 'EC', 'city_name': 'Brussels', 'c...",BackgroundThe relation between area-level soci...,"[{'id': '4003', 'name': 'Public Health'}]","[{'id': '80003', 'name': '32 Biomedical and Cl...","[{'id': '3903', 'name': 'Population & Society'}]","[{'id': '906', 'name': 'Metabolic and endocrin...","[{'id': '389', 'name': 'Obesity'}, {'id': '558...",,"[{'id': '30003', 'name': 'A03 Allied Health Pr...",...,13-18,article,2016.0,jour.1155510,Obesity Medicine,2.0,,5,[{'concept': 'area-level socio-economic status...,24518476


In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30061 entries, 0 to 30060
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    30061 non-null  object 
 1   funders               15962 non-null  object 
 2   abstract              30061 non-null  object 
 3   category_bra          24511 non-null  object 
 4   category_for          30057 non-null  object 
 5   category_hra          23646 non-null  object 
 6   category_hrcs_hc      21569 non-null  object 
 7   category_rcdc         29610 non-null  object 
 8   category_sdg          5844 non-null   object 
 9   category_uoa          30042 non-null  object 
 10  category_hrcs_rac     14285 non-null  object 
 11  category_icrp_cso     3628 non-null   object 
 12  category_icrp_ct      5293 non-null   object 
 13  recent_citations      30061 non-null  float64
 14  reference_ids         29562 non-null  object 
 15  concepts           

In [None]:
# Configure logging at INFO level with timestamped messages.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers - confirm that INFO messages actually show up in this runtime.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Publication-year windows as label -> (start_year, end_year).
# The last entry spans the whole study period, so it overlaps the other three.
periods = {
    '2000-2007': (2000, 2007),
    '2008-2015': (2008, 2015),
    '2016-2023': (2016, 2023),
    '2000-2023': (2000, 2023)
}

**Calculate Documents per Journal by Period**

In [None]:
def calculate_documents_per_period(df, periods):
    """Tally how many documents each journal published in every period.

    Args:
        df: DataFrame providing a 'year' column and a 'journal.title' column.
        periods: Mapping of period label -> (start_year, end_year). Both
            bounds are inclusive.

    Returns:
        dict of period label -> DataFrame with the columns 'Journal' and
        'Number_of_Documents', sorted by document count, largest first.
        If processing a period raises, the error is logged and that period
        is absent from the returned dict.
    """
    results = {}

    for period_name, (start_year, end_year) in periods.items():
        try:
            # Rows whose year lies inside the inclusive window (missing years never match)
            in_period = df['year'].between(start_year, end_year)

            # One row per journal holding its document count
            doc_counts = df[in_period].groupby('journal.title').size().reset_index()
            doc_counts.columns = ['Journal', 'Number_of_Documents']
            doc_counts = doc_counts.sort_values('Number_of_Documents', ascending=False)

            results[period_name] = doc_counts
            logger.info(f"Calculated document counts for period {period_name}: {len(doc_counts)} journals")

        except Exception as e:
            # Best effort: report the failing period and carry on with the next one
            logger.error(f"Error calculating documents for period {period_name}: {str(e)}")

    return results

# Tally documents per journal for every configured period
period_results = calculate_documents_per_period(df, periods)

# Preview the ten journals with the most documents in each period
for period in period_results:
    result_df = period_results[period]
    print(f"\n=== {period} ===", result_df.head(10), sep="\n")


=== 2000-2007 ===
                                Journal  Number_of_Documents
0      International Journal of Obesity                 2200
1                               Obesity                 1741
4                       Obesity Surgery                 1404
3                       Obesity Reviews                  280
5                     Pediatric Obesity                   63
2  Obesity Research & Clinical Practice                   32

=== 2008-2015 ===
                                 Journal  Number_of_Documents
6                                Obesity                 3044
11                       Obesity Surgery                 2309
4       International Journal of Obesity                 1953
9                        Obesity Reviews                  791
12                     Pediatric Obesity                  665
5                     Journal of Obesity                  494
8   Obesity Research & Clinical Practice                  464
7                          Obesity Fact

**Calculating Journals' h-index**

In [None]:
def calculate_h_index(citations_list):
    """Calculate the h-index for a collection of citation counts.

    The h-index is the largest number h such that h papers have at least
    h citations each.

    Args:
        citations_list: Any iterable of numeric citation counts (list, tuple,
            generator, NumPy array, pandas Series, ...), or None. Values are
            expected to be comparable numbers; callers should drop missing
            values beforehand.

    Returns:
        int: The h-index; 0 for None or an empty collection.
    """
    if citations_list is None:
        return 0

    # sorted() materialises any iterable, so array-likes and generators work
    # without relying on truthiness or len() (both fail for arrays/generators).
    citations_sorted = sorted(citations_list, reverse=True)

    h_index = 0
    for rank, citations in enumerate(citations_sorted, start=1):
        # rank is the 1-based number of papers considered so far
        if citations < rank:
            break
        h_index = rank

    return h_index

def calculate_journal_h_index(df):
    """Compute an h-index for every distinct journal title in ``df``.

    Args:
        df: DataFrame with a 'journal.title' column and a 'recent_citations'
            column holding the per-paper citation counts.
            NOTE(review): the counts come from 'recent_citations' - confirm
            this is the intended citation measure for an h-index.

    Returns:
        dict mapping journal title -> h-index. A journal whose computation
        raises is logged and recorded with an h-index of 0.
    """
    h_indices = {}
    titles = df['journal.title']

    for journal in titles.unique():
        try:
            raw_counts = df.loc[titles == journal, 'recent_citations'].tolist()

            # Keep only usable counts: drop missing and negative values
            usable_counts = [
                count for count in raw_counts
                if not pd.isna(count) and count >= 0
            ]

            h_indices[journal] = calculate_h_index(usable_counts)

        except Exception as e:
            logger.error(f"Error calculating h-index for journal {journal}: {str(e)}")
            h_indices[journal] = 0

    return h_indices

# Compute the h-index of every journal
journal_h_indices = calculate_journal_h_index(df)
print("H-indices calculated successfully!")

# List journals from highest to lowest h-index
for journal in sorted(journal_h_indices, key=journal_h_indices.get, reverse=True):
    h_idx = journal_h_indices[journal]
    print(f"{journal}: {h_idx}")

H-indices calculated successfully!
Obesity Reviews: 91
Obesity: 74
International Journal of Obesity: 67
Current Obesity Reports: 52
Obesity Surgery: 50
Obesity Facts: 36
Pediatric Obesity: 34
Journal of Obesity: 32
Obesity Research & Clinical Practice: 30
Childhood Obesity: 25
Clinical Obesity: 23
Obesity Science & Practice: 21
Obesity Medicine: 19
BMC Obesity: 15
The Open Obesity Journal: 4


**Calculating Journals' g-index**

In [None]:
def calculate_g_index(citations_list):
    """Calculate the g-index for a collection of citation counts.

    The g-index is the largest number g such that the g most-cited papers
    have together at least g**2 citations. As implemented here, g never
    exceeds the number of papers supplied.

    Args:
        citations_list: Any iterable of numeric citation counts (list, tuple,
            generator, NumPy array, pandas Series, ...), or None. Values are
            expected to be comparable numbers; callers should drop missing
            values beforehand.

    Returns:
        int: The g-index; 0 for None or an empty collection.
    """
    if citations_list is None:
        return 0

    # sorted() materialises any iterable, so array-likes and generators work
    # without relying on truthiness or len() (both fail for arrays/generators).
    citations_sorted = sorted(citations_list, reverse=True)

    g_index = 0
    cumulative_citations = 0
    for rank, citations in enumerate(citations_sorted, start=1):
        cumulative_citations += citations
        # Stopping at the first failure is safe: with counts in descending
        # order, every later paper adds fewer than `rank` citations, while
        # the threshold grows by more than 2 * rank per step.
        if cumulative_citations < rank ** 2:
            break
        g_index = rank

    return g_index

def calculate_journal_g_index(df):
    """Compute a g-index for every distinct journal title in ``df``.

    Args:
        df: DataFrame with a 'journal.title' column and a 'recent_citations'
            column holding the per-paper citation counts.
            NOTE(review): the counts come from 'recent_citations' - confirm
            this is the intended citation measure for a g-index.

    Returns:
        dict mapping journal title -> g-index. A journal whose computation
        raises is logged and recorded with a g-index of 0.
    """
    g_indices = {}
    titles = df['journal.title']

    for journal in titles.unique():
        try:
            raw_counts = df.loc[titles == journal, 'recent_citations'].tolist()

            # Keep only usable counts: drop missing and negative values
            usable_counts = [
                count for count in raw_counts
                if not pd.isna(count) and count >= 0
            ]

            g_indices[journal] = calculate_g_index(usable_counts)

        except Exception as e:
            logger.error(f"Error calculating g-index for journal {journal}: {str(e)}")
            g_indices[journal] = 0

    return g_indices

# Compute the g-index of every journal
journal_g_indices = calculate_journal_g_index(df)
print("G-indices calculated successfully!")

# List journals from highest to lowest g-index
for journal in sorted(journal_g_indices, key=journal_g_indices.get, reverse=True):
    g_idx = journal_g_indices[journal]
    print(f"{journal}: {g_idx}")

G-indices calculated successfully!
Obesity Reviews: 134
International Journal of Obesity: 104
Obesity: 103
Current Obesity Reports: 83
Obesity Surgery: 79
Obesity Facts: 66
Pediatric Obesity: 55
Journal of Obesity: 49
Obesity Research & Clinical Practice: 46
Clinical Obesity: 41
Childhood Obesity: 38
Obesity Science & Practice: 30
Obesity Medicine: 28
BMC Obesity: 23
The Open Obesity Journal: 6


**Journal Metrics Table**

In [None]:
def create_comprehensive_metrics_table(df, h_indices, g_indices):
    """Create one table with paper count, citation total, h- and g-index per journal.

    Args:
        df: DataFrame with the columns 'journal.title' and 'recent_citations'
            ('id' is only read when reporting an error).
        h_indices: dict mapping journal title -> h-index; missing titles count as 0.
        g_indices: dict mapping journal title -> g-index; missing titles count as 0.

    Returns:
        DataFrame with the columns 'Journal', 'Total_Papers', 'Total_Citations',
        'H_Index' and 'G_Index', sorted by 'Total_Papers' in descending order.
        'Total_Citations' is the rounded sum of 'recent_citations'. When no
        journal row could be built (e.g. ``df`` is empty) an empty frame with
        those columns is returned instead of raising.
    """
    columns = ['Journal', 'Total_Papers', 'Total_Citations', 'H_Index', 'G_Index']
    metrics_data = []

    for journal in df['journal.title'].unique():
        try:
            journal_papers = df[df['journal.title'] == journal]

            metrics_data.append({
                'Journal': journal,
                'Total_Papers': len(journal_papers),
                'Total_Citations': round(journal_papers['recent_citations'].sum()),
                'H_Index': h_indices.get(journal, 0),
                'G_Index': g_indices.get(journal, 0)
            })

        except Exception as e:
            logger.error(f"Error creating metrics for journal {journal}: {str(e)}")
            # Log a sample of the affected document IDs to help trace the problem
            error_ids = df[df['journal.title'] == journal]['id'].tolist()
            logger.error(f"Affected document IDs for {journal}: {error_ids[:10]}...")  # Log first 10 IDs

    # Explicit columns keep the sort below valid even when no rows were collected
    metrics_df = pd.DataFrame(metrics_data, columns=columns)
    metrics_df = metrics_df.sort_values('Total_Papers', ascending=False)

    return metrics_df

# Combine paper counts, citation totals and the h-/g-indices into one table per journal
comprehensive_metrics = create_comprehensive_metrics_table(df, journal_h_indices, journal_g_indices)
print("Comprehensive metrics table created!")
print(comprehensive_metrics)

Comprehensive metrics table created!
                                 Journal  Total_Papers  Total_Citations  \
14                       Obesity Surgery          7585            50236   
8                                Obesity          6854            61567   
13      International Journal of Obesity          6021            57042   
11                       Obesity Reviews          2146            51510   
5                      Pediatric Obesity          1483            10782   
7   Obesity Research & Clinical Practice          1075             6397   
10                         Obesity Facts           880             8798   
2                      Childhood Obesity           818             4862   
4                     Journal of Obesity           772             5971   
12                      Clinical Obesity           595             4131   
3             Obesity Science & Practice           568             3421   
1                Current Obesity Reports           511         

**Exporting Results**

In [None]:
def export_results_to_csv(period_results, comprehensive_metrics, output_dir=None):
    """Export the per-period document counts and the journal metrics table to CSV.

    One file named ``journal_documents_<period>.csv`` is written per period
    (hyphens in the period label become underscores), plus
    ``journal_comprehensive_metrics_2000_2023.csv`` for the metrics table.
    All files are written without the DataFrame index.

    Args:
        period_results: dict mapping period label -> DataFrame to export.
        comprehensive_metrics: DataFrame holding the journal metrics table.
        output_dir: Directory to write into. Defaults to the notebook-level
            ``data_path`` when omitted, which keeps existing calls working.

    Returns:
        None. Each written path is printed; any exception is logged and stops
        the remaining exports.
    """
    try:
        # Resolved inside the try so a missing notebook-level default is
        # logged like any other export failure.
        target_dir = data_path if output_dir is None else output_dir

        # Export period-specific document counts
        for period_name, period_df in period_results.items():
            filename = os.path.join(target_dir, f"journal_documents_{period_name.replace('-', '_')}.csv")
            period_df.to_csv(filename, index=False)
            print(f"Exported: {filename}")

        # Export comprehensive metrics
        comprehensive_filename = os.path.join(target_dir, "journal_comprehensive_metrics_2000_2023.csv")
        comprehensive_metrics.to_csv(comprehensive_filename, index=False)
        print(f"Exported: {comprehensive_filename}")

    except Exception as e:
        logger.error(f"Error exporting CSV files: {str(e)}")

# Write the per-period counts and the metrics table as CSV files
export_results_to_csv(period_results, comprehensive_metrics)

Exported: /content/drive/My Drive/DATASETS/OBESITY.JOURNALS/journal_documents_2000_2007.csv
Exported: /content/drive/My Drive/DATASETS/OBESITY.JOURNALS/journal_documents_2008_2015.csv
Exported: /content/drive/My Drive/DATASETS/OBESITY.JOURNALS/journal_documents_2016_2023.csv
Exported: /content/drive/My Drive/DATASETS/OBESITY.JOURNALS/journal_documents_2000_2023.csv
Exported: /content/drive/My Drive/DATASETS/OBESITY.JOURNALS/journal_comprehensive_metrics_2000_2023.csv
