# Priprava podatkov

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the Web of Science export. The relative path "wos.xls" is resolved
# against the kernel's current working directory.
df = pd.read_excel("wos.xls")
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,Article Title,Source Title,...,Web of Science Index,Research Areas,IDS Number,Pubmed Id,Open Access Designations,Highly Cited Status,Hot Paper Status,Date of Export,UT (Unique WOS ID),Web of Science Record
0,J,"Wintzer-Wehekind, L; Moulis, L; Camus, M; Vanb...",,,,"Wintzer-Wehekind, Leonard; Moulis, Lionel; Cam...",,,Prospective assessment of learning curve and i...,BMC MEDICAL EDUCATION,...,Science Citation Index Expanded (SCI-EXPANDED)...,Education & Educational Research,Z9B6K,40069701.0,,,,2025-03-24,WOS:001441776100001,0
1,J,"Shiferaw, M; O'Hagan, KG; Weinstein, M",,,,"Shiferaw, Menbere; O'Hagan, Kaitlyn G.; Weinst...",,,Staying Put: Positive Spillovers on Teacher Re...,EDUCATION AND URBAN SOCIETY,...,Social Science Citation Index (SSCI),Education & Educational Research; Urban Studies,W0C2A,,,,,2025-03-24,WOS:001415355100001,0
2,J,"Li, AY; Hu, XD",,,,"Li, Amy Y.; Hu, Xiaodan",,,Goodbye Performance-Based Funding: Policy Aban...,RESEARCH IN HIGHER EDUCATION,...,Social Science Citation Index (SSCI),Education & Educational Research,O3L9U,,,,,2025-03-24,WOS:001370197800001,0
3,J,"Creps, R; Islem, S; Zeng, BR; Boatman, A; Sama...",,,,"Creps, Ryan; Islem, Shadman; Zeng, Bingran; Bo...",,,Tech Equity: a Survival Analysis of an Undergr...,INNOVATIVE HIGHER EDUCATION,...,Emerging Sources Citation Index (ESCI),Education & Educational Research,T4M0A,,,,,2025-03-24,WOS:001404753700001,0
4,J,"Corradi, B; Espinosa, D; Rodríguez, C; Espinoz...",,,,"Corradi, B.; Espinosa, D.; Rodriguez, C.; Espi...",,,Is Admission Enough? University Persistence of...,HIGHER EDUCATION POLICY,...,Social Science Citation Index (SSCI),Education & Educational Research,S0S3S,,,,,2025-03-24,WOS:001395427100001,0


In [3]:
import re
from collections import defaultdict

# Survival-analysis method families mapped to the regex fragments that identify
# them. Fragments are searched in lower-cased abstracts, so they must themselves
# be written in lower case.
methods = {
    'Kaplan-Meier': ['kaplan[- ]meier', 'km curve'],
    'Cox Proportional Hazards': ['cox proportional hazards', 'cox regression', 'cox model'],
    'Discrete-Time Survival': ['discrete[- ]time survival', 'logistic hazard'],
    'Competing Risks': ['competing risks', 'cause[- ]specific hazard'],
    'Event History Analysis': ['event history analysis'],
    'Hazard Model': ['hazard model', 'hazard function'],
    'Multilevel Survival': ['multilevel survival', 'hierarchical survival'],
    'Machine Learning Survival': ['random survival forest', 'survival tree', 'deep survival', 'machine learning.*survival'],
    'Time-varying Covariates': ['time[- ]varying covariates', 'time[- ]dependent covariates'],
    'Parametric Models': ['weibull model', 'exponential survival', 'parametric survival'],
    'Dyadic Survival': ['dyadic survival'],
}

# Keywords indicating model success or impact.
# NOTE: these are tested as plain substrings of the whole abstract, so
# 'significant' also matches "insignificant", and a keyword counts even when it
# appears in a sentence unrelated to the matched method.
success_keywords = ['significant', 'improve', 'accurate', 'robust', 'validated', 'predictive power', 'fit well', 'low error']

# One compiled regex per method: the alternation of its fragments. A single
# search per (abstract, method) pair therefore counts a method at most once per
# abstract, no matter how many of its fragments occur. Each fragment is wrapped
# in a non-capturing group so a fragment containing '|' cannot leak into its
# neighbours.
method_regexes = {
    method: re.compile('|'.join(f'(?:{pattern})' for pattern in patterns))
    for method, patterns in methods.items()
}

# Store results
method_counts = defaultdict(int)          # method -> number of abstracts mentioning it
method_success_counts = defaultdict(int)  # method -> of those, abstracts with a success keyword

# Lowercase abstracts for matching (missing abstracts become empty strings)
df['Abstract_lower'] = df['Abstract'].fillna('').astype(str).str.lower()

# Scan each abstract
for abstract in df['Abstract_lower']:
    # Keep the order of `methods` so the dictionaries are filled in the same
    # order on every run.
    matched_methods = [
        method for method, regex in method_regexes.items() if regex.search(abstract)
    ]
    if not matched_methods:
        continue

    # The success check depends only on the abstract, so evaluate it once
    # instead of once per matched method.
    mentions_success = any(success_kw in abstract for success_kw in success_keywords)

    for method in matched_methods:
        method_counts[method] += 1
        if mentions_success:
            method_success_counts[method] += 1

# Convert to DataFrame for easy viewing. Only methods found in at least one
# abstract appear, so 'Article Count' is never zero in the ratio below.
results_df = pd.DataFrame({
    'Method': list(method_counts.keys()),
    'Article Count': list(method_counts.values()),
    'Success Mentions': [method_success_counts[m] for m in method_counts],
})

# Add a success ratio column
results_df['Success Ratio'] = results_df['Success Mentions'] / results_df['Article Count']

# Sort by count; a stable sort keeps methods with equal counts in a
# reproducible (first-encountered) order.
results_df = (
    results_df
    .sort_values(by='Article Count', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)

# Last expression of the cell: rendered with the notebook's rich table display.
results_df

                       Method  Article Count  Success Mentions  Success Ratio
0      Event History Analysis             51                17       0.333333
1    Cox Proportional Hazards             36                18       0.500000
2                Hazard Model             27                13       0.481481
3                Kaplan-Meier             27                16       0.592593
4      Discrete-Time Survival             20                 9       0.450000
5             Competing Risks             16                 6       0.375000
6           Parametric Models              4                 4       1.000000
7   Machine Learning Survival              3                 1       0.333333
8     Time-varying Covariates              2                 1       0.500000
9             Dyadic Survival              1                 0       0.000000
10        Multilevel Survival              1                 0       0.000000
