# Scholarly Recommender 

In [382]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from collections import Counter

from Bio import Entrez
from Bio import Medline
from tqdm import tqdm

from itertools import combinations
import networkx as nx
from nxviz.plots import CircosPlot

# Data Scraping and Extraction

In [383]:
# Adapted from:
# https://towardsdatascience.com/network-analysis-to-quickly-get-insight-into-an-academic-field-with-python-cd891717d547
def entrez_pull_df(keyword, save = False, email = "awyeh450471@gmail.com", batch_size = 100):
    """
    Download every PubMed record matching `keyword` into a DataFrame.

    Parameters
    ----------
    keyword : str
        Search term passed to Entrez esearch; also used as the CSV file name.
    save : bool, default False
        When truthy, write the frame to ``../../data/<keyword>.csv``.
    email : str
        Contact address assigned to ``Entrez.email`` before any request.
    batch_size : int, default 100
        Number of PubMed ids requested per efetch call.

    Returns
    -------
    pandas.DataFrame
        One row per parsed MEDLINE record.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    Entrez.email = email

    # First search only reads the hit count so the second one can ask for all ids.
    handle = Entrez.esearch(db="pubmed", retmax=10, term=keyword)
    try:
        result = Entrez.read(handle)
    finally:
        handle.close()
    print(
        "Total number of publications that contain the term {}: {}".format(
            keyword, result["Count"]
        )
    )

    handle = Entrez.esearch(db="pubmed", retmax=result["Count"], term=keyword)
    try:
        result = Entrez.read(handle)
    finally:
        handle.close()

    ids = result["IdList"]

    # Slice width now follows batch_size (it was a hard-coded 100 before).
    batches = [ids[start: start + batch_size] for start in range(0, len(ids), batch_size)]

    record_list = []
    for batch in tqdm(batches):
        handle = Entrez.efetch(db="pubmed", id=batch, rettype="medline", retmode="text")
        try:
            # Medline.parse is consumed fully here, before the handle is closed.
            record_list.extend(list(Medline.parse(handle)))
        finally:
            handle.close()
    print("Complete.")

    df = pd.DataFrame(record_list)
    if save:
        df.to_csv(f'../../data/{keyword}.csv', index = False)
    return df

In [384]:
def local_pull_df(keyword):
    """Read the CSV stored at ``../../data/<keyword>.csv`` and return it as a DataFrame."""
    csv_path = f'../../data/{keyword}.csv'
    return pd.read_csv(csv_path, low_memory = False)

In [385]:
# Live PubMed pull for 'hay fever'; the second argument is `save`, so nothing is written to disk.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so this run
# did not finish assigning df_base.
df_base = entrez_pull_df('hay fever', False)

Total number of publications that contain the term hay fever: 15769


 62%|█████████████████████████▍               | 98/158 [02:12<01:21,  1.35s/it]


KeyboardInterrupt: 

In [None]:
# Pull the 'sprained ankle' records and, because save is True, write them to
# ../../data/sprained ankle.csv.
d1 = entrez_pull_df('sprained ankle', True)

In [None]:
# Load the 'hay fever' records from ../../data/hay fever.csv.
# NOTE(review): the only visible 'hay fever' pull passes save=False, so this CSV
# presumably comes from an earlier run — confirm it exists before Run All.
df_base = local_pull_df('hay fever')

# Data Preprocessing and Cleaning

| Column | Field | Column | Field |
|--------|-------|--------|-------|
|'AB'|Abstract|'JID'|NLM Unique ID|
|'AD'|Affiliation|'JT'|Journal Title|
|'AID'|Article Identifier|'LA'|Language|
|'AU'|Author|'LID'|Location Identifier|
|'AUID'|Author Identifier|'LR'|Date Last Revised|
|'BTI'|Book Title|'MHDA'|MeSH Date|
|'CDAT'| |'MID'|Manuscript Identifier|
|'CI'|Copyright Information|'MH'|MeSH Terms|
|'CIN'|Comment In|'OID'|Other ID|
|'CN'|Corporate Author|'OT'|Other Term|
|'COIS'|Conflict of Interest Statement| | |
|'CON'|Comment On|'OWN'|Owner|
|'CP'| |'PB'| |
|'CRDT'|Create Date|'PG'|Pagination|
|'CTDT'| |'PHST'|Publication History Status|
|'DA'|Date Created|'PL'|Place of Publication|
|'DCOM'|Date Completed|'PMC'|Pubmed Central Identifier|
|'DEP'|Date of Electronic Publication|'PMCR'|PubMed Central Release|
|'DP'|Date of Publication|'PMID'|PubMed Unique Identifier|
|'DRDT'| |'PS'|Personal Name as Subject|
|'ECI'|Expression of Concern|'PST'|Publication Status|
|'ED'|Editor Name|'PT'|Publication Type|
|'EFR'|Erratum For|'RF'|Number of References|
|'EIN'|Erratum in|'RIN'|Retraction In|
|'EDAT'|Entrez Date|'RN'|Registry Number|
|'FAU'|Full Author|'RPF'|Republished From|
|'FED'|Full Editor Name|'RPI'|Republished In|
|'FIR'|Full Investigator Name|'SB'|Subset|
|'FPS'|Full Personal Name as Subject|'SI'|Secondary Source ID|
|'GN'|General Note|'SO'|Source|
|'GR'|Grant Number|'STAT'|Status|
|'GS'|Gene Symbol|'TA'|Journal Title Abbreviation|
|'IP'|Issue|'TI'|Title|
|'IR'|Investigator Name|'TT'|Transliterated Title|
|'IS'|ISSN|'VI'|Volume|
|'ISBN'|ISBN| | |


In [None]:
# MEDLINE field tag -> readable column name, in the order the columns are kept.
column_names = {
    'TI': 'title',
    'AU': 'authors',
    'TA': 'journal',
    'EDAT': 'date',
    'AB': 'abstract',
    'MH': 'mesh_terms',
    'OT': 'other_terms',
    'RF': 'reference_number',
    'PL': 'location',
    'LA': 'language',
}
cols = list(column_names)

df = df_base[cols]
df.columns = list(column_names.values())

In [None]:
# Preview the first rows of the selected, renamed columns.
df.head()

In [None]:
# Take the 'other_terms' entry labelled 3 as a sample value to inspect and clean.
s = df['other_terms'][3]

In [None]:
# Display the raw sample value.
s

In [None]:
# Display the cleaned form of the sample value.
# NOTE(review): clean() is defined in a later cell of this notebook, so on a fresh
# kernel this cell raises NameError unless that cell has been run first.
clean(s)

In [None]:
def clean(x):
    """
    Normalise a list-like cell into a single space-separated string of tokens.

    `x` may be the string form of a list (e.g. "['Hay Fever', 'Asthma/therapy']"),
    a plain string, an actual list/tuple of items, or a missing value.
    Within each item spaces become underscores, '/' becomes a space, the text is
    lower-cased, and the characters ``* & , ' -`` are removed.

    Returns an empty string for None / NaN instead of raising.
    """
    punctuation = '*&,\'-'

    # Missing cells arrive as None or float NaN (NaN is the only float unequal to itself).
    if x is None or (isinstance(x, float) and x != x):
        return ''

    if isinstance(x, (list, tuple)):
        # Already a real sequence, so there is no list text to strip and split.
        words = [str(w) for w in x]
    else:
        words = x.strip('][').split('\', ')

    # Build the deletion table once instead of once per word.
    strip_table = str.maketrans('', '', punctuation)

    cleaned = []
    for w in words:
        if w == '' or w == ', ':
            continue
        w = w.replace(' ', '_').replace('/', ' ').lower()
        cleaned.append(w.translate(strip_table))
    return ' '.join(cleaned)

def year(x):
    """Convert the first four characters of `x` to an int and return it."""
    leading = x[0:4]
    return int(leading)

In [None]:
# Keep only rows that carry at least one kind of term (MeSH or other).
has_terms = df['mesh_terms'].notna() | df['other_terms'].notna()
df_clean = df[has_terms].reset_index(drop = True)

# Fill every text column that clean() is applied to, so it never receives NaN;
# a missing reference count becomes 0.
df_clean = df_clean.fillna({
    'title': '',
    'authors': '',
    'journal': '',
    'abstract': '',
    'mesh_terms': '',
    'other_terms': '',
    'reference_number': 0,
    'location': '',
    'language': '',
})

for text_col in ['authors', 'journal', 'location', 'language']:
    df_clean[text_col] = df_clean[text_col].apply(clean)
df_clean['year'] = df_clean['date'].apply(year)

# Join the two term strings with a space: plain concatenation fused the last
# MeSH term and the first other term into one token.
mesh_tokens = df_clean['mesh_terms'].apply(clean)
other_tokens = df_clean['other_terms'].apply(clean)
df_clean['terms'] = (mesh_tokens + ' ' + other_tokens).str.strip()

df_clean = df_clean.drop(columns = ['date', 'mesh_terms', 'other_terms'])

In [None]:
# Preview the first rows of the cleaned frame.
df_clean.head()

# Exploratory Data Analysis

In [None]:
def column_counter(col, data = None):
    """
    Count how often each whitespace-separated token occurs in a column.

    Parameters
    ----------
    col : str
        Name of a column whose values are strings of space-separated tokens.
    data : mapping or DataFrame, optional
        Source of the column; defaults to the notebook-level `df_clean`.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences. Empty strings are never counted.
    """
    if data is None:
        data = df_clean
    # split() with no argument collapses any run of whitespace, so adjacent empty
    # rows no longer produce '' tokens (a single replace('  ', ' ') missed those).
    return Counter(' '.join(data[col]).split())

In [None]:
# Each loop entry prints its heading, then computes and prints that column's ranking,
# so output order and the top10_* names match the straight-line version.
for heading, column in (
    ("Top 10 Authors", 'authors'),
    ("\nTop 10 Journals", 'journal'),
    ("\nTop 10 Locations", 'location'),
    ("\nTop 10 Languages", 'language'),
    ("\nTop 10 Terms", 'terms'),
):
    print(heading)
    ranking = column_counter(column).most_common(10)
    print(ranking)
    if column == 'authors':
        top10_authors = ranking
    elif column == 'journal':
        top10_journals = ranking
    elif column == 'location':
        top10_locations = ranking
    elif column == 'language':
        top10_languages = ranking
    else:
        top10_terms = ranking

In [None]:
def top10_plots(counter, title, filename = None):
    """
    Draw a horizontal bar chart of (label, count) pairs and save it as a PNG.

    Parameters
    ----------
    counter : list of (str, int)
        Pairs such as those returned by ``Counter.most_common``.
    title : str
        Chart title.
    filename : str, optional
        Output path. When omitted it is derived from `title`, so each chart gets
        its own file (every call used to overwrite 'top10_authors.png').
    """
    import re

    sns.set(context = 'poster', style = 'white')

    counts = [count for _, count in counter]
    labels = [label for label, _ in counter]
    # Display form of each label: underscores back to spaces, title-cased.
    tick_labels = [label.replace('_', ' ').title() for label in labels]

    if filename is None:
        slug = re.sub(r'[^a-z0-9]+', '_', title.lower()).strip('_')
        filename = f'{slug or "top10_plot"}.png'

    plt.figure(figsize=(20,10))
    ax = sns.barplot(x = counts, y = labels, orient = 'h', color = '#63d297')
    ax.set(yticklabels = tick_labels, title = title)
    sns.despine(top = True, right = True, bottom = False, left = False)
    plt.savefig(filename, dpi=300, bbox_inches='tight', transparent = True)


In [None]:
# Bar chart of the ten most frequent author tokens.
top10_plots(top10_authors, 'Top 10 Authors by Number of Submissions')

In [None]:
# Bar chart of the ten most frequent journal tokens.
top10_plots(top10_journals, 'Top 10 Journals by Number of Submissions')

In [None]:
# Bar chart of the ten most frequent location tokens.
top10_plots(top10_locations, 'Top 10 Locations by Number of Submissions')

In [None]:
# Bar chart of the ten most frequent language tokens.
top10_plots(top10_languages, 'Top 10 Languages by Number of Submissions')

In [None]:
# Bar chart of the ten most frequent term tokens.
top10_plots(top10_terms, 'Top 10 Terms')

In [None]:
sns.set(context = 'poster', style = 'white')

publication_years = df_clean['year'].sort_values(ascending = True)
xlabel = ''
ylabel = ''
yticks = ''
title = 'Number of Publications per Year'

# displot is a figure-level function: it creates its own figure and returns a
# FacetGrid, so no plt.figure() call is made here (it only rendered an empty figure).
grid = sns.displot(data = publication_years, kind = 'kde', color = '#63d297', aspect = 1.75, height = 10)
grid.set_xticklabels(rotation=90)
grid.set(xlabel = xlabel, yticklabels = yticks, ylabel = ylabel, xlim = (1940, 2020), title = title)

plt.show()

# Data Imputation with Categorical Modeling

In [None]:
# Partition the cleaned frame by reference count: positive counts vs. exactly zero.
reference_counts = df_clean['reference_number']
df_rf = df_clean.loc[reference_counts > 0].reset_index(drop = True)
df_no_rf = df_clean.loc[reference_counts == 0].reset_index(drop = True)

In [None]:
def relevant_labeling(rf, threshold = None):
    """
    Label a reference count as relevant (1) or not (0).

    A count is relevant when it is strictly greater than `threshold`. If no
    threshold is given, the 25th percentile of ``df_rf['reference_number']`` is used.
    """
    if threshold is None:
        threshold = df_rf['reference_number'].quantile(0.25)
    return 1 if rf > threshold else 0

# Compute the cut-off once; calling describe() inside the function recomputed the
# summary statistics for every row.
relevance_threshold = df_rf['reference_number'].quantile(0.25)
df_rf['relevant'] = df_rf['reference_number'].map(
    lambda rf: relevant_labeling(rf, relevance_threshold)
)

In [None]:
# Features are the cleaned term strings; the target is the 0/1 relevance label.
X, y = df_rf['terms'], df_rf['relevant']

# Hold out a quarter of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 42,
)

In [None]:
# Fit the bag-of-words vocabulary on the training split only, then reuse it for test.
cv = CountVectorizer()
train_counts = cv.fit_transform(X_train)
test_counts = cv.transform(X_test)

# Column labels are the vocabulary terms in sorted order, shared by both frames.
cv_columns = sorted(cv.vocabulary_)
X_train_cv = pd.DataFrame.sparse.from_spmatrix(train_counts, columns = cv_columns)
X_test_cv = pd.DataFrame.sparse.from_spmatrix(test_counts, columns = cv_columns)

In [None]:
# First simple model: logistic regression on the count features.
lr_fsm = LogisticRegression(max_iter=10000)
lr_fsm.fit(X_train_cv, y_train)
y_pred_fsm = lr_fsm.predict(X_test_cv)

# Print accuracy, precision, recall and F1 on the test split, in that order.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred_fsm))

In [None]:
# Settings searched for every solver.
shared_grid = {
    'max_iter': [10000],
    'C': [10, 1.0, 0.1],
    'tol': [0.0001, 0.1, 1, 100],
}

# Only solver/penalty pairs that LogisticRegression accepts are listed:
# liblinear handles l1 and l2, sag handles l2 only. 'elasticnet' needs the saga
# solver, which is not searched here, so those combinations could only fail.
param_grid = [
    {**shared_grid, 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {**shared_grid, 'solver': ['sag'], 'penalty': ['l2']},
]

gs1 = GridSearchCV(LogisticRegression(), param_grid)
gs1.fit(X_train_cv, y_train)

In [None]:
# Show the estimator that the grid search selected.
gs1.best_estimator_

In [None]:
# Logistic regression with explicitly chosen hyper-parameters.
lr = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.1, verbose=0,
                   warm_start=False)

lr.fit(X_train_cv, y_train)
y_pred = lr.predict(X_test_cv)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
# F1 now scores this model's predictions; it previously used y_pred_fsm,
# i.e. the baseline model's output.
print(f1_score(y_test, y_pred))

In [None]:
# Fit TF-IDF weights on the training split only, then reuse them for the test split.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_train_tfidf = pd.DataFrame.sparse.from_spmatrix(X_train_tfidf)
# Label columns from the TF-IDF vocabulary itself; the training frame previously
# borrowed the CountVectorizer's (`cv`) vocabulary.
X_train_tfidf.columns = sorted(tfidf.vocabulary_)

X_test_tfidf = tfidf.transform(X_test)
X_test_tfidf = pd.DataFrame.sparse.from_spmatrix(X_test_tfidf)
X_test_tfidf.columns = sorted(tfidf.vocabulary_)

In [None]:
# Same simple logistic regression, now on the TF-IDF features.
# lr_fsm and y_pred_fsm are rebound here, replacing the count-feature versions.
lr_fsm = LogisticRegression(max_iter=10000)
lr_fsm.fit(X_train_tfidf, y_train)
y_pred_fsm = lr_fsm.predict(X_test_tfidf)

# Print accuracy, precision, recall and F1 on the test split, in that order.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred_fsm))

# Content Based Recommender System with Cosine Similarities

# Explanatory Data Analysis

# Summary

In [373]:

# Sort by reference_number, largest first, and show the top row as a one-row frame.
df_clean.sort_values(by = ['reference_number'], ascending = False).iloc[[0], :]

Unnamed: 0,title,authors,journal,abstract,reference_number,location,language,year,terms
4321,Allergic Rhinitis and its Impact on Asthma (AR...,bousquet_j khaltaev_n cruz_aa denburg_j fokken...,allergy,,2241.0,denmark,eng,2008,adolescent asthma epidemiology etiology therap...
