# Connecting Trends to Scholars@TAMU  
**Filename:** trends.ipynb  
**Path:** TAMIDS/Code/Scholars@TAMU Data/trends.ipynb  
**Created Date:** 05 April 2022, 01:54 

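This notebook connects current online trends (so far, Wikipedia's most-viewed articles) to scholars in Scholars@TAMU by measuring text similarity between each trending article and scholars' keywords and publication abstracts.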

In [23]:
from IPython.display import Markdown, display, HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import requests
import jieba
from gensim import corpora, models, similarities
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup

# Show floats with thousands separators and three decimals in DataFrame output.
pd.options.display.float_format = '{:,.3f}'.format
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names; use whichever name this installation provides.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

# General Markdown Formatting Functions

def printmd(string, level=1):
    """Display ``string`` as a Markdown heading with ``level`` leading '#'."""
    heading = ''.join(('#' * level, ' ', string))
    display(Markdown(heading))

## NLP Prep

In [24]:
def run_gensim_similarities(text_dict: dict, keyword: str) -> dict:
    """
    Score every text in ``text_dict`` against ``keyword`` using a TF-IDF model
    built from the texts themselves.

    text_dict: dict[key: text] - bodies of text to compare against the keyword
    keyword: str - query text; tokenized the same way as the texts

    returns: dict[key: similarity_num], one entry per key of ``text_dict``
             ({} when ``text_dict`` is empty)
    """
    # Nothing to compare against: skip building an empty dictionary/index.
    if not text_dict:
        return {}

    cut_texts = [jieba.lcut(text) for text in text_dict.values()]

    dictionary = corpora.Dictionary(cut_texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in cut_texts]
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=len(dictionary.token2id)
    )

    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))
    sim = index[tfidf[kw_vector]]
    # dicts iterate in insertion order, so the keys line up with the rows of sim.
    return dict(zip(text_dict, sim))

## Wikipedia Prep

In [5]:
def get_api_dict(url: str, kind=None, timeout: float = 10.0) -> dict:
    """
    GET ``url`` and return its decoded JSON body.

    url: str - endpoint to request
    kind: pass 'Wikipedia' to send the identifying User-Agent/From headers;
          any other value sends the request without extra headers
    timeout: seconds to wait on the server before giving up, so a stalled
             connection cannot hang the notebook

    returns: the parsed JSON, or {} when the request or the decoding fails
             (the error is printed rather than raised)
    """
    headers = None
    if kind == 'Wikipedia':
        headers = {
            'User-Agent': 'My User Agent',
            'From': 'abibstopher@tamu.edu'
        }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        jsonResponse = response.json()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        # Deliberately broad: this helper is best-effort and reports, not raises.
        print(f'Other error occurred: {err}')
    else:
        return jsonResponse
    return {}

def get_wikipedia_article(title: str, timeout: float = 10.0) -> str:
    """
    Download an English Wikipedia article and return its paragraph text.

    title: str - article title as used in the /wiki/ URL (e.g. 'Main_Page')
    timeout: seconds to wait on the server before giving up

    returns: the text of all <p> elements concatenated, with bracketed spans
             (e.g. '[1]') and newlines removed; '' when the page cannot be
             fetched or parsed (the error is printed rather than raised)
    """
    from urllib.parse import quote

    # Escape the title so characters such as '?' or '#' stay part of the path
    # instead of starting a query string or fragment.
    url = 'https://en.wikipedia.org/wiki/' + quote(title, safe='/:')
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        # Deliberately broad: this helper is best-effort and reports, not raises.
        print(f'Other error occurred: {err}')
    else:
        text = ''.join(paragraph.text for paragraph in soup.find_all('p'))
        text = re.sub(r'\[.*?\]+', '', text)
        return text.replace('\n', '')
    # Always hand back a string so callers can tokenize the result directly.
    return ''

def get_top_wiki_views(access='all-access', date='2022/03/all-days') -> dict:
    """
    Fetch the most-viewed en.wikipedia pages from the Wikimedia pageviews API.

    access: access-method segment of the URL (default 'all-access')
    date: date segment of the URL (default '2022/03/all-days')

    returns: whatever get_api_dict returns for the built URL - the decoded
             JSON response, or {} when the request fails
    """
    base_url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/'
    return get_api_dict(url=f'{base_url}{access}/{date}', kind='Wikipedia')

## Loading Data

In [30]:
# Load the previously saved Scholars@TAMU people and publications tables.
# NOTE(review): unpickling can execute code stored in the file; only load
# pickles that were produced locally by a trusted process.
people = pd.read_pickle('../../Data/Scholars@TAMU/my_api_calls/people_df.pickle')
publications = pd.read_pickle('../../Data/Scholars@TAMU/my_api_calls/publications_df.pickle')

### pubs abstract sample

In [70]:
# Fixed-seed 1000-row sample, kept for quick experiments on a subset.
pubs_sample = publications.sample(n=1000, random_state=1)

# Map each publications index label to its abstract; non-string abstracts
# (e.g. missing values) become '' so every value can be tokenized.
# To run on the sample only, iterate pubs_sample['abstract'] instead.
abstract_dict = {
    key: abstract if isinstance(abstract, str) else ''
    for key, abstract in publications['abstract'].items()
}

In [81]:
people

# One space-joined keyword string per person, keyed by the people index label.
keywords_dict = {
    person_id: ' '.join(person_keywords)
    for person_id, person_keywords in people['keywords'].items()
}

## Wikipedia Trends

In [88]:
# Pages that top the view counts but are not real articles.
EXCLUDE = ['Main_Page', 'Special:Search']

top_views = get_top_wiki_views()

# Full ranked list, left unfiltered so positional lookups into it
# (e.g. top_articles[2]) keep their meaning.
top_articles = list(top_views['items'][0]['articles'])

# wiki_text = get_wikipedia_article(top_articles[2]['article'])

# Title -> [article text, rank, views] for the 100 most-viewed real articles.
# EXCLUDE entries are dropped before slicing so they do not use up slots.
wiki_texts_100 = {
    article['article']: [get_wikipedia_article(article['article']), article['rank'], article['views']]
    for article in [a for a in top_articles if a['article'] not in EXCLUDE][:100]
}

In [89]:
# For every trending article, score each person's keyword string against the
# article text; one Series (indexed like keywords_dict) per article title.
similarity_series = {
    title: pd.Series(run_gensim_similarities(text_dict=keywords_dict, keyword=article_text))
    for title, (article_text, _rank, _views) in wiki_texts_100.items()
}

In [90]:
# Combine the per-article Series into one table: one column per article title
# (the dict keys), rows taken from the Series index.
df = pd.DataFrame(similarity_series)

In [98]:
# df.idxmax()
# NOTE(review): as written this is a bare list literal, yet the recorded
# output of this cell is a keyword string - the expression was presumably an
# index into keywords_dict or people; confirm against the original notebook.
['n032647a0']

"2.5 Research Design And Methodologies (aetiology) Bioengineering Graphical Models Kernel Machines Generalized Linear Mixed Models Functional Data Analysis Bugs Spatial Statistics Monte Carlo Methods Hierarchical Bayesian Models Boosting Longitudinal Data Analysis Mixed Models Penalized Splines Asymptotics Middle Aged Energy Intake Female Nutritional Status Potassium Adult Surveys And Questionnaires Reproducibility Of Results Population Surveillance Aged Dietary Proteins Biomarkers Humans Diet Male Measurement Error Models Truncated Normals Gibbs Sampling Bayesian Methods Nonparametric Regression Generalised Estimation Equation Kernel Method Nonparametric Regression Sandwich Estimator Clustered Data Profile Method Partially Linear Model Longitudinal Data Semiparametric Efficiency Bound Semiparametric Efficient Score Inverse Probability Weighting Missing Response Structural Measurement Error Functional Measurement Error Longitudinal Data Measurement Error Generalized Method Of Moments C

In [92]:
# Persist the article-vs-keywords similarity table for later reuse.
df.to_pickle('../../Data/Scholars@TAMU/my_api_calls/wikipedia_keyword_comparison.pickle')

In [72]:
# wiki_text must be defined before it is used here; its only other assignment
# lives in a later cell, so a top-to-bottom run would raise NameError.
wiki_text = get_wikipedia_article(top_articles[2]['article'])
d = run_gensim_similarities(text_dict=abstract_dict, keyword=wiki_text)

In [78]:
# Wrap the similarity dict in a Series so it can be sorted and filtered.
nlp = pd.Series(d)

# Print the non-zero similarities, highest first.
print(nlp.sort_values(ascending=False)[nlp != 0])

# Show the abstract of one publication, looked up by its index label.
publications.loc['n518675SE']['abstract']
# publications.loc['n400269SE']['abstract']

n518675SE   0.877
n227562SE   0.876
n193536SE   0.864
n146780SE   0.853
n217539SE   0.850
             ... 
n352604SE   0.002
n389166SE   0.001
n103223SE   0.001
n296947SE   0.001
n601755SE   0.000
Length: 143561, dtype: float32


'<jats:p>One major characteristic of studies in operations and supply chain                         management literature is a focus on how integration can lead to superior                         operations and manufacturing outcomes. Most of these studies, however, focus                         only on internal or external integration and few have been dedicated to                         understand how both internal and external integration influence performance                         outcomes. In addition, few studies, if any, have looked to the antecedents                         of organizational structure as a driver for such forms of integration. To                         help filling this gap, we draw on organizational structure and                         resource-based view theoretical perspectives to present a conceptual model                         that proposes a relationship between organizational structure and                         integration. The model also consi

In [82]:
# Fetch the article at position 2 of top_articles and score every person's
# keyword string against its text.
# NOTE(review): index 2 presumably skips the two EXCLUDE pages at the top of
# the ranking - confirm.
wiki_text = get_wikipedia_article(top_articles[2]['article'])
keyword_similarities = run_gensim_similarities(text_dict=keywords_dict, keyword=wiki_text)

In [84]:
# Wrap the per-person similarity dict in a Series so it can be sorted and filtered.
nlp = pd.Series(keyword_similarities)

# Print the non-zero similarities, highest first.
print(nlp.sort_values(ascending=False)[nlp != 0])

# Show the full people record for one index label.
people.loc['n032647a0']

n032647a0   0.649
nbb6c8c2a   0.646
n47de353a   0.642
n7deb8230   0.639
n70a3d026   0.639
             ... 
nd13e4d2d   0.046
n2f2205de   0.041
n5c077f89   0.033
nff831fe5   0.028
nfa5c67e9   0.027
Length: 3068, dtype: float32


uin                                                        402000618
lastname                                                     Carroll
middlename                                                         J
firstname                                                    Raymond
email                                              rcarroll@tamu.edu
preferred_title                              Distinguished Professor
employment_type                                              Faculty
research_areas                                                    []
keywords           [2.5 Research Design And Methodologies (aetiol...
colleges                                        [College of Science]
organizations                                           [Statistics]
education          [{'id': 'n032647a0_3482b02b-b399-11e9-adb7-001...
teaching           [{'id': 'n62393ae6', 'label': 'STAT485 Directe...
publications       [{'id': 'n295784SE', 'label': 'Hierarchical fu...
hr_title                          