# Connecting Trends to Scholars@TAMU  
**Filename:** trends.ipynb  
**Path:** TAMIDS/Code/Scholars@TAMU Data/trends.ipynb  
**Created Date:** 05 April 2022, 01:54 

I connect the current trends of various online services to Scholars at Texas A&M.

In [23]:
from IPython.display import Markdown, display, HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import requests
import jieba
from gensim import corpora, models, similarities
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup

pd.options.display.float_format = '{:,.3f}'.format
plt.style.use('seaborn-darkgrid')

# General Markdown Formatting Functions

def printmd(string, level=1):
    header_level = '#'*level + ' '
    display(Markdown(header_level + string))

## NLP Prep

In [24]:
def run_gensim_similarities(text_dict: dict, keyword: str) -> dict:
    """
    texts: dict[pub_api_id: text] - bodies of texts to compare against the keyword
    keyword: str

    returns: dict[key: similarity_num]
    """

    keys, texts = text_dict.keys(), text_dict.values()
    cut_texts = [jieba.lcut(text) for text in texts]

    dictionary = corpora.Dictionary(cut_texts)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in cut_texts]
    tfidf = models.TfidfModel(corpus)
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = feature_cnt)
    sim = index[tfidf[kw_vector]]
    return {key: val for key, val in zip(keys, sim)}

## Wikipedia Prep

In [5]:
def get_api_dict(url: str, kind=None) -> dict:
    try:
        if kind == 'Wikipedia':
            headers = {
                'User-Agent': 'My User Agent',
                'From': 'abibstopher@tamu.edu'
            }
            response = requests.get(url, headers=headers)
        else:
            response = requests.get(url)
        response.raise_for_status()
        jsonResponse = response.json()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        return jsonResponse
    return {}

def get_wikipedia_article(title: str) -> str:
    url = 'https://en.wikipedia.org/wiki/' + title
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        text = ''
        for paragraph in soup.find_all('p'):
            text += paragraph.text
            
        text = re.sub(r'\[.*?\]+', '', text)
        text = text.replace('\n', '')
        return text

def get_top_wiki_views(access='all-access', date='2022/03/all-days') -> str:
    base_url = f'https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/'
    url = base_url + access + '/' + date
    return get_api_dict(url=url, kind='Wikipedia')

## Loading Data

In [30]:
people = pd.read_pickle('../../Data/Scholars@TAMU/my_api_calls/people_df.pickle')
publications = pd.read_pickle('../../Data/Scholars@TAMU/my_api_calls/publications_df.pickle')

### pubs abstract sample

In [70]:
pubs_sample = publications.sample(n=1000, random_state=1)

# abstract_dict = {key: pubs_sample['abstract'][key] if type(pubs_sample['abstract'][key]) == str else '' for key in pubs_sample.index.to_list()}
abstract_dict = {key: publications['abstract'][key] if type(publications['abstract'][key]) == str else '' for key in publications.index.to_list()}

In [81]:
people

keywords_dict = {key: ' '.join(people['keywords'][key]) for key in people.index.to_list()}

## Wikipedia Trends

In [88]:
EXCLUDE = ['Main_Page', 'Special:Search']

top_views = get_top_wiki_views()

top_articles = [article for article in top_views['items'][0]['articles']]

# wiki_text = get_wikipedia_article(top_articles[2]['article'])

[article for article in top_articles]

wiki_texts_100 = {article['article']: [get_wikipedia_article(article['article']), article['rank'], article['views']] for article in top_articles[:100]}

In [89]:
similarity_series = {key: pd.Series(run_gensim_similarities(text_dict=keywords_dict, keyword=value[0])) for key, value in wiki_texts_100.items()}

In [90]:
df = pd.DataFrame(similarity_series)

In [138]:
df

Unnamed: 0,Main_Page,Special:Search,2022_Russian_invasion_of_Ukraine,Vladimir_Putin,The_Batman_(film),Ukraine,Volodymyr_Zelenskyy,The_Kashmir_Files,Russo-Ukrainian_War,Anna_Sorokin,...,Wagner_Group,F5_Networks,Dune_(2021_film),Casualties_of_the_Russo-Ukrainian_War,Pieces_of_Her_(TV_series),Exodus_of_Kashmiri_Hindus,West_Side_Story_(2021_film),Morbius_(film),XXX:_State_of_the_Union,XXX_(film_series)
n28cb7333,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
n014c3d0f,0.471,0.000,0.537,0.513,0.541,0.483,0.520,0.501,0.521,0.564,...,0.507,0.560,0.524,0.529,0.535,0.543,0.501,0.530,0.517,0.535
n7a168a93,0.471,0.000,0.538,0.513,0.542,0.484,0.521,0.501,0.522,0.564,...,0.507,0.561,0.525,0.530,0.536,0.544,0.502,0.531,0.518,0.536
nbccd1f64,0.315,0.000,0.359,0.343,0.362,0.323,0.348,0.335,0.349,0.378,...,0.339,0.374,0.351,0.355,0.358,0.363,0.336,0.354,0.346,0.358
n18de9127,0.481,0.000,0.548,0.524,0.554,0.494,0.532,0.512,0.533,0.575,...,0.518,0.567,0.535,0.541,0.546,0.555,0.512,0.542,0.529,0.548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nc1e62471,0.488,0.000,0.555,0.531,0.561,0.500,0.539,0.519,0.540,0.583,...,0.525,0.575,0.543,0.547,0.553,0.562,0.519,0.549,0.536,0.556
n14b2580b,0.340,0.000,0.388,0.372,0.393,0.350,0.378,0.364,0.378,0.406,...,0.367,0.394,0.379,0.382,0.383,0.394,0.362,0.384,0.374,0.389
n4f37dfa5,0.177,0.000,0.202,0.190,0.200,0.180,0.193,0.184,0.193,0.215,...,0.189,0.228,0.196,0.201,0.206,0.201,0.189,0.198,0.194,0.196
n0e788fcb,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [92]:
df.to_pickle('../../Data/Scholars@TAMU/my_api_calls/wikipedia_keyword_comparison.pickle')

In [82]:
wiki_text = get_wikipedia_article(top_articles[2]['article'])
keyword_similarities = run_gensim_similarities(text_dict=keywords_dict, keyword=wiki_text)

In [84]:
nlp = pd.Series(keyword_similarities)

print(nlp.sort_values(ascending=False)[nlp != 0])

people.loc['n032647a0']

n032647a0   0.649
nbb6c8c2a   0.646
n47de353a   0.642
n7deb8230   0.639
n70a3d026   0.639
             ... 
nd13e4d2d   0.046
n2f2205de   0.041
n5c077f89   0.033
nff831fe5   0.028
nfa5c67e9   0.027
Length: 3068, dtype: float32


uin                                                        402000618
lastname                                                     Carroll
middlename                                                         J
firstname                                                    Raymond
email                                              rcarroll@tamu.edu
preferred_title                              Distinguished Professor
employment_type                                              Faculty
research_areas                                                    []
keywords           [2.5 Research Design And Methodologies (aetiol...
colleges                                        [College of Science]
organizations                                           [Statistics]
education          [{'id': 'n032647a0_3482b02b-b399-11e9-adb7-001...
teaching           [{'id': 'n62393ae6', 'label': 'STAT485 Directe...
publications       [{'id': 'n295784SE', 'label': 'Hierarchical fu...
hr_title                          

## Example DataFrame Slices

In [137]:
ids = ['n14bc37c1', 'nf489b17d', 'nc47b6f90', 'n731c9f84', 'nbef1f2f3', 'na9a3fabc', 'n9894eb30']

people.loc[ids]

def get_author_name(x: object) -> list[str]:
    if type(x) == list:
        try:
            return [f"{people['firstname'].loc[id]} {people['lastname'].loc[id]}" for id in x]
        except KeyError:
            return x
    else:
        return x

publications['author_names'] = publications['author_ids'].apply(get_author_name)

def find_match(x: object) -> bool:
    if type(x) == list:
        if (set(x) & set(ids)):
            return True
    
    return False

publications[publications['author_ids'].apply(find_match)].sample(n=10)

Unnamed: 0_level_0,author_ids,author_uins,year,publication_type,publication_title,keyword,un_sustainable_development_goals,author_organization,author_city,author_country,abstract,author_names
publication_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
n584788SE,[n14bc37c1],[323009677],2011,Patent,Scalable traffic classifier and classifier tra...,,,,,,A traffic classifier has a plurality of binary...,[Nicholas Duffield]
n584764SE,[n14bc37c1],[323009677],2004,Conference,Flow sampling under hard resource constraints,,,,,,,[Nicholas Duffield]
n225271SE,[n14bc37c1],[323009677],2006,Conference,"Sampling Techniques for Large, Dynamic Graphs",,,"[University of Oregon, AT&amp;T Inc.]","[Eugene, San Antonio]","[United States, United States]",Peer-to-peer systems are becoming increasingly...,[Nicholas Duffield]
n584745SE,[n14bc37c1],[323009677],2019,Conference,Piecewise Stationary Modeling of Random Proces...,,,,,,,[Nicholas Duffield]
n92767SE,[nbef1f2f3],[126005180],2014,Conference,Data curation practices in institutional repos...,,,[Florida State University],[Tallahassee],[United States],,[Dong-Joon Lee]
n367437SE,[nbef1f2f3],[126005180],2018,Journal Article,Researchers' uses of and disincentives for sha...,"[Computer Science, Information Science & Libra...",,"[Florida State University, Queens College, Cit...","[Tallahassee, Flushing, College Station]","[United States, United States, United States]",,[Dong-Joon Lee]
n392518SE,[n731c9f84],[527006626],2009,Conference,Multi-label multiple kernel learning,,,"[Arizona State University, Michigan State Univ...","[Tempe, East Lansing]","[United States, United States]",We present a multi-label multiple kernel learn...,[Shuiwang Ji]
n92775SE,[nbef1f2f3],[126005180],2012,Conference,"Data determination, disambiguation, and refere...",,,[Florida State University],[Tallahassee],[United States],"Entity and instance determination, disambiguat...",[Dong-Joon Lee]
n408107SE,"[n3e0f7747, nbef1f2f3, n31ebd4a6, nf489b17d]","[402001311, 126005180, 116006104, 601003827]",2019,Institutional Repository Document (TAMU),Scholars@TAMU Texas A&M University Librariesâ€™,,,,,,Texas A&M University Libraries has been using ...,"[Douglas Hahn, Dong-Joon Lee, Ethelyn Mejia, B..."
n392531SE,[n731c9f84],[527006626],2010,Journal Article,A shared-subspace learning framework for multi...,[Computer Science],,"[Arizona State University, Computer-Aided Diag...","[Tempe, Malvern]","[United States, United States]",Multi-label problems arise in various domains ...,[Shuiwang Ji]
