## Syllabus Vectors

This notebook will attempt to recreate what was done for the Web of Science data with the institution and city vectors.

In [1]:
import json
import os
import re
from json import JSONDecoder, JSONDecodeError

## Metadata

The raw data carries little descriptive information on its own, so let us first gather some metadata.

In [2]:
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_non_alphanum, stem_text


In [3]:
# Matches the first character that is not whitespace.
NOT_WHITESPACE = re.compile(r'[^\s]')


def decode_stacked(document, pos=0, decoder=JSONDecoder()):
    """Lazily yield every JSON value concatenated inside ``document``.

    Args:
        document: String holding zero or more JSON values, optionally
            separated by whitespace.
        pos: Index in ``document`` at which scanning starts.
        decoder: ``JSONDecoder`` whose ``raw_decode`` parses each value.

    Yields:
        Each decoded value, in the order it appears in ``document``.

    Raises:
        JSONDecodeError: Propagated unchanged from ``raw_decode`` when the
            text at the current position is not valid JSON.
    """
    cursor = pos
    hit = NOT_WHITESPACE.search(document, cursor)
    while hit is not None:
        # raw_decode returns the value and the index just past it.
        value, cursor = decoder.raw_decode(document, hit.start())
        yield value
        hit = NOT_WHITESPACE.search(document, cursor)

In [None]:
# Accumulators for the teaching (syllabus) texts. The loading loop fills each
# dict with key -> list of preprocessed text strings.
teaching_texts_cities = {}
teaching_texts_orgs = {}

In [None]:
# Read every syllabus JSON file and group the cleaned text of US syllabi from
# after 2006 by institution name and by city.
SYLLABUS_DIR = "openSyReal/"

for filename in os.listdir(SYLLABUS_DIR):
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(SYLLABUS_DIR, filename), encoding='utf-8') as f:
        for line in f:
            # A single line may hold several concatenated JSON objects.
            for jsonfile in decode_stacked(line):
                try:
                    if jsonfile['grid_country_code'] != "US" or int(jsonfile['year']) <= 2006:
                        continue
                    raw_text = jsonfile['text']
                except KeyError:
                    # Records lacking country, year or text are skipped.
                    continue

                # NOTE(review): tabs are removed outright (not replaced by a
                # space like the other whitespace), which joins the words on
                # either side. Kept as-is in case the model was trained on
                # text prepared the same way -- confirm.
                text = remove_stopwords(strip_numeric(strip_non_alphanum(
                    raw_text.replace("\n", " ").replace("\r", " ").replace("\xa0", " ").replace("\t", "").lower())))

                # Group by institution; records without a usable name are
                # simply left out of the institution grouping.
                name = jsonfile.get('NAME')
                if name is not None:
                    teaching_texts_orgs.setdefault(name, []).append(text)

                # Group by city, keyed on the record's own CITY value.
                city = jsonfile.get('CITY')
                if city is not None:
                    teaching_texts_cities.setdefault(city, []).append(text)
                        

### Models

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import TfidfModel
import gensim
from gensim.corpora import Dictionary
import numpy as np

In [None]:
def create_vector(document, model, method):
    """Turn one document into a vector using the given model.

    Args:
        document: The document to embed. For ``"doc2vec"`` this may be a list
            of word tokens or a single string; a string is split on
            whitespace first because ``infer_vector`` expects a token list.
            For ``"word2vec"`` and ``"tfidf"`` it is passed to the model
            unchanged.
        model: Object providing ``infer_vector`` (for ``"doc2vec"``) or
            item lookup via ``model[document]`` (for the other methods).
        method: One of ``"doc2vec"``, ``"word2vec"`` or ``"tfidf"``.

    Returns:
        Whatever the model produces for the document.

    Raises:
        ValueError: If ``method`` is not one of the supported names
            (previously this case silently returned ``None``).
    """
    if method == "doc2vec":
        if isinstance(document, str):
            # A bare string would be rejected (or iterated character by
            # character) by infer_vector, so tokenise it here.
            document = document.split()
        return model.infer_vector(document)
    if method in ("word2vec", "tfidf"):
        return model[document]
    raise ValueError("unknown vectorisation method: {!r}".format(method))

In [None]:
# Load the saved Doc2Vec model from "alldoc2vec"; it is the model passed to
# entity_vector for every research and teaching vector computed later.
d2vmodel = Doc2Vec.load("alldoc2vec")

### Let us now aggregate city-wise and institution-wise



In [None]:
def entity_vector(texts, model, method):
    """Summarise all texts belonging to one entity as a single vector.

    Args:
        texts: Documents belonging to the entity.
        model: Model forwarded to ``create_vector``.
        method: Vectorisation method name forwarded to ``create_vector``.

    Returns:
        A 3-tuple ``(mean, variance, count)``: the mean and variance of the
        document vectors taken along axis 0, and ``len(texts)``.
    """
    doc_vectors = [create_vector(doc, model, method) for doc in texts]
    # Plain centroid of the document vectors; a more elaborate aggregation
    # could be substituted here.
    centroid = np.mean(doc_vectors, axis=0)
    spread = np.var(doc_vectors, axis=0)
    return (centroid, spread, len(texts))

## WoS Data

We're going to now load up and use the WoS data and compare our vectors.

In [None]:
import pandas as pd

In [None]:
# The Web of Science exports are stored as one CSV per publication year.
# Reading them in a loop replaces eleven copy-pasted cells and keeps the year
# range in a single place.
WOS_YEARS = range(2006, 2017)  # 2006 through 2016 inclusive

frames = [
    pd.read_csv("data_files_USA/data_{}_US.csv".format(year))
    for year in WOS_YEARS
]

# Stack the yearly frames, in year order, into one table.
wos = pd.concat(frames)

In [None]:
# Maps each WoS `city` value to the list of preprocessed abstracts from that city.
research_texts_cities = {}

In [None]:
# Maps each WoS `org` value to the list of preprocessed abstracts from that organisation.
research_texts_orgs = {}

In [None]:
# Group every preprocessed abstract under both its city and its organisation.
for paper in wos.itertuples(index=True):
    try:
        text = remove_stopwords(strip_numeric(strip_non_alphanum(paper.abstract.lower())))
    except AttributeError:
        # Rows whose abstract has no .lower() (i.e. is not a string) are skipped.
        continue

    research_texts_cities.setdefault(paper.city, []).append(text)
    research_texts_orgs.setdefault(paper.org, []).append(text)

In [None]:
# The abstracts have been copied into the research_texts_* dicts, so the
# combined frame is no longer needed. Note that `frames` still references the
# per-year DataFrames, so only the concatenated table is released here.
del wos

In [None]:
import gc
# Run a garbage-collection pass right after deleting the combined frame.
gc.collect()

### Final Vectors

In [None]:
# Results for the research (WoS) texts: entity -> the (mean, variance, count)
# tuple returned by entity_vector.
research_vectors_cities = {}
research_vectors_orgs = {}

In [None]:
# Results for the teaching (syllabus) texts: entity -> the (mean, variance,
# count) tuple returned by entity_vector.
teaching_vectors_cities = {}
teaching_vectors_orgs = {}

In [None]:
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_non_alphanum, stem_text

In [None]:
# Vectorise each city's research texts. Cities already present in the result
# dict, or with no texts, are skipped.
for city, texts in research_texts_cities.items():
    needs_vector = city not in research_vectors_cities
    if texts and needs_vector:
        research_vectors_cities[city] = entity_vector(texts, d2vmodel, "doc2vec")

In [None]:
# Vectorise each organisation's research texts. Organisations already present
# in the result dict, or with no texts, are skipped.
for org, texts in research_texts_orgs.items():
    needs_vector = org not in research_vectors_orgs
    if texts and needs_vector:
        research_vectors_orgs[org] = entity_vector(texts, d2vmodel, "doc2vec")

In [None]:
# Vectorise each key's teaching texts in teaching_texts_cities. Keys already
# present in the result dict, or with no texts, are skipped.
for city, texts in teaching_texts_cities.items():
    needs_vector = city not in teaching_vectors_cities
    if texts and needs_vector:
        teaching_vectors_cities[city] = entity_vector(texts, d2vmodel, "doc2vec")

In [None]:
# Vectorise each organisation's teaching texts. Organisations already present
# in the result dict, or with no texts, are skipped.
for org, texts in teaching_texts_orgs.items():
    needs_vector = org not in teaching_vectors_orgs
    if texts and needs_vector:
        teaching_vectors_orgs[org] = entity_vector(texts, d2vmodel, "doc2vec")

In [None]:
# Persist the city-level research vectors as JSON. The values are tuples that
# contain numpy arrays, which json cannot encode directly, so the `default`
# hook converts any such object to plain Python lists/numbers via tolist().
with open('research_vectors_cities.txt', 'w') as file:
    file.write(json.dumps(research_vectors_cities, default=lambda value: value.tolist()))

In [None]:
# Persist the city-level teaching vectors as JSON. The values are tuples that
# contain numpy arrays, which json cannot encode directly, so the `default`
# hook converts any such object to plain Python lists/numbers via tolist().
with open('teaching_vectors_cities.txt', 'w') as file:
    file.write(json.dumps(teaching_vectors_cities, default=lambda value: value.tolist()))

In [None]:
# Persist the organisation-level research vectors as JSON. The values are
# tuples that contain numpy arrays, which json cannot encode directly, so the
# `default` hook converts any such object to plain lists/numbers via tolist().
with open('research_vectors_orgs.txt', 'w') as file:
    file.write(json.dumps(research_vectors_orgs, default=lambda value: value.tolist()))

In [None]:
# Persist the organisation-level teaching vectors as JSON. The values are
# tuples that contain numpy arrays, which json cannot encode directly, so the
# `default` hook converts any such object to plain lists/numbers via tolist().
with open('teaching_vectors_orgs.txt', 'w') as file:
    file.write(json.dumps(teaching_vectors_orgs, default=lambda value: value.tolist()))