## This notebook will clean and create the combined embedding space

We will clean the teaching (syllabus), research (wos), and jobs (BG) spaces.

First we'll open each source and collect its texts and metadata, then train a single embedding space on the combined corpus.

### Functions

In [None]:
from json import JSONDecoder, JSONDecodeError
import re
import os

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import TfidfModel
import gensim
from gensim.corpora import Dictionary
import numpy as np

In [None]:
import pandas as pd

In [None]:
import zipfile
import xml.etree.cElementTree as ET
import os

In [None]:
NOT_WHITESPACE = re.compile(r'[^\s]')


def decode_stacked(document, pos=0, decoder=JSONDecoder()):
    """Yield, in order, each JSON value stored back to back in ``document``.

    Args:
        document: string holding zero or more JSON values separated only by
            optional whitespace.
        pos: offset in ``document`` at which scanning starts.
        decoder: ``JSONDecoder`` used to parse each value.

    Yields:
        The decoded Python object for every JSON value found.

    Raises:
        JSONDecodeError: propagated unchanged when a value cannot be parsed.
    """
    next_token = NOT_WHITESPACE.search(document, pos)
    while next_token:
        # Begin decoding at the first non-whitespace character; raw_decode
        # hands back the offset just past the value it consumed.
        value, pos = decoder.raw_decode(document, next_token.start())
        yield value
        next_token = NOT_WHITESPACE.search(document, pos)

In [None]:
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_non_alphanum, stem_text

In [None]:
def clean_corpus(corpus, method="default"):
    """Clean every document of ``corpus`` with the selected strategy.

    Args:
        corpus: iterable of raw documents (strings).
        method: one of

            * ``"default"`` - lower-case the text, strip non-alphanumeric
              characters and digits, then remove stopwords. Each cleaned
              document is still a single string. Entries that are not
              strings (e.g. ``None`` or NaN for a missing text) are skipped.
            * ``"basic"`` - tokenise with ``gensim.utils.simple_preprocess``;
              each cleaned document is a list of tokens. Entries for which
              that call raises ``TypeError`` are skipped.
            * ``"advanced"`` - spaCy pipeline; each cleaned document is the
              list of lemmas of the tokens that are not stop words,
              punctuation, number-like, or the literal ``'I'``.

    Returns:
        list of cleaned documents; an empty list for an unrecognised
        ``method``. Because unusable entries are skipped, the result can be
        shorter than ``corpus``.
    """
    cleaned = []

    if method == "default":
        for line in corpus:
            # Missing texts arrive as None/NaN; skip them (as "basic" does)
            # instead of failing on ``.lower()``.
            if not isinstance(line, str):
                continue
            cleaned.append(remove_stopwords(strip_numeric(strip_non_alphanum(line.lower()))))
        return cleaned

    if method == "basic":
        for line in corpus:
            try:
                cleaned.append(gensim.utils.simple_preprocess(line))
            except TypeError:
                continue
        return cleaned

    if method == "advanced":
        # Imported here because nothing else in the file imports spaCy and
        # only this branch needs it.
        import spacy

        # NOTE(review): the "en" shortcut is only understood by older spaCy
        # releases - confirm against the installed version.
        nlp = spacy.load("en")
        for abstract in corpus:
            doc = nlp(abstract)
            # keep the lemma of every token that carries content
            cleaned.append([
                w.lemma_
                for w in doc
                if not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I'
            ])
        return cleaned

    return cleaned

In [None]:
def create_model(corpus, method, **kwargs):
    """Load or train a model for ``corpus``.

    Args:
        corpus: re-iterable collection of documents. For ``"doc2vec"`` a
            document may be a list of tokens or a whitespace-delimited
            string; for ``"word2vec"`` and ``"tfidf"`` documents are handed
            to gensim unchanged.
        method: ``"doc2vec"``, ``"word2vec"`` or ``"tfidf"``.
        **kwargs:
            model_address: path of a saved model. When present and truthy
                the model is loaded instead of trained (``"doc2vec"`` and
                ``"word2vec"`` only).
            size, hs: Word2Vec hyper-parameters. They are used only when
                both are supplied; otherwise ``size=200, hs=1`` is used.

    Returns:
        * ``"doc2vec"``: a ``Doc2Vec`` model.
        * ``"word2vec"``: ``KeyedVectors`` when loaded from disk or trained
          with the defaults, the full ``Word2Vec`` model when ``size`` and
          ``hs`` were supplied.
        * ``"tfidf"``: ``(TfidfModel, new_corpus)`` where ``new_corpus`` is
          the bag-of-words corpus rebuilt after removing low-weight tokens.
        * ``None`` for any other ``method``.
    """
    model_address = kwargs.get("model_address")

    if method == "doc2vec":
        if model_address:
            # Doc2Vec.load has no ``binary`` option (that flag belongs to
            # the word2vec-format loader), so only the path is passed.
            return Doc2Vec.load(model_address)
        # vector size?
        model = Doc2Vec(vector_size=100, min_count=5, epochs=40)
        # TaggedDocument needs a list of tokens: a plain string would be
        # consumed character by character, so split strings on whitespace.
        # Each document is tagged with its position in ``corpus``.
        train_corpus = [
            TaggedDocument(doc.split() if isinstance(doc, str) else doc, [tag])
            for tag, doc in enumerate(corpus)
        ]
        model.build_vocab(train_corpus)
        model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
        return model

    if method == "word2vec":
        if model_address:
            return KeyedVectors.load_word2vec_format(model_address, binary=True)
        # NOTE(review): ``size`` is the gensim 3.x keyword (renamed
        # ``vector_size`` in gensim 4) - confirm the installed version.
        if "size" in kwargs and "hs" in kwargs:
            # for now, for allowing for our own parameters
            return Word2Vec(corpus, size=kwargs["size"], hs=kwargs["hs"])
        # basic word2vec model; only the word vectors are returned
        return Word2Vec(corpus, size=200, hs=1).wv

    if method == "tfidf":
        dct = Dictionary(corpus)
        bow_corpus = [dct.doc2bow(line) for line in corpus]
        model = TfidfModel(bow_corpus)

        # Collect the ids of tokens whose tf-idf weight is below the
        # threshold in at least one document. A set avoids storing the same
        # id once per occurrence.
        low_value = 0.2
        low_value_ids = set()
        for bow in bow_corpus:
            low_value_ids.update(
                token_id for token_id, value in model[bow] if value < low_value
            )

        # NOTE(review): filter_tokens may renumber the remaining token ids,
        # in which case ``model`` (fitted on the original ids) would not
        # line up with ``new_corpus`` - confirm before combining them.
        dct.filter_tokens(bad_ids=low_value_ids)
        new_corpus = [dct.doc2bow(doc) for doc in corpus]

        return model, new_corpus

    # Unrecognised method: nothing is built.
    return None

### Syllabus

In [None]:
# Accumulators for the syllabus data: the raw texts and, in parallel, the year
# of each text.
# NOTE(review): ``syllabus_text`` is never used in the visible code - possibly
# a leftover of ``syllabus_texts``; confirm before removing.
texts, syllabus_text, syllabus_years = [], [], []

In [None]:
# Collect the text and the year of every US syllabus stored in the JSON files
# under openSyReal/.
for filename in os.listdir("openSyReal/"):
    if not filename.endswith(".json"):
        continue
    with open("openSyReal/" + filename, encoding='utf-8') as f:
        for line in f:
            # a line may hold several JSON objects back to back
            for jsonfile in decode_stacked(line):
                # records without a country code, or from another country,
                # are ignored
                if jsonfile.get('grid_country_code') != "US":
                    continue
                # a record without text contributes nothing (and no year, so
                # the two lists stay aligned)
                if 'text' not in jsonfile:
                    continue
                texts.append(jsonfile['text'])
                # None marks a syllabus whose year is missing
                syllabus_years.append(jsonfile.get('year'))

In [None]:
# Positions of the syllabi dated 2006 or later; syllabi with no year (None)
# are left out.
indices_2006 = [
    i for i, year in enumerate(syllabus_years)
    if year is not None and year >= 2006
]

In [None]:
# Keep only the syllabi selected by indices_2006. Plain list indexing is used
# rather than a NumPy round-trip: a NumPy string array pads every entry to the
# length of the longest text, which is very costly for long documents.
syllabus_texts = [texts[i] for i in indices_2006]

### Jobs

In [None]:
# Raw text of each job posting; filled by the loading loop that follows.
job_texts = []

In [None]:
# Gather the text of every job posting from the zipped XML files, one
# sub-folder of the data directory at a time.
bg_root = "/project2/jevans/BG/Text Data/"
for folder in os.listdir(bg_root):
    folder_path = os.path.join(bg_root, folder)
    if not os.path.isdir(folder_path):
        continue
    for filename in os.listdir(folder_path):
        # The context managers close the archive and its member once the XML
        # has been parsed, so handles do not pile up across files.
        with zipfile.ZipFile(os.path.join(folder_path, filename)) as zfile:
            # only the first member of each archive is read
            with zfile.open(zfile.infolist()[0]) as xml_file:
                jobs = ET.parse(xml_file).getroot()
        for job in jobs:
            # NOTE(review): child 7 of a job element presumably holds the
            # posting text - confirm against the XML schema. ``.text`` is
            # None for an empty element.
            job_texts.append(job[7].text)

### Research

In [None]:
# Load one wos CSV per year, 2006 through 2016, each into its own DataFrame.
(
    wos_2006, wos_2007, wos_2008, wos_2009, wos_2010, wos_2011,
    wos_2012, wos_2013, wos_2014, wos_2015, wos_2016,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data_files_USA/data_2006_US.csv",
        "data_files_USA/data_2007_US.csv",
        "data_files_USA/data_2008_US.csv",
        "data_files_USA/data_2009_US.csv",
        "data_files_USA/data_2010_US.csv",
        "data_files_USA/data_2011_US.csv",
        "data_files_USA/data_2012_US.csv",
        "data_files_USA/data_2013_US.csv",
        "data_files_USA/data_2014_US.csv",
        "data_files_USA/data_2015_US.csv",
        "data_files_USA/data_2016_US.csv",
    )
)

In [None]:
# The yearly DataFrames in chronological order, ready to be concatenated.
frames = [wos_2006, wos_2007, wos_2008, wos_2009, wos_2010, wos_2011, wos_2012, wos_2013, wos_2014, wos_2015, wos_2016]

In [None]:
# Abstract of each research record; filled from the ``wos`` DataFrame.
research_texts = []

In [None]:
# Stack the yearly DataFrames into one; each frame keeps its own row index
# (ignore_index is not set).
wos = pd.concat(frames)

In [None]:
# Append every value of the ``abstract`` column, in row order. Reading the
# column directly avoids building a tuple of all columns for each row.
# Missing abstracts stay in the list as NaN.
research_texts.extend(wos["abstract"].tolist())

### Create space

In [None]:
# Clean the job postings with clean_corpus's "default" method.
cleaned_jobs = clean_corpus(job_texts)

In [None]:
# The raw postings are not referenced again below; drop the name (presumably
# to release memory).
del job_texts

In [None]:
# Clean the research abstracts with clean_corpus's "default" method.
cleaned_research = clean_corpus(research_texts)

In [None]:
# The raw abstracts are not referenced again below; drop the name (presumably
# to release memory).
del research_texts

In [None]:
# Clean the syllabus texts with clean_corpus's "default" method.
cleaned_teaching = clean_corpus(syllabus_texts)

In [None]:
# The raw syllabi are not referenced again below; drop the name (presumably
# to release memory).
del syllabus_texts

In [None]:
# Combined corpus: teaching documents first, then research, then jobs.
cleaned_corpus = [*cleaned_teaching, *cleaned_research, *cleaned_jobs]
# the per-source lists are no longer needed once merged
del cleaned_teaching, cleaned_research, cleaned_jobs

In [None]:
# Build a Doc2Vec model on the combined corpus (no model_address is given, so
# create_model trains a new one rather than loading from disk).
model = create_model(cleaned_corpus, method='doc2vec')

In [None]:
# Persist the model under the relative path "alldoc2vec".
model.save("alldoc2vec")