## Text as Skills Syllabi Validation

This notebook validates that text can be used as a proxy for skills.

In [1]:
from json import JSONDecoder, JSONDecodeError
import re
import os
from collections import Counter
from datetime import datetime

In [2]:
maybe = ["service", "administration", 'process']

In [3]:
my_stop_words = ['papers', 'paper', 'discussion', 'complete', 'require', 'time', 'quiz', 'attendance',
 'study', 'instructor', 'examination', 'course', 'student', 'students', 'syllabus', 'work', 'class', 'assignment',
 'requiring', 'requirement', 'credit', 'include', 'grade', 'week', 'professor', 'exam', 'final', 'credits', 'essay',
 'assessment', '•', 'prerequisite', 'book', 'textbook', 'withdraw', 'homework', 'hyperlink',
 'mailto:', 'exercise', '%', 'drop', 'withdrawl', 'pass', 'fail', 'classroom','campus','catalog',
 'unit', 'chapter', 'sunday', 'monday', 'tuesday', 'wednesday','thursday','friday','saturday','january',
 'february', 'march', 'april', 'may', 'june','july', 'august','september','october','november','december',
 'project','participation','semeseter', 'trimester','quarter','academic',
 'point', 'provide','activity', 'learn','date','office','hour','page', 'lecture',
 'review','lesson','disability','contact','meeting','meet','deadline','submit','integrity','dishonesty',
    'plagiarism', 'able', 'prepare', 'response', 'study', 'department',
 'cheat','report','submit','faculty','university','semester','presentation', 'summer','practicum',
 'instruction', 'generally', 'site', 'description', 'detail', 'unpaid', 'involve', 'college',
 'group', 'problem', 'support', 'section', 'schedule', 'available', 'term', 'late',
 'email', 'system', 'and/or','absence', 'introduction', 'write', 'read', 'test', 
 'topic', 'need', 'online','skill','note','resource', 'expect', 'post',  'question', 'material', 'follow',
 'help', 'apply', 'receive', 'base','practice','school','issue', 'center',
 'fall', 'assign', 'general', 'outcome', 'find', 'basic', 'total', 'evaluation', 'give', 'text',
 'learn', 'grade', 'require', 'point', 'hour', 'include', 'follow',
  'question', 'service', 'provide', 'course', 'requirement', 'exam', 'individual', 'expect', 
 'reading', 'give', 'access', 'experience', 'relate', 'discuss', 'activity', 'major',
    'result', 'level', 'form', 'appropriate', 'complete', 'assign', 'class','present', 'accommodation',
 'issue', 'well', 'think', 'list', 'phone', 'understand',  
    'quiz', 'miss', 'problem', 'right', 'degree', 'begin', 'teach', 'conduct', 'take', 'additional' 
    ]

In [4]:
NOT_WHITESPACE = re.compile(r'[^\s]')
def decode_stacked(document, pos=0, decoder=JSONDecoder()):
    while True:
        match = NOT_WHITESPACE.search(document, pos)
        if not match:
            return
        pos = match.start()

        try:
            obj, pos = decoder.raw_decode(document, pos)
        except JSONDecodeError:
            # do something sensible if there's some error
            raise
        yield obj

In [5]:
i = 0

In [6]:
field_texts  = {}

In [7]:
import spacy

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [11]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

In [12]:
def clean_text(text, max_len=1500000):
    nlp.max_length = max_len
    if len(text) > max_len:
        text = text[0:max_len]
    doc = nlp(text.lower(), disable=["parser", "tagger", "ner"])
    clean_text = []
    for w in doc:
        if (not '\n' in w.text and not w.is_stop and not w.is_punct and not w.like_num and not '’' in w.text 
            and not ')' in w.text and not '\xa0' in w.text and not '\r' in w.text and not '\t' in w.text 
            and not '\v' in w.text and len(w.text) > 3 and "@" not in w.text and "https" not in w.text
            and w.lemma_ not in my_stop_words):
                # we add the lematized version of the word
                clean_text.append(w.lemma_)
    return clean_text

In [13]:
for filename in os.listdir("../openSyReal_part/"):
    i += 1
    if i > 3:
        break
    if filename.endswith(".json"): #Added try before countrycode and encoding argument
        with open("../openSyReal_part/" + filename, encoding = 'utf-8') as f:
            for line in f:
                for jsonfile in decode_stacked(line):
                    try:
                        if jsonfile['grid_country_code'] == 'US':
                            try:
                                field = jsonfile['field_name']
                                if field not in field_texts:
                                    field_texts[field] = []
                                if field in field_texts:
                                    field_texts[field].append(clean_text(jsonfile['text']))
                            except KeyError:
                                pass
                    except:
                        pass

In [14]:
field_texts.keys()

dict_keys(['Chemistry.', 'Speech Communication and Rhetoric.', 'Dentistry', 'Marketing.', 'Economics.', 'Business, Management, and Related Support Services.', 'Mathematics/Applied Mathematics', 'Spanish Language and Literature.', 'Engineering.', 'Health and Medical Administrative Services.', 'Political Science, Government, International Relations and National Security Studies.', 'Biological and Biomedical Sciences, General/Other.', 'Rhetoric and Composition/Writing Studies.', 'English Language and Literature/Letters', 'Mechanic and Repair Technologies/Technicians.', 'Psychology, General/Other.', 'Communications and Media Studies, General/Other.', 'Computer, Network, and Technology Support Services.', 'Film/Video and Photographic Arts.', 'Journalism.', 'History.', 'Parks, Recreation, Leisure, and Fitness Studies.', 'Chinese Language and Literature.', 'Allied Health Diagnostic, Intervention, and Treatment Professions.', 'Engineering Technologies and Engineering-related Fields.', 'Basic C

In [19]:
len(field_texts)

89

In [15]:
field_texts['Chemistry.']

[['tentative',
  'chem',
  'interactive',
  'notes',
  'files',
  '.doc',
  '.pdf',
  'format',
  'print',
  'notes',
  'home',
  'library',
  'cyber',
  'café',
  'small',
  'charge',
  'computer',
  'labs',
  'download',
  'handouts',
  'ahead',
  'scheduled',
  'covered',
  'notes',
  'covered',
  'sure',
  'check',
  'updates',
  'interactive',
  'notes',
  'prior',
  'coming',
  'icons',
  'icons',
  'embedded',
  'notes',
  'symbols',
  'alert',
  'following',
  'represents',
  'fact',
  'piece',
  'information',
  'definitions',
  'element',
  'compound',
  'represents',
  'useful',
  'trick',
  'likely',
  'useful',
  'convert',
  'grams',
  'moles',
  'substance',
  'alerts',
  'important',
  'relationship',
  'micro',
  'macro',
  'scale',
  'properties',
  'phenomena',
  'respect',
  'provides',
  'link',
  'interesting',
  'briefly',
  'discussed',
  'supplemental',
  'scope',
  'chemistry102',
  'notes',
  'chem',
  'packets',
  'listed',
  'plus',
  'exams',
  'periodic',

In [17]:
all_texts = []

In [20]:
for field in field_texts:
    for text in field_texts[field]:
        all_texts.append(text)

In [21]:
len(all_texts)

8045

In [23]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import TfidfModel
import gensim
from gensim.corpora import Dictionary
import numpy as np

In [24]:
def create_model(corpus):
    model = Doc2Vec(vector_size=100, min_count=5, epochs=40)
    i = 0
    train_corpus = []
    for doc in corpus:
        # process doc more?
        train_corpus.append(TaggedDocument(doc, [i]))
        i += 1
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [25]:
model = create_model(all_texts)

In [26]:
import random

In [40]:
within_differences = []

In [41]:
between_differences = []

In [30]:
from scipy.spatial import distance

In [42]:
while(len(between_differences) < 1000):
    field_1, field_2 = random.sample(list(field_texts.keys()), 2)
    if len(field_texts[field_1]) < 2 or len(field_texts[field_2]) < 2:
        continue
    text_1, text_2 = random.sample(field_texts[field_1], 2)
    text_3 = random.sample(field_texts[field_2], 1)[0]
    vec_1 = model.infer_vector(text_1)
    vec_2 = model.infer_vector(text_2)
    vec_3 = model.infer_vector(text_3)
    within_difference = distance.cosine(vec_1, vec_2)
    between_difference = distance.cosine(vec_2, vec_3)
    within_differences.append(within_difference)
    between_differences.append(between_difference)

In [43]:
def statistics(differences):
    sorted_differences = sorted(differences)
    mean_differences = np.mean(sorted_differences)
    variance_differences = np.var(differences)
    Bs = []
    
    for i in range(0, len(differences)):
        B = np.sqrt(len(differences))
        Bs.append(B)
    upper_interval = mean_differences - (((Bs[len(differences) -1]) * sorted_differences[len(differences)-1]) / len(differences))
    lower_interval = mean_differences - (((Bs[1]) * sorted_differences[1]) / len(differences))
    return [(lower_interval, upper_interval), (sorted_differences[0], sorted_differences[-1]), variance_differences]

In [44]:
statistics(between_differences)

[(0.8268901950516041, 0.8049127740839689),
 (0.3904539942741394, 1.1475856453180313),
 0.014105193191905199]

In [45]:
statistics(within_differences)

[(0.6477223068687135, 0.6163529324926518),
 (0.002533257007598877, 1.0305310152471066),
 0.022848953755698218]

In [46]:
from scipy.stats import ks_2samp

In [47]:
ks_2samp(within_differences, between_differences)

KstestResult(statistic=0.539, pvalue=1.774876169162964e-133)