### Metadata for Syllabi

This notebook explores the syllabi data to see how the texts can be cleaned, while also storing metadata to a file. It should be converted to a script before it is run on the full data set.

In [1]:
from json import JSONDecoder, JSONDecodeError
import re
import os
from collections import Counter
from datetime import datetime

In [2]:
# Presumably candidate stop words that are still undecided (TODO confirm);
# nothing in the visible notebook reads this list.
maybe = ['service', 'administration', 'process']

In [3]:
# Syllabus-specific stop words. The notebook flags each of them as a stop word
# in the spaCy vocabulary and also compares token lemmas against this list
# inside clean_text().
#
# Every entry appears exactly once, in order of first appearance; the earlier
# version of this list repeated about two dozen entries (e.g. 'study',
# 'submit', 'learn', 'grade', 'quiz').
#
# NOTE(review): 'semeseter' and 'withdrawl' look like misspellings ('semester'
# is listed as well). They are kept unchanged in case they are meant to match
# misspelled tokens in the source texts -- confirm before removing.
my_stop_words = [
    'papers', 'paper', 'discussion', 'complete', 'require', 'time', 'quiz', 'attendance',
    'study', 'instructor', 'examination', 'course', 'student', 'students', 'syllabus', 'work',
    'class', 'assignment',
    'requiring', 'requirement', 'credit', 'include', 'grade', 'week', 'professor', 'exam',
    'final', 'credits', 'essay',
    'assessment', '•', 'prerequisite', 'book', 'textbook', 'withdraw', 'homework', 'hyperlink',
    'mailto:', 'exercise', '%', 'drop', 'withdrawl', 'pass', 'fail', 'classroom', 'campus',
    'catalog',
    'unit', 'chapter', 'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
    'saturday', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december',
    'project', 'participation', 'semeseter', 'trimester', 'quarter', 'academic',
    'point', 'provide', 'activity', 'learn', 'date', 'office', 'hour', 'page', 'lecture',
    'review', 'lesson', 'disability', 'contact', 'meeting', 'meet', 'deadline', 'submit',
    'integrity', 'dishonesty',
    'plagiarism', 'able', 'prepare', 'response', 'department',
    'cheat', 'report', 'faculty', 'university', 'semester', 'presentation', 'summer',
    'practicum',
    'instruction', 'generally', 'site', 'description', 'detail', 'unpaid', 'involve', 'college',
    'group', 'problem', 'support', 'section', 'schedule', 'available', 'term', 'late',
    'email', 'system', 'and/or', 'absence', 'introduction', 'write', 'read', 'test',
    'topic', 'need', 'online', 'skill', 'note', 'resource', 'expect', 'post', 'question',
    'material', 'follow',
    'help', 'apply', 'receive', 'base', 'practice', 'school', 'issue', 'center',
    'fall', 'assign', 'general', 'outcome', 'find', 'basic', 'total', 'evaluation', 'give',
    'text',
    'service', 'individual', 'reading', 'access', 'experience', 'relate', 'discuss', 'major',
    'result', 'level', 'form', 'appropriate', 'present', 'accommodation',
    'well', 'think', 'list', 'phone', 'understand',
    'miss', 'right', 'degree', 'begin', 'teach', 'conduct', 'take', 'additional',
]

In [4]:
NOT_WHITESPACE = re.compile(r'[^\s]')


def decode_stacked(document, pos=0, decoder=JSONDecoder()):
    """Yield every JSON value found one after another in ``document``.

    Scanning starts at index ``pos``. Whitespace between values is skipped,
    and the generator finishes once only whitespace (or nothing) remains.

    A ``JSONDecodeError`` raised by ``decoder.raw_decode`` is not handled
    here and propagates to whoever is iterating.
    """
    next_token = NOT_WHITESPACE.search(document, pos)
    while next_token:
        # raw_decode returns the parsed value and the index just past it.
        obj, pos = decoder.raw_decode(document, next_token.start())
        yield obj
        next_token = NOT_WHITESPACE.search(document, pos)

In [5]:
# Accumulators for per-syllabus values. `texts` is the list the cleaning cell
# further down iterates over; the remaining lists are not filled by any code
# visible in this notebook.
texts, deformatted_texts = [], []
states, cities, names, fields = [], [], [], []

In [6]:
# Counts the directory entries seen by the file-loading loop below, which uses
# it to cap how many entries are read during exploration.
i = 0

In [7]:
# Nested tally filled by the file-loading loop below:
# {(NAME, year): {field_name: number of matching records}}.
meta_data = {}

In [8]:
# Tally US syllabi per (NAME, year) and field_name from a small sample of the
# raw JSON files, accumulating into the module-level `meta_data` dict.
DATA_DIR = "../openSyReal/"
MAX_FILES = 3  # exploration cap on directory entries visited (JSON or not)

# os.listdir() returns entries in arbitrary order; sorting keeps the sample the
# same from run to run. enumerate() restarts the count on every execution, so
# re-running this cell no longer depends on `i` having been reset elsewhere.
for i, filename in enumerate(sorted(os.listdir(DATA_DIR)), start=1):
    if i > MAX_FILES:
        break
    if not filename.endswith(".json"):
        continue

    with open(DATA_DIR + filename, encoding='utf-8') as f:
        for line in f:
            # One line may hold several JSON documents back to back.
            for jsonfile in decode_stacked(line):
                # Skip values that are not JSON objects, records without a
                # country code, and records from outside the US.
                if not isinstance(jsonfile, dict) or jsonfile.get('grid_country_code') != 'US':
                    continue

                # NOTE(review): the syllabus text is not collected here, so the
                # `texts` list used by the cleaning cells stays empty unless it
                # is filled somewhere else.

                # Missing keys become None instead of raising KeyError.
                name = jsonfile.get('NAME')
                city = jsonfile.get('CITY')
                # A missing STABBR used to reset `city` and leave `state` unset.
                state = jsonfile.get('STABBR')
                field = jsonfile.get('field_name')
                year = jsonfile.get('year')
                # `city` and `state` are read but not stored in meta_data yet.

                if name is None:
                    continue

                try:
                    field_counts = meta_data.setdefault((name, year), {})
                    field_counts[field] = field_counts.get(field, 0) + 1
                except TypeError:
                    # Unhashable NAME/year/field_name value: skip the record,
                    # as the earlier catch-all handler did, but let any other
                    # error surface.
                    continue

In [9]:
# Show the tally built by the loading loop: {(NAME, year): {field_name: count}}.
meta_data

{('Angelo State University',
  2013): {'Business, Management, and Related Support Services.': 1, 'Mathematics/Applied Mathematics': 1, 'Accounting and Related Services.': 1, 'Rhetoric and Composition/Writing Studies.': 1, 'Psychology, General/Other.': 1, 'Nursing': 1, 'Parks, Recreation, Leisure, and Fitness Studies.': 1},
 ('University of Mississippi', 2013): {'Mathematics/Applied Mathematics': 1},
 ('Tufts University', 2017): {'Computer and Information Sciences.': 2},
 ('Foothill College', 2017): {'Psychology, General/Other.': 1,
  'Basic Computer Skills.': 1,
  'Fine and Studio Arts.': 1},
 ('University of Georgia', 2011): {'Journalism.': 2,
  'Statistics.': 1,
  'Chemistry.': 1,
  'Computer, Network, and Technology Support Services.': 1,
  'Marketing.': 1,
  'Psychology, General/Other.': 1,
  'Nursing': 1,
  'Accounting and Related Services.': 1,
  'Architecture and Related Services.': 1,
  'Education.': 3,
  'Philosophy.': 1,
  'Clinical, Counseling and Applied Psychology.': 1},
 

In [8]:
import spacy

In [9]:
# Load spaCy's English pipeline through the "en" shortcut name.
# NOTE(review): shortcut names such as "en" appear to be resolved only by
# spaCy v2; newer releases expect a full package name (e.g. "en_core_web_sm").
# Confirm against the installed spaCy version.
nlp = spacy.load("en")

In [10]:
# Mark every custom stop word in the pipeline's vocabulary so that tokens
# matching it report is_stop == True.
for term in my_stop_words:
    nlp.vocab[term].is_stop = True

In [11]:
def clean_text(text, max_len=1500000):
    """Return the lemmas of the tokens in ``text`` that survive filtering.

    The text is cut to at most ``max_len`` characters, lower-cased and run
    through the module-level ``nlp`` pipeline with the parser, tagger and NER
    components disabled. As a side effect ``nlp.max_length`` is set to
    ``max_len``.

    A token is dropped when it is a stop word, punctuation or number-like,
    when its text has three characters or fewer, when its text contains one
    of the unwanted fragments listed below, or when its lemma is in
    ``my_stop_words``.
    """
    nlp.max_length = max_len
    # Slicing leaves shorter texts untouched, so no length check is needed.
    doc = nlp(text[:max_len].lower(), disable=["parser", "tagger", "ner"])

    unwanted_fragments = ('\n', '’', ')', '\xa0', '\r', '\t', '\v', "@", "https")

    def keep(token):
        surface = token.text
        if token.is_stop or token.is_punct or token.like_num:
            return False
        if len(surface) <= 3:
            return False
        if any(fragment in surface for fragment in unwanted_fragments):
            return False
        return token.lemma_ not in my_stop_words

    # The lemmatized form of each surviving token is what gets returned.
    return [token.lemma_ for token in doc if keep(token)]

In [12]:
# One list of kept lemmas per entry of `texts`, filled by the next cell.
cleaned_texts = []

In [13]:
# Clean every collected text and append the resulting lemma lists in order.
cleaned_texts.extend(map(clean_text, texts))

In [14]:
# Maps each lemma to the number of times it occurs across all cleaned texts.
word_count = {}

In [15]:
# Count how often each lemma occurs across all cleaned texts.
# Counter (imported at the top of the notebook) replaces the manual
# "insert zero, then increment" pattern. Rebuilding the tally from scratch
# also means that re-running this cell no longer double-counts.
word_count = Counter(word for text in cleaned_texts for word in text)

In [16]:
import operator

# (word, count) pairs ordered by ascending count; the sort is stable, so words
# with equal counts keep the order in which they were first counted.
sorted_words = list(word_count.items())
sorted_words.sort(key=operator.itemgetter(1))

In [17]:
# Flip the list in place so the most frequent words come first.
# Running this cell a second time flips the order back.
sorted_words.reverse()

In [18]:
# Print the first 100 words of the ranking on one line, separated by spaces.
for word, _count in sorted_words[:100]:
    print(word, end=' ')

policy information program research state design plan social change analysis case development community computer content science demonstrate communication management code responsibility technology business source identify concept standard health method procedure professional knowledge high describe objective application theory area datum word cover member session completion spring idea personal specific history behavior order library performance graduate allow type human document attend opportunity numb critical subject address possible language current develop american answer environment participate internet principle view module year statement value function request evaluate field consider create build format record tool goal life strategy link prior advance account example world website select 