# Discovery and Representation of Open Making Related Terms

This notebook sketches an initial exercise in discovering keywords related to open making. The input text is harvested by a web crawler that identifies and crawls semantically related Wikipedia articles.

In [1]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from numpy import log, mean
import json, csv, re
import pprint as pp

import pandas as pd

## 1. Loading a reference English language corpus

In [2]:
from nltk.corpus import brown
brown.categories()

[u'adventure',
 u'belles_lettres',
 u'editorial',
 u'fiction',
 u'government',
 u'hobbies',
 u'humor',
 u'learned',
 u'lore',
 u'mystery',
 u'news',
 u'religion',
 u'reviews',
 u'romance',
 u'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [3]:
# Standard stop words: the data file is split on newlines, one entry per line.
with open("data/stopwords_standard.txt", "r") as f:
    standard_stopword_lines = f.read().strip().split("\n")
STOP_WORDS_STANDARD = set(standard_stopword_lines)
print(STOP_WORDS_STANDARD)

set(['all', "she'll", 'just', "don't", 'being', 'over', 'through', 'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should', "he'd", 'to', 'only', "there's", 'those', 'under', 'has', "haven't", 'do', 'them', 'his', "they'll", 'get', 'very', "who's", "they'd", 'cannot', 'know', 'they', 'not', 'during', 'yourself', 'him', 'nor', "we'll", 'like', 'did', "they've", 'this', 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'www', 'because', "you'd", 'doing', 'some', 'up', 'are', 'further', 'ourselves', 'out', 'what', 'for', 'while', "wasn't", 'does', "shouldn't", 'above', 'between', 'ought', 'be', 'we', 'who', "you're", 'were', 'here', 'hers', "aren't", 'by', 'both', 'about', 'would', 'of', 'could', "i'd", "weren't", "i'm", 'com', 'or', "can't", 'own', 'into', 'whom', 'down', "hadn't", "couldn't", 'your', "doesn't", 'from', "how's", 'her', 'their', "it's", 'there', 'been', 'why', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', "where's", "

### 2.2 Open-making related stop words

In [4]:
# Project-specific stop words: the data file is split on newlines, one entry per line.
with open("data/stopwords_openmaker.txt", "r") as f:
    openmaker_stopword_lines = f.read().strip().split("\n")
STOP_WORDS_OPENMAKER = set(openmaker_stopword_lines)
print(STOP_WORDS_OPENMAKER)

set(['isbn', 'often', 'vol', 'vi', 'eg', 'one', 'ii', 'second', '1st', '7th', '11th', 'txt', 'ad', 'pp', '6th', '3rd', 'na', '5th', 'wikipedia', 'randd', '14th', 'also', 'html', 'von', '15th', 'first', 'bc', 'may', '4th', 'wikipedias', 'org', 'iv', 'iii', '13th', 'almost', 'doi', 'third', 'many', 'well', 'britannica', '2nd', 'etc', 'encyclopedia', '9th', 'doc', 'pdf', '10th', 'tt', '12th', '8th'])


## 3. Removing stop words from the reference English corpus

In [5]:
# Merge the two stop-word sets into a single set.
STOP_WORDS = STOP_WORDS_STANDARD | STOP_WORDS_OPENMAKER
print(STOP_WORDS)

set(['all', "she'll", 'just', "don't", 'being', 'over', 'through', 'during', 'once', 'its', 'before', "he's", "when's", "we've", 'tt', 'had', 'html', 'randd', 'should', "he'd", 'to', 'only', 'does', "here's", 'under', 'has', "haven't", 'do', 'them', 'his', 'above', 'get', 'very', "who's", "they'd", 'cannot', 'know', 'they', 'not', 'yourselves', 'one', 'him', 'nor', "we'll", 'like', 'did', '12th', "they've", "wasn't", 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'www', 'because', "you'd", 'doing', 'eg', 'theirs', 'some', "hasn't", 'second', 'are', '7th', 'further', '11th', 'ourselves', 'out', 'what', 'for', 'herself', 'bc', 'wikipedia', 'below', '14th', 'may', "there's", "shouldn't", "they'll", 'between', '15th', 'can', 'be', 'we', 'after', "doesn't", 'doc', 'here', 'hers', 'org', "aren't", 'by', 'von', 'both', 'about', 'her', '8th', 'of', 'could', 'britannica', 'etc', "i'd", "weren't", 'pdf', "i'm", 'com', 'or', "can't", 'first', 'own', 'isbn', 'into', 'yoursel

In [6]:
# Build the reference frequency distribution from the Brown corpus.
# Tokens are lower-cased *before* the stop-word test so that capitalised
# stop words (e.g. "The") are dropped in the same single pass; filtering on
# the raw token would let them through and require a second clean-up loop.
english_freq_dist = FreqDist(
    token
    for token in (w.lower() for w in nltk.corpus.brown.words())
    if token not in STOP_WORDS
)

# Remove the characters the project tokenizer splits on (punctuation).
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in english_freq_dist:
        del english_freq_dist[punctuation]

# Total token count of the cleaned reference corpus, followed by a sanity
# check of the most frequent remaining tokens.
# NOTE(review): the recorded output still lists "``", "''" and "--" among the
# top tokens, so those are counted in n_english -- confirm whether they
# should also be removed.
n_english = sum(english_freq_dist.values())
english_freq_dist.most_common(10)

[(u'``', 8837),
 (u"''", 8789),
 (u'--', 3432),
 (u'will', 2245),
 (u'said', 1961),
 (u'new', 1635),
 (u'time', 1598),
 (u'two', 1412),
 (u'now', 1314),
 (u'man', 1207)]

## 4. Loading the input Open Maker corpus

In [7]:
# Read the harvested Wikipedia text (a JSON document) and parse it.
with open("data/wikipedia2.json", "r") as f:
    OM_Corpus_text = f.read()

OM_Corpus = json.loads(OM_Corpus_text)

In [8]:
# The total number of wiki articles used:
print(len(OM_Corpus))

144


In [9]:
# Keys (field names) of the first article record in the corpus.
OM_Corpus[0].keys()

[u'url', u'text', u'depth', u'theme.id', u'title']

In [10]:
def display_articles(tid):
    """Print depth, title and URL of every article in OM_Corpus whose
    'theme.id' equals ``tid``, in corpus order."""
    for entry in OM_Corpus:
        if entry['theme.id'] != tid:
            continue
        print(entry['depth'], entry['title'], entry['url'])

In [11]:
display_articles(1)

(0, u'Power (social and political)', u'https://en.wikipedia.org/wiki/Power_(social_and_political)')
(1, u'Speaking truth to power', u'https://en.wikipedia.org/wiki/Speaking_truth_to_power')
(1, u'State collapse', u'https://en.wikipedia.org/wiki/State_collapse')
(1, u'The Anatomy of Revolution', u'https://en.wikipedia.org/wiki/The_Anatomy_of_Revolution')
(1, u'Authority bias', u'https://en.wikipedia.org/wiki/Authority_bias')
(1, u'Discourse of power', u'https://en.wikipedia.org/wiki/Discourse_of_power')
(1, u'Chronemics', u'https://en.wikipedia.org/wiki/Control_of_time_in_power_relationships')
(1, u'Personal boundaries', u'https://en.wikipedia.org/wiki/Personal_boundaries')
(1, u'Cratology', u'https://en.wikipedia.org/wiki/Cratology')
(1, u'Veto', u'https://en.wikipedia.org/wiki/Veto')
(1, u'Amity-enmity complex', u'https://en.wikipedia.org/wiki/Amity-enmity_complex')
(1, u'Social control', u'https://en.wikipedia.org/wiki/Social_control')


In [12]:
display_articles(2)

(0, u'Social influence', u'https://en.wikipedia.org/wiki/Social_influence')
(0, u'Need for achievement', u'https://en.wikipedia.org/wiki/Need_for_achievement')
(1, u'Goal orientation', u'https://en.wikipedia.org/wiki/Goal_orientation')
(1, u'Need for cognition', u'https://en.wikipedia.org/wiki/Need_for_cognition')
(1, u'Need for power', u'https://en.wikipedia.org/wiki/Need_for_power')
(1, u'Need theory', u'https://en.wikipedia.org/wiki/Need_theory')
(1, u'Need for affiliation', u'https://en.wikipedia.org/wiki/Need_for_affiliation')
(1, u'Bystander effect', u'https://en.wikipedia.org/wiki/Bystander_effect')
(1, u'Social proof', u'https://en.wikipedia.org/wiki/Social_proof')
(1, u'Mind shaping', u'https://en.wikipedia.org/wiki/Mind_shaping')
(1, u'Propaganda', u'https://en.wikipedia.org/wiki/Propaganda')
(1, u'Judge\u2013advisor system', u'https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system')
(1, u'Impression management', u'https://en.wikipedia.org/wiki/Impression_management')


In [13]:
display_articles(3)

(0, u'Hedonism', u'https://en.wikipedia.org/wiki/Hedonism')
(1, u'Utilitarianism', u'https://en.wikipedia.org/wiki/Utilitarianism')
(1, u'Paradox of hedonism', u'https://en.wikipedia.org/wiki/Paradox_of_hedonism')
(1, u'Epicureanism', u'https://en.wikipedia.org/wiki/Epicureanism')
(1, u'Eudaimonia', u'https://en.wikipedia.org/wiki/Eudaimonia')
(1, u'Yangism', u'https://en.wikipedia.org/wiki/Yangism')
(1, u'Pleasure principle (psychology)', u'https://en.wikipedia.org/wiki/Pleasure_principle_(psychology)')
(1, u'Cyrenaics', u'https://en.wikipedia.org/wiki/Cyrenaics')
(1, u'Torbj\xf6rn T\xe4nnsj\xf6', u'https://en.wikipedia.org/wiki/Torbj%C3%B6rn_T%C3%A4nnsj%C3%B6')
(1, u'Hedonism Resorts', u'https://en.wikipedia.org/wiki/Hedonism_Resorts')
(1, u'Psychological egoism', u'https://en.wikipedia.org/wiki/Psychological_hedonism')
(1, u'Michel Onfray', u'https://en.wikipedia.org/wiki/Michel_Onfray')
(1, u'David Pearce (philosopher)', u'https://en.wikipedia.org/wiki/David_Pearce_(philosopher)')


In [14]:
display_articles(4)

(0, u'Stimulation', u'https://en.wikipedia.org/wiki/Stimulation')


In [15]:
display_articles(5)

(0, u'Independence', u'https://en.wikipedia.org/wiki/Independence')
(0, u'Freedom', u'https://en.wikipedia.org/wiki/Freedom')
(0, u'Creativity', u'https://en.wikipedia.org/wiki/Creativity')


In [16]:
display_articles(6)

(0, u'Universalism', u'https://en.wikipedia.org/wiki/Universalism')
(0, u'Social justice', u'https://en.wikipedia.org/wiki/Social_justice')
(0, u'Egalitarianism', u'https://en.wikipedia.org/wiki/Egalitarianism')
(0, u'Environmental protection', u'https://en.wikipedia.org/wiki/Environmental_protection')


In [17]:
display_articles(7)

(0, u'Loyalty', u'https://en.wikipedia.org/wiki/Loyalty')
(0, u'Altruism', u'https://en.wikipedia.org/wiki/Altruism')
(0, u'Responsibility', u'https://en.wikipedia.org/wiki/Responsibility')
(1, u'Alms', u'https://en.wikipedia.org/wiki/Alms')
(1, u'Inclusive fitness', u'https://en.wikipedia.org/wiki/Inclusive_fitness')
(1, u'Tragedy of the commons', u'https://en.wikipedia.org/wiki/Comedy_of_the_commons')
(1, u"Prisoner's dilemma", u'https://en.wikipedia.org/wiki/Prisoner%27s_dilemma')
(1, u'Egotism', u'https://en.wikipedia.org/wiki/Egotism')
(1, u'Kin selection', u'https://en.wikipedia.org/wiki/Kin_selection')
(1, u'Empathy-altruism', u'https://en.wikipedia.org/wiki/Empathy-altruism')
(1, u'Prosocial behavior', u'https://en.wikipedia.org/wiki/Prosocial_behavior')
(1, u'Mutual aid (organization theory)', u'https://en.wikipedia.org/wiki/Mutual_aid_(organization)')
(1, u'Solidarity', u'https://en.wikipedia.org/wiki/Solidarity_(sociology)')
(1, u'Social psychology', u'https://en.wikipedia.o

In [18]:
display_articles(8)

(0, u'Tradition', u'https://en.wikipedia.org/wiki/Tradition')
(0, u'Modesty', u'https://en.wikipedia.org/wiki/Modesty')
(1, u'Perennial philosophy', u'https://en.wikipedia.org/wiki/Perennial_philosophy')
(1, u'Folklore', u'https://en.wikipedia.org/wiki/Folklore')
(1, u'Origin myth', u'https://en.wikipedia.org/wiki/Aition')


In [19]:
display_articles(9)

(0, u'Conformity', u'https://en.wikipedia.org/wiki/Conformity')
(1, u'Cultural assimilation', u'https://en.wikipedia.org/wiki/Cultural_assimilation')
(1, u'Authoritarian personality', u'https://en.wikipedia.org/wiki/Authoritarian_personality')
(1, u'Spiral of silence', u'https://en.wikipedia.org/wiki/Spiral_of_silence')
(1, u'Milieu control', u'https://en.wikipedia.org/wiki/Milieu_control')
(1, u'Countersignaling', u'https://en.wikipedia.org/wiki/Countersignaling')
(0, u'Obedience (human behavior)', u'https://en.wikipedia.org/wiki/Obedience_(human_behavior)')
(0, u'Discipline', u'https://en.wikipedia.org/wiki/Discipline')
(1, u'Social norm', u'https://en.wikipedia.org/wiki/Norm_(social)')
(1, u'Deindividuation', u'https://en.wikipedia.org/wiki/Deindividuation')
(1, u'School discipline', u'https://en.wikipedia.org/wiki/School_discipline')
(1, u'Positive discipline', u'https://en.wikipedia.org/wiki/Positive_discipline')
(1, u'Child discipline', u'https://en.wikipedia.org/wiki/Child_disci

In [20]:
display_articles(10)

(0, u'Security', u'https://en.wikipedia.org/wiki/Security')
(0, u'Social order', u'https://en.wikipedia.org/wiki/Social_order')
(0, u'Cleanliness', u'https://en.wikipedia.org/wiki/Cleanliness')
(1, u'Cleanroom', u'https://en.wikipedia.org/wiki/Clean_room')
(1, u'Green cleaning', u'https://en.wikipedia.org/wiki/Green_cleaning')
(1, u'Cleaning', u'https://en.wikipedia.org/wiki/Cleaning')
(1, u'Waste management', u'https://en.wikipedia.org/wiki/Waste_management')
(1, u'Environmental remediation', u'https://en.wikipedia.org/wiki/Environmental_remediation')
(1, u'Pollution', u'https://en.wikipedia.org/wiki/Pollution')
(1, u'Contamination control', u'https://en.wikipedia.org/wiki/Contamination_control')
(1, u'Lady Macbeth effect', u'https://en.wikipedia.org/wiki/Lady_Macbeth_effect')
(1, u'Antiseptic', u'https://en.wikipedia.org/wiki/Antiseptic')
(1, u'Hygiene', u'https://en.wikipedia.org/wiki/Hygiene')
(1, u'Cleaner', u'https://en.wikipedia.org/wiki/Cleaner')
(1, u'Ritual purification', u'h

## 5. Analyzing and cleaning a specific corpus based on a theme

In [21]:
def get_title(Corpus, theme_id):
    """Return the title of the first article in ``Corpus`` whose 'theme.id'
    equals ``theme_id``; return an empty string when no article matches."""
    matching_titles = (
        entry['title'] for entry in Corpus if entry['theme.id'] == theme_id
    )
    return next(matching_titles, '')

### 5.0 Selecting the specific theme (a sub-corpus).

In [22]:
## For a different sub-corpus use a corresponding theme ID.
current_theme_id = 5

In [23]:
current_title = get_title(OM_Corpus, current_theme_id)

In [24]:
# Output file stem: each space-separated part of the title is capitalized
# and the parts are joined with underscores.
capitalized_parts = (part.capitalize() for part in current_title.split(" "))
output_fname = "_".join(capitalized_parts)
print(current_title, "::", output_fname)

(u'Independence', '::', u'Independence')


In [25]:
# Concatenate the text of every article belonging to the selected theme.
input_text = " ".join(
    page['text'] for page in OM_Corpus if page['theme.id'] == current_theme_id
)

In [26]:
pp.pprint(input_text)

u'Independence \n For other uses see \n Independence disambiguation \n Thirteen \n British \n colonies on the east coast of \n North America \n issued a \n Declaration of Independence \n in \n Chile \n one of several \n Spanish \n colonies in \n South America \n issued a \n Declaration of independence \n in \n Independence \n is a condition of a \n nation \n country \n or \n state \n in which its residents and population or some portion thereof exercise \n self-government \n and usually \n sovereignty \n over the territory The opposite of independence is the status of a \n dependent territory \n Definition of independence \n Distinction between independence and autonomy \n Declarations of independence \n Historical overview \n Continents \n Notes \n Definition of independence \n Whether the attainment of independence is different from \n revolution \n has long been contested and has often been debated over the question of \n violence \n as \n legitimate \n means to achieving sovereignt

In [27]:
# Tokenizing the input text:
# `tokenizer` is the project-local helper imported from utils; its result is
# counted with len() here and fed to FreqDist in the next section.
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words,current_title)

(19061, u'Independence')


### 5.1 Computing the frequency distribution of each token (i.e. word, term, punctuation, etc.)

In [28]:
input_freq_dist = FreqDist(tokenized)

In [29]:
input_freq_dist.most_common(10)

[(u'\n', 2213),
 (u'the', 767),
 (u'of', 723),
 (u'and', 658),
 (u'in', 397),
 (u'a', 382),
 (u'creativity', 377),
 (u'to', 376),
 (u'that', 170),
 (u'as', 165)]

### 5.2 Removing punctuation and stopwords from the input corpus

In [30]:
# Drop stop words first, then the tokenizer's split characters, from the
# input frequency distribution.
for unwanted_tokens in (STOP_WORDS, tokenizer.CHARACTERS_TO_SPLIT):
    for token in unwanted_tokens:
        if token in input_freq_dist:
            del input_freq_dist[token]

# Total remaining token count, then a look at the most common words.
n_input = sum(input_freq_dist.values())
input_freq_dist.most_common(10)

[(u'creativity', 377),
 (u'creative', 163),
 (u'intelligence', 94),
 (u'j', 74),
 (u's', 63),
 (u'freedom', 63),
 (u'e', 56),
 (u'b', 53),
 (u'new', 52),
 (u'c', 50)]

### 5.3 Removing all numbered words

This is an example of post-processing for cleaning purposes. The pre-processing, i.e. data cleaning/preparation during or right after harvesting, should be further improved to avoid such steps at this stage.

In [31]:
# Matches any single lowercase ASCII letter.
pattern_letters = re.compile('[a-z]')


def has_letters(x):
    """Return True when ``x`` contains at least one lowercase ASCII letter."""
    return bool(pattern_letters.search(x))

In [32]:
# Keep only tokens for which has_letters() is true. From here on
# input_freq_dist is rebound to a plain dict.
reduced = dict(
    (token, count) for token, count in input_freq_dist.items() if has_letters(token)
)
print("Reduction due to all number matches: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

('Reduction due to all number matches: ', 107)


### 5.4 Removing single character words


In [33]:
# Keep only tokens longer than one character.
reduced = dict(
    (token, count) for token, count in input_freq_dist.items() if len(token) > 1
)
print("Reduction due to single characters: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

('Reduction due to single characters: ', 22)


### 5.5 Removing rare words from input distribution

In [34]:
# Keep only tokens that occur more than 5 times.
reduced = dict(
    (token, count) for token, count in input_freq_dist.items() if count > 5
)
print("Reduction due to rare occurances: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

('Reduction due to rare occurances: ', 3127)


## 6. Comparing input vs English corpus volumes

### 6.1 Total words (after cleaning the stopwords) 

In [35]:
print(n_input, n_english)

(10600, 544168)


### 6.2 Number of unique words (after cleaning stopwords and rare words)

In [36]:
# Number of distinct tokens in each distribution.
n_unique_word_input = len(input_freq_dist)
n_unique_word_brown = len(english_freq_dist)
n_unique_word_input, n_unique_word_brown

(350, 49598)

### 6.3 Cleaned set of input words/terms

List of the words in the corpus under consideration, for visual inspection. Such inspections will be used to improve both tokenization and filtering.

In [37]:
pp.pprint(sorted(input_freq_dist.items(), key=lambda x:x[1], reverse=True))

[(u'creativity', 377),
 (u'creative', 163),
 (u'intelligence', 94),
 (u'freedom', 63),
 (u'new', 52),
 (u'theory', 49),
 (u'problem', 49),
 (u'thinking', 44),
 (u'people', 40),
 (u'independence', 39),
 (u'process', 37),
 (u'psychology', 37),
 (u'research', 36),
 (u'individual', 36),
 (u'ideas', 36),
 (u'sternberg', 34),
 (u'model', 33),
 (u'press', 33),
 (u'work', 32),
 (u'knowledge', 31),
 (u'cognitive', 31),
 (u'processes', 27),
 (u'study', 26),
 (u'cambridge', 26),
 (u'solving', 26),
 (u'university', 26),
 (u'tests', 26),
 (u'creativity"', 25),
 (u'different', 25),
 (u'divergent', 23),
 (u'personality', 23),
 (u'approach', 23),
 (u'journal', 23),
 (u'levels', 22),
 (u'human', 22),
 (u'handbook', 22),
 (u'kaufman', 22),
 (u'others', 21),
 (u'theories', 21),
 (u'motivation', 21),
 (u'team', 20),
 (u'liberty', 20),
 (u'social', 20),
 (u'number', 20),
 (u'self', 20),
 (u'innovation', 20),
 (u'concept', 19),
 (u'working', 19),
 (u'science', 19),
 (u'high', 19),
 (u'thought', 19),
 (u'gen

### 6.4 Set of terms/words that occur in both corpora.

In [38]:
len(input_freq_dist.keys())

350

In [39]:
# Words present in both the input distribution and the reference distribution.
common_words = list(set(input_freq_dist.keys()) & set(english_freq_dist.keys()))
print(len(common_words))

313


In [40]:
pp.pprint(sorted(common_words))

[u'ability',
 u'able',
 u'according',
 u'achieve',
 u'across',
 u'actions',
 u'activities',
 u'advertising',
 u'affect',
 u'age',
 u'aggression',
 u'al',
 u'among',
 u'analysis',
 u'ancient',
 u'another',
 u'applied',
 u'approach',
 u'approaches',
 u'areas',
 u'argued',
 u'art',
 u'artists',
 u'arts',
 u'asked',
 u'aspects',
 u'associated',
 u'association',
 u'attention',
 u'autonomy',
 u'average',
 u'become',
 u'behavior',
 u'blending',
 u'brain',
 u'business',
 u'cambridge',
 u'century',
 u'cerebellum',
 u'children',
 u'cognitive',
 u'compared',
 u'component',
 u'computational',
 u'computer',
 u'concept',
 u'conceptual',
 u'conditions',
 u'considered',
 u'control',
 u'correlation',
 u'correlations',
 u'countries',
 u'create',
 u'creation',
 u'creative',
 u'creativity',
 u'culture',
 u'curiosity',
 u'data',
 u'day',
 u'declaration',
 u'definition',
 u'degrees',
 u'developed',
 u'development',
 u'difference',
 u'differences',
 u'different',
 u'discovery',
 u'disorder',
 u'distinct',
 u

### 6.5 Set of terms/words that occur in the sample but not in the reference corpus.

This specific set will be incorporated later below. 

In [41]:
# Words (with their counts) that occur in the input but not in the reference corpus.
reference_vocabulary = set(english_freq_dist.keys())
input_specifics = dict(
    (token, count)
    for token, count in input_freq_dist.items()
    if token not in reference_vocabulary
)

In [42]:
print(len(input_specifics))

37


In [43]:
pp.pprint(sorted(input_specifics))

[u'"a',
 u'"creativity',
 u'"the',
 u'1016j',
 u'amabile',
 u'archived',
 u'autonomous',
 u'batey',
 u'bipolar',
 u'c"',
 u'cognition',
 u'constructs',
 u'convergent',
 u'creativity"',
 u'csikszentmihalyi',
 u'eds',
 u'extrinsic',
 u'furnham',
 u'honing',
 u'jc',
 u'jrgen',
 u'kaufman',
 u'mc',
 u'meta',
 u'pmid',
 u'psychometric',
 u'rem',
 u'rj',
 u'runco',
 u'schmidhuber',
 u'software',
 u'sternberg',
 u'subset',
 u'torrance',
 u'vandervert',
 u'wallas',
 u'worldview']


## 7. Stemming

In [44]:
# Map every word to its Porter stem: once for all input words and once for
# the words shared with the reference corpus.
stemmer = PorterStemmer()
input_wset_stems = dict((token, stemmer.stem(token)) for token in input_freq_dist.keys())
common_wset_stems = dict((token, stemmer.stem(token)) for token in common_words)
pp.pprint(common_wset_stems)

{u'ability': u'abil',
 u'able': u'abl',
 u'according': u'accord',
 u'achieve': u'achiev',
 u'across': u'across',
 u'actions': u'action',
 u'activities': u'activ',
 u'advertising': u'advertis',
 u'affect': u'affect',
 u'age': u'age',
 u'aggression': u'aggress',
 u'al': u'al',
 u'among': u'among',
 u'analysis': u'analysi',
 u'ancient': u'ancient',
 u'another': u'anoth',
 u'applied': u'appli',
 u'approach': u'approach',
 u'approaches': u'approach',
 u'areas': u'area',
 u'argued': u'argu',
 u'art': u'art',
 u'artists': u'artist',
 u'arts': u'art',
 u'asked': u'ask',
 u'aspects': u'aspect',
 u'associated': u'associ',
 u'association': u'associ',
 u'attention': u'attent',
 u'autonomy': u'autonomi',
 u'average': u'averag',
 u'become': u'becom',
 u'behavior': u'behavior',
 u'blending': u'blend',
 u'brain': u'brain',
 u'business': u'busi',
 u'cambridge': u'cambridg',
 u'century': u'centuri',
 u'cerebellum': u'cerebellum',
 u'children': u'children',
 u'cognitive': u'cognit',
 u'compared': u'compa

## 8. Handling input specific term set

### 8.1 Identifying matching stems with common words. 

Note that the frequency counts are transferred accordingly.

In [45]:
# Reverse lookup built once: stem -> the first common word (in dict
# iteration order) that carries that stem.
stem_to_common_word = {}
for common_word, common_stem in common_wset_stems.items():
    stem_to_common_word.setdefault(common_stem, common_word)

specifics = {}
for k, v in input_specifics.items():
    matched_word = stem_to_common_word.get(input_wset_stems[k])
    if matched_word is None:
        # No common word shares this stem: the term stays input-specific.
        specifics[k] = v
    else:
        # Transfer the count of the specific term to the matching common word.
        input_freq_dist[matched_word] += v

# Removing the words with matching stems from the specific set.
print("Reduction due to stemm matches: ", len(input_specifics) - len(specifics))
input_specifics = specifics

('Reduction due to stemm matches: ', 2)


### 8.2 Removing open-maker specific terms.

In [46]:
# Curated open-maker terms (the data file is split on newlines), indexed by
# their Porter stem.
with open("data/specifics_openmaker.txt", "r") as f:
    specific_term_lines = f.read().strip().split("\n")
SPECIFICS_OPENMAKER = set(specific_term_lines)
om_specific_stems = dict((stemmer.stem(term), term) for term in SPECIFICS_OPENMAKER)
pp.pprint(om_specific_stems)

{'3-d': '3-d',
 '3d': '3d',
 u'abat': 'abatement',
 u'afford': 'affordable',
 'agenda21': 'agenda21',
 u'anarch': 'anarchism',
 u'autonom': 'autonomous',
 'biodiesel': 'biodiesel',
 u'biodivers': 'biodiversity',
 'biofuel': 'biofuel',
 u'bioga': 'biogas',
 u'biomass': 'biomass',
 u'biospher': 'biosphere',
 u'bricolag': 'bricolage',
 'brundtland': 'brundtland',
 'c2c': 'c2c',
 'cad': 'cad',
 u'cap-and-trad': 'cap-and-trade',
 u'carfre': 'carfree',
 'cdm': 'cdm',
 'christensen': 'christensen',
 u'co-creat': 'co-creation',
 'co-develop': 'co-develop',
 u'co-invent': 'co-invention',
 'co-inventor': 'co-inventor',
 u'coextinct': 'coextinction',
 u'cognit': 'cognition',
 u'commons-bas': 'commons-based',
 u'computer-aid': 'computer-aided',
 u'conferenc': 'conferencing',
 'consortium': 'consortium',
 u'constraint': 'constraints',
 'construct': 'construct',
 'copyleft': 'copyleft',
 'copyright': 'copyright',
 u'cradle-to-cradl': 'cradle-to-cradle',
 u'crowdsourc': 'crowdsourcing',
 u'crowdwork'

In [47]:
# Split the remaining input-specific terms into:
#   * specific_wset_stems_selected: stem -> summed count, for terms whose
#     stem appears in the curated open-maker list;
#   * specific_wset_dirty: word -> count, for the other terms that occur
#     more than 10 times (candidates for manual review).
specific_wset_dirty = {}
specific_wset_stems_selected = {}
for k, v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in om_specific_stems:
        # Accumulate per *stem*. Testing the word `k` against this
        # stem-keyed dict would make words sharing a stem overwrite each
        # other's counts instead of adding them up.
        specific_wset_stems_selected[stem] = specific_wset_stems_selected.get(stem, 0) + v
        continue
    if v > 10:
        specific_wset_dirty[k] = v

input_specifics = specific_wset_dirty

In [48]:
# The set of stems to be added to the set with makerness counts.

pp.pprint(specific_wset_stems_selected)

{u'autonom': 6,
 u'construct': 11,
 u'extrins': 7,
 u'psychometr': 8,
 u'schmidhub': 12,
 u'softwar': 11,
 u'sternberg': 34}


### 8.3 Remaining frequent input specifics
The manual checking can help to determine what should go into "specifics_openmaker.txt"

In [49]:
print(len(input_specifics))
pp.pprint(sorted(input_specifics))

7
[u'"the',
 u'creativity"',
 u'kaufman',
 u'runco',
 u'subset',
 u'torrance',
 u'vandervert']


In [50]:
# Group the remaining specific terms by stem: stem -> list of (word, count).
specific_wset_stems = {}
for k, v in input_specifics.items():
    specific_wset_stems.setdefault(stemmer.stem(k), []).append((k, v))
pp.pprint(specific_wset_stems)

{u'"the': [(u'"the', 13)],
 u'creativity"': [(u'creativity"', 25)],
 u'kaufman': [(u'kaufman', 22)],
 u'runco': [(u'runco', 15)],
 u'subset': [(u'subset', 11)],
 u'torranc': [(u'torrance', 11)],
 u'vandervert': [(u'vandervert', 18)]}


## 9. Computing representation power of common words.

In [51]:
# Parallel lists collecting, for each common word, the word itself, its
# input frequency and its score.
word, freq, score = [], [], []
# Float totals so that the ratios below use true division.
nEng = 1.0 * n_english
nInp = 1.0 * n_input
for w in common_words:
    # Skip words of a single character.
    if len(w) <= 1:
        continue
    f = input_freq_dist[w]
    print(w, f, english_freq_dist[w])
    # Score: log of (relative frequency in the input) over
    # (relative frequency in the reference corpus).
    s = log((f / nInp) / (english_freq_dist[w] / nEng))
    word.append(w)
    freq.append(f)
    score.append(s)

(u'learning', 15, 60)
(u'concept', 19, 85)
(u'particularly', 8, 146)
(u'help', 7, 311)
(u'proposed', 9, 84)
(u'less', 10, 437)
(u'developed', 10, 170)
(u'insight', 9, 22)
(u'focus', 8, 40)
(u'actions', 6, 68)
(u'four', 10, 360)
(u'brain', 11, 45)
(u'relevant', 10, 23)
(u'sleep', 10, 65)
(u'higher', 8, 160)
(u'human', 22, 299)
(u'using', 8, 145)
(u'argued', 7, 29)
(u'tend', 6, 43)
(u'children', 7, 355)
(u'aggression', 12, 10)
(u'previous', 6, 86)
(u'iq', 17, 1)
(u'group', 17, 390)
(u'knowledge', 31, 145)
(u'field', 8, 274)
(u'thinking', 44, 145)
(u'workers', 10, 86)
(u'solving', 26, 8)
(u'participants', 9, 7)
(u'originality', 8, 6)
(u'age', 7, 227)
(u'production', 6, 148)
(u'degrees', 17, 23)
(u'factor', 7, 71)
(u'teaching', 8, 67)
(u'include', 9, 113)
(u'might', 6, 672)
(u'achieve', 8, 51)
(u'divergent', 23, 6)
(u'activities', 7, 115)
(u'non', 10, 10)
(u'aspects', 6, 64)
(u'views', 8, 51)
(u'lobe', 9, 3)
(u'frontal', 10, 3)
(u'arts', 6, 66)
(u'guilford', 10, 1)
(u'term', 11, 79)
(u'res

In [52]:
# Tabulate the common words with their stem, score and term frequency,
# ordered by descending score with a fresh 0..n-1 index.
word_series = pd.Series(word)
df_common = (
    pd.DataFrame({
        'Word': word_series,
        'Stem': word_series.apply(stemmer.stem),
        'Score': pd.Series(score),
        'Tf': pd.Series(freq),
        'Type': len(word_series) * ['common'],
    })
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
df_common.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,7.673425,creativ,377,common,creativity
1,6.908818,cognit,39,common,cognitive
2,6.771617,iq,17,common,iq
3,6.336299,handbook,22,common,handbook
4,6.240989,guilford,10,common,guilford
5,6.240989,cerebellum,10,common,cerebellum
6,6.07847,malevol,17,common,malevolent
7,6.017846,fluenci,8,common,fluency
8,5.884314,blend,7,common,blending
9,5.730163,comput,6,common,computational


### 9.1 Computing makerness of specific terms

In [53]:
# Highest score among the common words. Selected by column label: the
# positional df_common.iloc[0, 0] only yields a score when "Score" happens
# to be the first column, which depends on how pandas orders the columns of
# a DataFrame built from a dict.
max_score = df_common['Score'].max()
# Only words scoring above this threshold contribute to the reference means.
threshold_score = 1.0
above_threshold = df_common.Score > threshold_score
mean_w = df_common.Score[above_threshold].mean()
mean_f = df_common.Tf[above_threshold].mean()
print(max_score, threshold_score, mean_w, mean_f)

(7.6734246317029537, 1.0, 2.714948873669087, 15.818181818181818)


In [54]:
def compute_speficif_score(f, maxw=10, minw=1, mean_w=3.0, mean_f=25.0):
    """Scale a term frequency into a score and clamp it to [minw, maxw].

    The raw score is ``(f / mean_f) * mean_w``.

    Parameters:
        f: term frequency to score.
        maxw: upper bound of the returned score.
        minw: lower bound of the returned score.
        mean_w: reference score used as the scaling factor.
        mean_f: reference frequency used as the divisor (must be non-zero).

    Returns:
        The raw score, or ``maxw`` / ``minw`` when it falls outside the bounds.
    """
    # Coerce to float so integer arguments do not trigger floor division
    # under Python 2.
    raw_score = (float(f) / float(mean_f)) * mean_w
    return max(min(raw_score, maxw), minw)
compute_speficif_score(mean_w, max_score, threshold_score, mean_w, mean_f)

1.0

In [55]:
# Parallel lists for the selected specific stems: stem, frequency, score.
stem = list(specific_wset_stems_selected.keys())
freq = [specific_wset_stems_selected[selected_stem] for selected_stem in stem]
score = [
    compute_speficif_score(count, max_score, threshold_score, mean_w, mean_f)
    for count in freq
]

## 10. Tabulating the results and generating the output file

In [56]:
# Tabulate the selected specific stems, ordered by descending score with a
# fresh 0..n-1 index.
score_series = pd.Series(score)
df_exclusive = (
    pd.DataFrame({
        'Stem': pd.Series(stem),
        'Score': score_series,
        'Tf': pd.Series(freq),
        'Type': len(score_series) * ['specific'],
    })
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
df_exclusive.head(20)

Unnamed: 0,Score,Stem,Tf,Type
0,5.83558,sternberg,34,specific
1,2.059616,schmidhub,12,specific
2,1.887982,softwar,11,specific
3,1.887982,construct,11,specific
4,1.373078,psychometr,8,specific
5,1.201443,extrins,7,specific
6,1.029808,autonom,6,specific


In [57]:
# Stack the common-word table and the specific-stem table, then order the
# combined table by descending score with a fresh index.
# pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0.
# Rows from df_exclusive have no 'Word' value.
df_makerness = (
    pd.concat([df_common, df_exclusive], ignore_index=True)
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
df_makerness.head(42)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,7.673425,creativ,377,common,creativity
1,6.908818,cognit,39,common,cognitive
2,6.771617,iq,17,common,iq
3,6.336299,handbook,22,common,handbook
4,6.240989,guilford,10,common,guilford
5,6.240989,cerebellum,10,common,cerebellum
6,6.07847,malevol,17,common,malevolent
7,6.017846,fluenci,8,common,fluency
8,5.884314,blend,7,common,blending
9,5.83558,sternberg,34,specific,


In [58]:
df_makerness.tail(42)

Unnamed: 0,Score,Stem,Tf,Type,Word
278,0.327486,among,10,common,among
279,0.308744,sever,10,common,several
280,0.23402,mind,8,common,mind
281,0.224832,import,9,common,important
282,0.164002,three,14,common,three
283,0.162364,use,14,common,used
284,0.161056,less,10,common,less
285,0.144521,help,7,common,help
286,0.122263,open,7,common,open
287,0.117035,word,6,common,word


In [59]:
OUTPUT_FOLDER = "./output/"
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
# Hand the path straight to pandas. Opening the file separately and then
# passing the *path* to to_csv opened the same file twice and never used
# the handle. The encoding is set explicitly so Python 2 and 3 agree.
# NOTE: the output folder must already exist.
df_makerness.to_csv(csvfile_name, encoding="utf-8")

### 10.1 Outputfile name for the theme

In [60]:
print(csvfile_name)

./output/makerness_Independence.csv
