# Discovery and Representation of Open Making Related Terms

This notebook sketches the initial exercise on discovering the open making related keywords. The input text is harvested via a Web crawler that identifies and crawls semantically related wikipedia articles.   

In [1]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from numpy import log, mean
import json, csv, re
import pprint as pp

import pandas as pd

## 1. Loading a reference English language corpus

In [2]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [3]:
with open("data/stopwords_standard.txt", "r") as f:
    STOP_WORDS_STANDARD = set(f.read().strip().split("\n"))
print(STOP_WORDS_STANDARD)

{'yourself', 'have', 'by', 'further', 'their', "couldn't", "i've", 'ought', 'been', 'was', 'too', 'they', "hasn't", "he'll", 'can', 'after', 'i', 'out', "we'd", 'just', "don't", "won't", 'get', "how's", "mustn't", 'more', 'does', 'it', 'same', 'between', "we'll", "you've", 'has', "we've", "what's", 'am', 'he', 'under', "i'll", 'what', 'the', 'had', 'do', "he'd", 'that', 'my', 'a', 'who', 'theirs', "you'd", 'because', 'most', "where's", 'his', 'him', 'only', "here's", 'until', "wouldn't", 'she', 'ours ', 'be', 'are', 'off', 'when', 'whom', 'any', 'them', 'all', 'those', 'about', 'ourselves', 'then', "aren't", "haven't", "she'll", 'could', "let's", "they've", "why's", 'up', 'her', "weren't", 'to', 'having', 'being', 'hers', "who's", 'http', "there's", 'myself', "that's", 'herself', "doesn't", 'your', 'with', 'for', "shan't", 'why', 'down', 'during', "hadn't", 'r', "didn't", 'other', 'our', 'its', 'nor', 'how', 'once', 'some', 'yourselves', 'than', "shouldn't", 'me', 'here', 'itself', 'do

### 2.2 Open-making related stop words

In [4]:
with open("data/stopwords_openmaker.txt", "r") as f:
    STOP_WORDS_OPENMAKER = set(f.read().strip().split("\n"))
print(STOP_WORDS_OPENMAKER)

{'vi', 'html', 'iv', '11th', '10th', '9th', 'britannica', '13th', '15th', 'encyclopedia', 'etc', 'ad', 'third', '3rd', 'tt', 'iii', '4th', 'wikipedia', 'pp', 'pdf', 'ii', 'often', 'org', '8th', 'von', 'many', 'wikipedias', 'well', 'one', 'almost', '7th', 'bc', 'txt', 'second', 'eg', 'may', '2nd', '12th', 'doi', '1st', 'doc', 'also', '6th', 'first', '14th', 'isbn', '5th', 'vol', 'randd', 'na'}


## 3. Removing stop words from the reference English corpus

In [5]:
# merging the two list together
STOP_WORDS = STOP_WORDS_STANDARD.union(STOP_WORDS_OPENMAKER)
print(STOP_WORDS)

{'yourself', 'have', 'by', 'iv', 'further', 'their', "couldn't", "i've", 'etc', 'ought', 'third', 'been', 'was', 'too', '3rd', 'they', "hasn't", "he'll", 'can', 'after', 'i', 'out', "we'd", 'just', 'wikipedia', 'pdf', "don't", "won't", 'get', "how's", "mustn't", 'more', 'does', 'it', 'same', 'between', "we'll", "you've", 'has', "we've", "what's", 'am', 'he', 'under', "i'll", 'also', '14th', '5th', 'what', 'the', 'had', 'do', 'randd', 'vi', "he'd", 'html', 'that', '9th', 'my', 'britannica', '13th', 'a', 'who', '15th', 'theirs', "you'd", 'because', 'most', "where's", '4th', 'his', 'him', 'only', "here's", 'until', "wouldn't", 'she', 'org', 'von', 'ours ', 'be', 'are', 'off', 'when', 'one', 'whom', 'any', 'them', 'all', 'those', 'about', 'ourselves', 'bc', 'then', "aren't", 'txt', "haven't", "she'll", 'could', "let's", "they've", 'second', "why's", 'up', 'her', "weren't", 'to', 'may', 'having', '12th', 'being', 'hers', '6th', 'isbn', "who's", 'http', "there's", 'na', 'myself', "that's", '

In [6]:
# load english words from the Brown corpus removing stop words.
english_freq_dist = FreqDist([w.lower() for w in nltk.corpus.brown.words()
                              if w not in STOP_WORDS])
for stopword in STOP_WORDS:
    if stopword in english_freq_dist:
        del english_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in english_freq_dist:
        del english_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_english = sum(english_freq_dist.values())
english_freq_dist.most_common(10)

[('``', 8837),
 ("''", 8789),
 ('--', 3432),
 ('will', 2245),
 ('said', 1961),
 ('new', 1635),
 ('time', 1598),
 ('two', 1412),
 ('now', 1314),
 ('man', 1207)]

## 4. Loading the input Open Maker corpus

In [7]:
# load the harvested text from wikipedia.
with open("data/wikipedia2.json", "r") as f: OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [8]:
# The total number of wiki articles used:
print(len(OM_Corpus))

144


In [9]:
# Column names of the the corpus.
OM_Corpus[0].keys()

dict_keys(['theme.id', 'title', 'url', 'depth', 'text'])

In [10]:
def display_articles(tid):
    articles = [article for article in OM_Corpus if article['theme.id'] == tid]
    for article in articles:
        print(article['depth'],article['title'], article['url'])

In [11]:
display_articles(1)

0 Power (social and political) https://en.wikipedia.org/wiki/Power_(social_and_political)
1 Speaking truth to power https://en.wikipedia.org/wiki/Speaking_truth_to_power
1 State collapse https://en.wikipedia.org/wiki/State_collapse
1 The Anatomy of Revolution https://en.wikipedia.org/wiki/The_Anatomy_of_Revolution
1 Authority bias https://en.wikipedia.org/wiki/Authority_bias
1 Discourse of power https://en.wikipedia.org/wiki/Discourse_of_power
1 Chronemics https://en.wikipedia.org/wiki/Control_of_time_in_power_relationships
1 Personal boundaries https://en.wikipedia.org/wiki/Personal_boundaries
1 Cratology https://en.wikipedia.org/wiki/Cratology
1 Veto https://en.wikipedia.org/wiki/Veto
1 Amity-enmity complex https://en.wikipedia.org/wiki/Amity-enmity_complex
1 Social control https://en.wikipedia.org/wiki/Social_control


In [12]:
display_articles(2)

0 Social influence https://en.wikipedia.org/wiki/Social_influence
0 Need for achievement https://en.wikipedia.org/wiki/Need_for_achievement
1 Goal orientation https://en.wikipedia.org/wiki/Goal_orientation
1 Need for cognition https://en.wikipedia.org/wiki/Need_for_cognition
1 Need for power https://en.wikipedia.org/wiki/Need_for_power
1 Need theory https://en.wikipedia.org/wiki/Need_theory
1 Need for affiliation https://en.wikipedia.org/wiki/Need_for_affiliation
1 Bystander effect https://en.wikipedia.org/wiki/Bystander_effect
1 Social proof https://en.wikipedia.org/wiki/Social_proof
1 Mind shaping https://en.wikipedia.org/wiki/Mind_shaping
1 Propaganda https://en.wikipedia.org/wiki/Propaganda
1 Judge–advisor system https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system
1 Impression management https://en.wikipedia.org/wiki/Impression_management


In [13]:
display_articles(3)

0 Hedonism https://en.wikipedia.org/wiki/Hedonism
1 Utilitarianism https://en.wikipedia.org/wiki/Utilitarianism
1 Paradox of hedonism https://en.wikipedia.org/wiki/Paradox_of_hedonism
1 Epicureanism https://en.wikipedia.org/wiki/Epicureanism
1 Eudaimonia https://en.wikipedia.org/wiki/Eudaimonia
1 Yangism https://en.wikipedia.org/wiki/Yangism
1 Pleasure principle (psychology) https://en.wikipedia.org/wiki/Pleasure_principle_(psychology)
1 Cyrenaics https://en.wikipedia.org/wiki/Cyrenaics
1 Torbjörn Tännsjö https://en.wikipedia.org/wiki/Torbj%C3%B6rn_T%C3%A4nnsj%C3%B6
1 Hedonism Resorts https://en.wikipedia.org/wiki/Hedonism_Resorts
1 Psychological egoism https://en.wikipedia.org/wiki/Psychological_hedonism
1 Michel Onfray https://en.wikipedia.org/wiki/Michel_Onfray
1 David Pearce (philosopher) https://en.wikipedia.org/wiki/David_Pearce_(philosopher)
1 Affectionism https://en.wikipedia.org/wiki/Affectionism
1 Libertine https://en.wikipedia.org/wiki/Libertine
1 Fred Feldman (philosopher) 

In [14]:
display_articles(4)

0 Stimulation https://en.wikipedia.org/wiki/Stimulation


In [15]:
display_articles(5)

0 Independence https://en.wikipedia.org/wiki/Independence
0 Freedom https://en.wikipedia.org/wiki/Freedom
0 Creativity https://en.wikipedia.org/wiki/Creativity


In [16]:
display_articles(6)

0 Universalism https://en.wikipedia.org/wiki/Universalism
0 Social justice https://en.wikipedia.org/wiki/Social_justice
0 Egalitarianism https://en.wikipedia.org/wiki/Egalitarianism
0 Environmental protection https://en.wikipedia.org/wiki/Environmental_protection


In [17]:
display_articles(7)

0 Loyalty https://en.wikipedia.org/wiki/Loyalty
0 Altruism https://en.wikipedia.org/wiki/Altruism
0 Responsibility https://en.wikipedia.org/wiki/Responsibility
1 Alms https://en.wikipedia.org/wiki/Alms
1 Inclusive fitness https://en.wikipedia.org/wiki/Inclusive_fitness
1 Tragedy of the commons https://en.wikipedia.org/wiki/Comedy_of_the_commons
1 Prisoner's dilemma https://en.wikipedia.org/wiki/Prisoner%27s_dilemma
1 Egotism https://en.wikipedia.org/wiki/Egotism
1 Kin selection https://en.wikipedia.org/wiki/Kin_selection
1 Empathy-altruism https://en.wikipedia.org/wiki/Empathy-altruism
1 Prosocial behavior https://en.wikipedia.org/wiki/Prosocial_behavior
1 Mutual aid (organization theory) https://en.wikipedia.org/wiki/Mutual_aid_(organization)
1 Solidarity https://en.wikipedia.org/wiki/Solidarity_(sociology)
1 Social psychology https://en.wikipedia.org/wiki/Social_psychology
1 Selfishness https://en.wikipedia.org/wiki/Selfishness
1 Humanity (virtue) https://en.wikipedia.org/wiki/Humani

In [18]:
display_articles(8)

0 Tradition https://en.wikipedia.org/wiki/Tradition
0 Modesty https://en.wikipedia.org/wiki/Modesty
1 Perennial philosophy https://en.wikipedia.org/wiki/Perennial_philosophy
1 Folklore https://en.wikipedia.org/wiki/Folklore
1 Origin myth https://en.wikipedia.org/wiki/Aition


In [19]:
display_articles(9)

0 Conformity https://en.wikipedia.org/wiki/Conformity
1 Cultural assimilation https://en.wikipedia.org/wiki/Cultural_assimilation
1 Authoritarian personality https://en.wikipedia.org/wiki/Authoritarian_personality
1 Spiral of silence https://en.wikipedia.org/wiki/Spiral_of_silence
1 Milieu control https://en.wikipedia.org/wiki/Milieu_control
1 Countersignaling https://en.wikipedia.org/wiki/Countersignaling
0 Obedience (human behavior) https://en.wikipedia.org/wiki/Obedience_(human_behavior)
0 Discipline https://en.wikipedia.org/wiki/Discipline
1 Social norm https://en.wikipedia.org/wiki/Norm_(social)
1 Deindividuation https://en.wikipedia.org/wiki/Deindividuation
1 School discipline https://en.wikipedia.org/wiki/School_discipline
1 Positive discipline https://en.wikipedia.org/wiki/Positive_discipline
1 Child discipline https://en.wikipedia.org/wiki/Child_discipline
1 Domestic discipline https://en.wikipedia.org/wiki/Domestic_discipline_(disambiguation)
1 Disinhibition https://en.wikipe

In [20]:
display_articles(10)

0 Security https://en.wikipedia.org/wiki/Security
0 Social order https://en.wikipedia.org/wiki/Social_order
0 Cleanliness https://en.wikipedia.org/wiki/Cleanliness
1 Cleanroom https://en.wikipedia.org/wiki/Clean_room
1 Green cleaning https://en.wikipedia.org/wiki/Green_cleaning
1 Cleaning https://en.wikipedia.org/wiki/Cleaning
1 Waste management https://en.wikipedia.org/wiki/Waste_management
1 Environmental remediation https://en.wikipedia.org/wiki/Environmental_remediation
1 Pollution https://en.wikipedia.org/wiki/Pollution
1 Contamination control https://en.wikipedia.org/wiki/Contamination_control
1 Lady Macbeth effect https://en.wikipedia.org/wiki/Lady_Macbeth_effect
1 Antiseptic https://en.wikipedia.org/wiki/Antiseptic
1 Hygiene https://en.wikipedia.org/wiki/Hygiene
1 Cleaner https://en.wikipedia.org/wiki/Cleaner
1 Ritual purification https://en.wikipedia.org/wiki/Ritual_purification
1 Asepsis https://en.wikipedia.org/wiki/Aseptic_technique
1 Marx's theory of history https://en.wik

## 5. Analyzing and cleaning a specific corpus based on a theme

In [21]:
def get_title(Corpus, theme_id):
    title = ''
    for article in Corpus:
        if article['theme.id'] == theme_id:
            title = article['title']
            break
    return title

### 5.0 Selecting the specific theme (a sub-corpus).

In [451]:
## For a different sub-corpus use a corresponding theme ID.
current_theme_id = 5

In [452]:
current_title = get_title(OM_Corpus, current_theme_id)

In [453]:
output_fname = "_".join([word.capitalize() for word in current_title.split(" ")])
print(current_title, "::", output_fname)

Independence :: Independence


In [454]:
input_text = " ".join([page['text'] for page in OM_Corpus if page['theme.id'] == current_theme_id])

In [455]:
pp.pprint(input_text)

('Independence \n'
 ' For other uses see \n'
 ' Independence disambiguation \n'
 ' Thirteen \n'
 ' British \n'
 ' colonies on the east coast of \n'
 ' North America \n'
 ' issued a \n'
 ' Declaration of Independence \n'
 ' in \n'
 ' Chile \n'
 ' one of several \n'
 ' Spanish \n'
 ' colonies in \n'
 ' South America \n'
 ' issued a \n'
 ' Declaration of independence \n'
 ' in \n'
 ' Independence \n'
 ' is a condition of a \n'
 ' nation \n'
 ' country \n'
 ' or \n'
 ' state \n'
 ' in which its residents and population or some portion thereof exercise \n'
 ' self-government \n'
 ' and usually \n'
 ' sovereignty \n'
 ' over the territory The opposite of independence is the status of a \n'
 ' dependent territory \n'
 ' Definition of independence \n'
 ' Distinction between independence and autonomy \n'
 ' Declarations of independence \n'
 ' Historical overview \n'
 ' Continents \n'
 ' Notes \n'
 ' Definition of independence \n'
 ' Whether the attainment of independence is different from \n'
 

 'institutions \n'
 ' Liberty is linked to human subjectivity freedom is not The Declaration of '
 'Independence for example describes men as having liberty and the nation as '
 'being free Free will \n'
 ' the quality of being free from the control of fate or necessity \n'
 ' may first have been attributed to human will but \n'
 ' Newtonian physics \n'
 ' attributes freedom \n'
 ' degrees of freedom \n'
 ' free bodies \n'
 ' to objects \n'
 ' Freedom differs from liberty as control differs from discipline Liberty '
 'like discipline is linked to institutions and political parties whether '
 'liberal or libertarian freedom is not Although freedom can work for or '
 'against institutions it is not bound to themit travels through unofficial '
 'networks To have liberty is to be liberated from something to be free is to '
 'be self-determining autonomous Freedom can or cannot exist within a state of '
 'liberty one can be liberated yet \n'
 ' unfree \n'
 ' or \n'
 ' free \n'
 ' yet enslav

 'characterised by the \n'
 ' psychometric \n'
 ' approach and the evidence that team creativity is founded on diversity and '
 'difference \n'
 ' One characteristic of creative people as measured by some psychologists is '
 'what is called \n'
 ' divergent production \n'
 ' Divergent production \n'
 ' is the ability of a person to generate a diverse assortment yet an '
 'appropriate amount of responses to a given situation \n'
 ' One way of measuring \n'
 ' divergent production \n'
 ' is by administering the Torrance Tests of Creative Thinking \n'
 ' The Torrance Tests of Creative Thinking assesses the diversity quantity and '
 'appropriateness of participants responses to a variety of open-ended '
 'questions \n'
 ' Other researchers of creativity see the difference in creative people as a '
 'cognitive process of dedication to problem solving and developing expertise '
 'in the field of their creative expression Hard working people study the work '
 'of people before them and within

 ' b \n'
 ' c \n'
 ' d \n'
 ' Cskszentmihlyi Mihly \n'
 ' "Implications of a systems perspective for the study of creativity" In \n'
 ' R J Sternberg \n'
 ' Handbook of Creativity \n'
 ' Cambridge University Press \n'
 ' ^ \n'
 ' a \n'
 ' b \n'
 ' c \n'
 ' d \n'
 ' Robinson K \n'
 ' Azzam A M "Why creativity now?" \n'
 ' Educational Leadership \n'
 ' ^ \n'
 ' Paris C Edwards N Sheffield E Mutinsky M Olexa T Reilly S & Baer J How '
 'early school experiences impact creativity In J C Kaufman & J Baer Eds '
 'Creativity and Reason in Cognitive Development pp New York NY Cambridge '
 'University Press \n'
 ' ^ \n'
 ' a \n'
 ' b \n'
 ' Byrge C Hanson S "The creative platform A new paradigm for teaching '
 'creativity" \n'
 ' Problems of Education in the 21st Century \n'
 ' ^ \n'
 ' Csikszentmihalyi M Evolution and flow In M Csikszentmihalyi Ed The evolving '
 'self A psychology for the third millennium pp New York Harper Perennial \n'
 ' ^ \n'
 ' National Advisory Committee on Creative and 

In [456]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words,current_title)

19061 Independence


### 5.1 Computing frequency distributions of each token, i.e word, term, pancuation, etc.

In [457]:
input_freq_dist = FreqDist(tokenized)

In [458]:
input_freq_dist.most_common(10)

[('\n', 2213),
 ('the', 767),
 ('of', 723),
 ('and', 658),
 ('in', 397),
 ('a', 382),
 ('creativity', 377),
 ('to', 376),
 ('that', 170),
 ('as', 165)]

### 5.2 Removing punctuation and stopwords from the input corpus

In [459]:
for stopword in STOP_WORDS:
    if stopword in input_freq_dist:
        del input_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in input_freq_dist:
        del input_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_input = sum(input_freq_dist.values())
input_freq_dist.most_common(10)

[('creativity', 377),
 ('creative', 163),
 ('intelligence', 94),
 ('j', 74),
 ('s', 63),
 ('freedom', 63),
 ('e', 56),
 ('b', 53),
 ('new', 52),
 ('c', 50)]

### 5.3 Removing all numbered words

This is an example case for post proceesing in terms of cleaning. The pre-processing, that is data cleaning/preperation during or right after harvesting should be further improved to avoid such processes at this stage.

In [460]:
pattern_letters = re.compile('[a-z]')
def has_letters(x):
    return(pattern_letters.search(x) is not None)

In [461]:
reduced = {k:v for k,v in input_freq_dist.items() if has_letters(k)}
print("Reduction due to all number matches: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to all number matches:  107


### 5.4 Removing single character words


In [462]:
reduced = {k:v for k,v in input_freq_dist.items() if len(k) > 1}
print("Reduction due to single characters: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to single characters:  22


### 5.5 Removing rare words from input distribution

In [463]:
reduced = {k:v for k,v in input_freq_dist.items() if v > 5}
print("Reduction due to rare occurances: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to rare occurances:  3127


## 6. Comparing input vs English corpus volumes

### 6.1 Total words (after cleaning the stopwords) 

In [464]:
print(n_input, n_english)

10600 544168


### 6.2 Number of unique words (after cleaning stopwords and rare words)

In [465]:
n_unique_word_input = len(input_freq_dist.items())
n_unique_word_brown = len(english_freq_dist.items())
n_unique_word_input, n_unique_word_brown

(350, 49598)

### 6.3 Cleaned set of input words/terms

List of words in the corpus in case, for a visual inspection. Such inspections will be used both to improve tokenization as well as filtering.

In [466]:
pp.pprint(sorted(input_freq_dist.items(), key=lambda x:x[1], reverse=True))

[('creativity', 377),
 ('creative', 163),
 ('intelligence', 94),
 ('freedom', 63),
 ('new', 52),
 ('theory', 49),
 ('problem', 49),
 ('thinking', 44),
 ('people', 40),
 ('independence', 39),
 ('psychology', 37),
 ('process', 37),
 ('individual', 36),
 ('research', 36),
 ('ideas', 36),
 ('sternberg', 34),
 ('model', 33),
 ('press', 33),
 ('work', 32),
 ('cognitive', 31),
 ('knowledge', 31),
 ('processes', 27),
 ('study', 26),
 ('university', 26),
 ('solving', 26),
 ('tests', 26),
 ('cambridge', 26),
 ('different', 25),
 ('creativity"', 25),
 ('personality', 23),
 ('divergent', 23),
 ('approach', 23),
 ('journal', 23),
 ('human', 22),
 ('levels', 22),
 ('kaufman', 22),
 ('handbook', 22),
 ('others', 21),
 ('theories', 21),
 ('motivation', 21),
 ('self', 20),
 ('liberty', 20),
 ('number', 20),
 ('social', 20),
 ('team', 20),
 ('innovation', 20),
 ('concept', 19),
 ('thought', 19),
 ('science', 19),
 ('working', 19),
 ('high', 19),
 ('will', 18),
 ('person', 18),
 ('studies', 18),
 ('gener

### 6.4 Set of terms/words that occure in both corpus.

In [467]:
len(input_freq_dist.keys())

350

In [468]:
common_words = [w for w in set(input_freq_dist.keys()) & set(english_freq_dist.keys())]
print(len(common_words))

313


In [469]:
pp.pprint(sorted(common_words))

['ability',
 'able',
 'according',
 'achieve',
 'across',
 'actions',
 'activities',
 'advertising',
 'affect',
 'age',
 'aggression',
 'al',
 'among',
 'analysis',
 'ancient',
 'another',
 'applied',
 'approach',
 'approaches',
 'areas',
 'argued',
 'art',
 'artists',
 'arts',
 'asked',
 'aspects',
 'associated',
 'association',
 'attention',
 'autonomy',
 'average',
 'become',
 'behavior',
 'blending',
 'brain',
 'business',
 'cambridge',
 'century',
 'cerebellum',
 'children',
 'cognitive',
 'compared',
 'component',
 'computational',
 'computer',
 'concept',
 'conceptual',
 'conditions',
 'considered',
 'control',
 'correlation',
 'correlations',
 'countries',
 'create',
 'creation',
 'creative',
 'creativity',
 'culture',
 'curiosity',
 'data',
 'day',
 'declaration',
 'definition',
 'degrees',
 'developed',
 'development',
 'difference',
 'differences',
 'different',
 'discovery',
 'disorder',
 'distinct',
 'distinction',
 'divergent',
 'diversity',
 'domain',
 'early',
 'economi

### 6.5 Set of terms/words that occure in the sample but not in the reference corpus.

This specific set will be incorporated later below. 

In [470]:
input_specifics = dict()
for w in set(input_freq_dist.keys()) - set(english_freq_dist.keys()):
    input_specifics[w] = input_freq_dist[w]

In [471]:
print(len(input_specifics))

37


In [472]:
pp.pprint(sorted(input_specifics))

['"a',
 '"creativity',
 '"the',
 '1016j',
 'amabile',
 'archived',
 'autonomous',
 'batey',
 'bipolar',
 'c"',
 'cognition',
 'constructs',
 'convergent',
 'creativity"',
 'csikszentmihalyi',
 'eds',
 'extrinsic',
 'furnham',
 'honing',
 'jc',
 'jrgen',
 'kaufman',
 'mc',
 'meta',
 'pmid',
 'psychometric',
 'rem',
 'rj',
 'runco',
 'schmidhuber',
 'software',
 'sternberg',
 'subset',
 'torrance',
 'vandervert',
 'wallas',
 'worldview']


## 7. Stemming

In [473]:
stemmer = PorterStemmer()
input_wset_stems = {k: stemmer.stem(k) for k in input_freq_dist.keys()}
common_wset_stems = {k: stemmer.stem(k) for k in common_words}
pp.pprint(common_wset_stems)

{'ability': 'abil',
 'able': 'abl',
 'according': 'accord',
 'achieve': 'achiev',
 'across': 'across',
 'actions': 'action',
 'activities': 'activ',
 'advertising': 'advertis',
 'affect': 'affect',
 'age': 'age',
 'aggression': 'aggress',
 'al': 'al',
 'among': 'among',
 'analysis': 'analysi',
 'ancient': 'ancient',
 'another': 'anoth',
 'applied': 'appli',
 'approach': 'approach',
 'approaches': 'approach',
 'areas': 'area',
 'argued': 'argu',
 'art': 'art',
 'artists': 'artist',
 'arts': 'art',
 'asked': 'ask',
 'aspects': 'aspect',
 'associated': 'associ',
 'association': 'associ',
 'attention': 'attent',
 'autonomy': 'autonomi',
 'average': 'averag',
 'become': 'becom',
 'behavior': 'behavior',
 'blending': 'blend',
 'brain': 'brain',
 'business': 'busi',
 'cambridge': 'cambridg',
 'century': 'centuri',
 'cerebellum': 'cerebellum',
 'children': 'children',
 'cognitive': 'cognit',
 'compared': 'compar',
 'component': 'compon',
 'computational': 'comput',
 'computer': 'comput',
 'con

## 8. Handling input specific term set

### 8.1 Identfying matching stems with common words. 

Note that the frequency counts are transferred accordingly.

In [474]:
specifics = {}
for k,v in input_specifics.items():
    stem = input_wset_stems[k]
    words = [w for w,s in common_wset_stems.items() if s == stem]
    if words:
        w = words[0]
        input_freq_dist[w] += input_specifics[k]
        continue
    specifics[k] = v
# Removing the words with matching stems from the specific set.
print("Reduction due to stemm matches: ", len(input_specifics) - len(specifics))
input_specifics = specifics

Reduction due to stemm matches:  2


### 8.2 Removing open-maker specific terms.

In [475]:
with open("data/specifics_openmaker.txt", "r") as f:
    SPECIFICS_OPENMAKER = set(f.read().strip().split("\n"))
om_specific_stems = {stemmer.stem(k):k for k in SPECIFICS_OPENMAKER}
pp.pprint(om_specific_stems)

{'3-d': '3-d',
 '3d': '3d',
 'abat': 'abatement',
 'afford': 'affordable',
 'agenda21': 'agenda21',
 'anarch': 'anarchism',
 'autonom': 'autonomous',
 'biodiesel': 'biodiesel',
 'biodivers': 'biodiversity',
 'biofuel': 'biofuel',
 'bioga': 'biogas',
 'biomass': 'biomass',
 'biospher': 'biosphere',
 'bricolag': 'bricolage',
 'brundtland': 'brundtland',
 'c2c': 'c2c',
 'cad': 'cad',
 'cap-and-trad': 'cap-and-trade',
 'carfre': 'carfree',
 'cdm': 'cdm',
 'christensen': 'christensen',
 'co-creat': 'co-creation',
 'co-develop': 'co-develop',
 'co-invent': 'co-invention',
 'co-inventor': 'co-inventor',
 'coextinct': 'coextinction',
 'cognit': 'cognition',
 'commons-bas': 'commons-based',
 'computer-aid': 'computer-aided',
 'conferenc': 'conferencing',
 'consortium': 'consortium',
 'constraint': 'constraints',
 'construct': 'construct',
 'copyleft': 'copyleft',
 'copyright': 'copyright',
 'cradle-to-cradl': 'cradle-to-cradle',
 'crowdsourc': 'crowdsourcing',
 'crowdwork': 'crowdworker',
 'cuv

In [476]:
specific_wset_dirty = {}
specific_wset_stems_selected = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in om_specific_stems.keys():
        if k in specific_wset_stems_selected.keys():
            specific_wset_stems_selected[stem] += v
        else:
            specific_wset_stems_selected[stem] = v
        continue
    if v > 10:
        specific_wset_dirty[k] = v
        
input_specifics = specific_wset_dirty

In [477]:
# The set of stems to be added to the set with makerness counts.

pp.pprint(specific_wset_stems_selected)

{'autonom': 6,
 'construct': 11,
 'extrins': 7,
 'psychometr': 8,
 'schmidhub': 12,
 'softwar': 11,
 'sternberg': 34}


### 8.3 Remaining frequent input specifics
The manual checking can help to determine what should go into "specifics_openmaker.txt"

In [478]:
print(len(input_specifics))
pp.pprint(sorted(input_specifics))

7
['"the', 'creativity"', 'kaufman', 'runco', 'subset', 'torrance', 'vandervert']


In [479]:
specific_wset_stems = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in specific_wset_stems.keys():
        specific_wset_stems[stem].append((k,v))
    else:
        specific_wset_stems[stem] = [(k,v)]
pp.pprint(specific_wset_stems)

{'"the': [('"the', 13)],
 'creativity"': [('creativity"', 25)],
 'kaufman': [('kaufman', 22)],
 'runco': [('runco', 15)],
 'subset': [('subset', 11)],
 'torranc': [('torrance', 11)],
 'vandervert': [('vandervert', 18)]}


## 9. Computing representation power of common words.

In [480]:
# combine
word = []
freq = []
score = []
# common_words = [w[0] for w in common_words]
nEng = 1.0 * n_english
nInp = 1.0 * n_input
for w in common_words:
    # Consider only words whose charcater length is larger than 1
    if len(w) > 1:
        # Log likelihood scores are computed:
        print(w, input_freq_dist[w], english_freq_dist[w])
        s = log((input_freq_dist[w] / nInp) / (english_freq_dist[w] / nEng))
        f = input_freq_dist[w]
        word.append(w)
        freq.append(f)
        score.append(s)

intelligence 94 48
mark 10 83
nature 8 191
new 52 1635
declaration 8 24
state 10 807
ideas 36 143
list 6 133
motivation 21 11
six 6 220
person 18 174
given 12 377
movement 8 128
gardner 6 4
lobe 9 3
lobes 6 5
experiences 7 53
scored 6 15
high 19 497
correlations 7 2
goals 6 40
term 11 79
various 8 201
analysis 15 108
development 12 334
problem 49 313
aspects 6 64
difference 7 148
students 12 213
technical 6 120
achieve 8 51
multiple 6 36
system 13 416
traits 9 6
early 11 366
create 9 54
physical 9 138
another 9 684
results 7 149
art 11 208
yet 8 419
three 14 610
processing 10 38
idea 16 195
similar 6 157
factor 7 71
freedom 63 128
theory 49 129
studies 18 103
views 8 51
press 33 127
developed 10 170
model 33 77
ways 9 128
data 11 173
view 9 186
necessary 7 222
considered 6 151
little 6 831
distinction 6 41
terms 9 163
review 13 56
order 11 376
blending 7 1
political 8 258
found 15 536
concept 19 85
supportive 7 7
independence 39 70
open 7 318
scientific 6 86
successful 6 95
among 10 37

In [481]:
m = pd.Series(score)
f = pd.Series(freq)
k = pd.Series(word)
stem = k.apply(stemmer.stem)
scoring = len(k) * ['common']
df_common = pd.DataFrame({'Word':k, 'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_common.sort_values(by='Score', ascending=False, inplace = True)
df_common.reset_index(inplace=True, drop=True)
df_common.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,7.673425,creativ,377,common,creativity
1,6.908818,cognit,39,common,cognitive
2,6.771617,iq,17,common,iq
3,6.336299,handbook,22,common,handbook
4,6.240989,guilford,10,common,guilford
5,6.240989,cerebellum,10,common,cerebellum
6,6.07847,malevol,17,common,malevolent
7,6.017846,fluenci,8,common,fluency
8,5.884314,blend,7,common,blending
9,5.730163,comput,6,common,computational


### 9.1 Computing makerness of specific terms

In [482]:
max_score = df_common.iloc[0,0]
threshold_score = 1.0
mean_w = df_common.Score[df_common.Score > threshold_score].mean()
mean_f = df_common.Tf[df_common.Score > threshold_score].mean()
print(max_score, threshold_score, mean_w, mean_f)

7.6734246317 1.0 2.714948873669087 15.818181818181818


In [483]:
def compute_speficif_score(f, maxw=10, minw=1, mean_w=3.0, mean_f=25.0):
    return(max(min((f / mean_f) * mean_w, maxw), minw))
compute_speficif_score(mean_w, max_score, threshold_score, mean_w, mean_f)

1.0

In [484]:
stem = []
freq = []
score = []
for k, v in specific_wset_stems_selected.items():
    s = compute_speficif_score(v, max_score, threshold_score, mean_w, mean_f)
    stem.append(k)
    freq.append(v)
    score.append(s)

## 10. Tabulating the results and generating the output file

In [485]:
m = pd.Series(score)
f = pd.Series(freq)
stem = pd.Series(stem)
scoring = len(m) * ['specific']
df_exclusive= pd.DataFrame({'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_exclusive.sort_values(by='Score', ascending=False, inplace = True)
df_exclusive.reset_index(inplace=True, drop=True)
df_exclusive.head(20)

Unnamed: 0,Score,Stem,Tf,Type
0,5.83558,sternberg,34,specific
1,2.059616,schmidhub,12,specific
2,1.887982,softwar,11,specific
3,1.887982,construct,11,specific
4,1.373078,psychometr,8,specific
5,1.201443,extrins,7,specific
6,1.029808,autonom,6,specific


In [486]:
df_makerness = df_common.append(df_exclusive, ignore_index=True)
df_makerness.sort_values(by='Score', ascending=False, inplace = True)
df_makerness.reset_index(inplace=True, drop=True)
df_makerness.head(42)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,7.673425,creativ,377,common,creativity
1,6.908818,cognit,39,common,cognitive
2,6.771617,iq,17,common,iq
3,6.336299,handbook,22,common,handbook
4,6.240989,guilford,10,common,guilford
5,6.240989,cerebellum,10,common,cerebellum
6,6.07847,malevol,17,common,malevolent
7,6.017846,fluenci,8,common,fluency
8,5.884314,blend,7,common,blending
9,5.83558,sternberg,34,specific,


In [487]:
df_makerness.tail(42)

Unnamed: 0,Score,Stem,Tf,Type,Word
278,0.327486,among,10,common,among
279,0.308744,sever,10,common,several
280,0.23402,mind,8,common,mind
281,0.224832,import,9,common,important
282,0.164002,three,14,common,three
283,0.162364,use,14,common,used
284,0.161056,less,10,common,less
285,0.144521,help,7,common,help
286,0.122263,open,7,common,open
287,0.117035,word,6,common,word


In [488]:
OUTPUT_FOLDER = "./output/"
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
with open(csvfile_name, 'w') as csvfile:
    #thewriter = csv.writer(csvfile, delimiter=',')
    #for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True): thewriter.writerow([k,v[0],v[1]])
    df_makerness.to_csv(csvfile_name)

### 10.1 Outputfile name for the theme

In [489]:
print(csvfile_name)

./output/makerness_Independence.csv
