# Discovery and Representation of Open Making Related Terms

This notebook sketches an initial exercise in discovering keywords related to open making. The input text is harvested by a web crawler that identifies and crawls semantically related Wikipedia articles.

In [1]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from numpy import log, mean
import json, csv, re
import pprint as pp

import pandas as pd

## 1. Loading a reference English language corpus

In [2]:
from nltk.corpus import brown
brown.categories()

[u'adventure',
 u'belles_lettres',
 u'editorial',
 u'fiction',
 u'government',
 u'hobbies',
 u'humor',
 u'learned',
 u'lore',
 u'mystery',
 u'news',
 u'religion',
 u'reviews',
 u'romance',
 u'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [3]:
# Load the standard stop words; the file holds one entry per line.
with open("data/stopwords_standard.txt", "r") as f:
    # Strip every line and skip blank ones, so CRLF line endings, stray
    # spaces or an empty file cannot yield entries that never match a token.
    STOP_WORDS_STANDARD = {line.strip() for line in f if line.strip()}
print(STOP_WORDS_STANDARD)

set(['all', "she'll", 'just', "don't", 'being', 'over', 'through', 'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should', "he'd", 'to', 'only', "there's", 'those', 'under', 'has', "haven't", 'do', 'them', 'his', "they'll", 'get', 'very', "who's", "they'd", 'cannot', 'know', 'they', 'not', 'during', 'yourself', 'him', 'nor', "we'll", 'like', 'did', "they've", 'this', 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'www', 'because', "you'd", 'doing', 'some', 'up', 'are', 'further', 'ourselves', 'out', 'what', 'for', 'while', "wasn't", 'does', "shouldn't", 'above', 'between', 'ought', 'be', 'we', 'who', "you're", 'were', 'here', 'hers', "aren't", 'by', 'both', 'about', 'would', 'of', 'could', "i'd", "weren't", "i'm", 'com', 'or', "can't", 'own', 'into', 'whom', 'down', "hadn't", "couldn't", 'your', "doesn't", 'from', "how's", 'her', 'their', "it's", 'there', 'been', 'why', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', "where's", "

### 2.2 Open-making related stop words

In [4]:
# Load the open-maker specific stop words; the file holds one entry per line.
with open("data/stopwords_openmaker.txt", "r") as f:
    # Strip every line and skip blank ones, so CRLF line endings, stray
    # spaces or an empty file cannot yield entries that never match a token.
    STOP_WORDS_OPENMAKER = {line.strip() for line in f if line.strip()}
print(STOP_WORDS_OPENMAKER)

set(['isbn', 'often', 'vol', 'vi', 'eg', 'one', 'ii', 'second', '1st', '7th', '11th', 'txt', 'ad', 'pp', '6th', '3rd', 'na', '5th', 'wikipedia', 'randd', '14th', 'also', 'html', 'von', '15th', 'first', 'bc', 'may', '4th', 'wikipedias', 'org', 'iv', 'iii', '13th', 'almost', 'doi', 'third', 'many', 'well', 'britannica', '2nd', 'etc', 'encyclopedia', '9th', 'doc', 'pdf', '10th', 'tt', '12th', '8th'])


## 3. Removing stop words from the reference English corpus

In [5]:
# Combine the standard and the open-maker specific stop words into one set.
STOP_WORDS = STOP_WORDS_STANDARD | STOP_WORDS_OPENMAKER
print(STOP_WORDS)

set(['all', "she'll", 'just', "don't", 'being', 'over', 'through', 'during', 'once', 'its', 'before', "he's", "when's", "we've", 'tt', 'had', 'html', 'randd', 'should', "he'd", 'to', 'only', 'does', "here's", 'under', 'has', "haven't", 'do', 'them', 'his', 'above', 'get', 'very', "who's", "they'd", 'cannot', 'know', 'they', 'not', 'yourselves', 'one', 'him', 'nor', "we'll", 'like', 'did', '12th', "they've", "wasn't", 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'www', 'because', "you'd", 'doing', 'eg', 'theirs', 'some', "hasn't", 'second', 'are', '7th', 'further', '11th', 'ourselves', 'out', 'what', 'for', 'herself', 'bc', 'wikipedia', 'below', '14th', 'may', "there's", "shouldn't", "they'll", 'between', '15th', 'can', 'be', 'we', 'after', "doesn't", 'doc', 'here', 'hers', 'org', "aren't", 'by', 'von', 'both', 'about', 'her', '8th', 'of', 'could', 'britannica', 'etc', "i'd", "weren't", 'pdf', "i'm", 'com', 'or', "can't", 'first', 'own', 'isbn', 'into', 'yoursel

In [6]:
# Build the reference word-frequency distribution from the lowercased Brown
# corpus tokens. The comprehension filter tests the original-case token while
# the stored key is lowercased, so the clean-up loop below still has to drop
# stop words (and the tokenizer's split characters) by key.
english_freq_dist = FreqDist(
    [token.lower() for token in nltk.corpus.brown.words() if token not in STOP_WORDS]
)
for unwanted_tokens in (STOP_WORDS, tokenizer.CHARACTERS_TO_SPLIT):
    for unwanted in unwanted_tokens:
        if unwanted in english_freq_dist:
            del english_freq_dist[unwanted]

# Total token count and the most frequent tokens remaining after cleaning:
n_english = sum(english_freq_dist.values())
english_freq_dist.most_common(10)

[(u'``', 8837),
 (u"''", 8789),
 (u'--', 3432),
 (u'will', 2245),
 (u'said', 1961),
 (u'new', 1635),
 (u'time', 1598),
 (u'two', 1412),
 (u'now', 1314),
 (u'man', 1207)]

## 4. Loading the input Open Maker corpus

In [55]:
# Read the text harvested from Wikipedia and parse it as JSON.
with open("data/wikipedia.json", "r") as f:
    OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [56]:
# The total number of wiki articles used:
print(len(OM_Corpus))

276


In [57]:
# Field names of a corpus record, taken from the first article.
OM_Corpus[0].keys()

[u'url', u'text', u'depth', u'theme.id', u'title']

In [58]:
def display_articles(tid):
    """Print depth, title and URL of each article in OM_Corpus whose 'theme.id' equals `tid`."""
    for article in OM_Corpus:
        if article['theme.id'] != tid:
            continue
        print(article['depth'], article['title'], article['url'])

In [59]:
display_articles(1)

(0, u'Power (social and political)', u'https://en.wikipedia.org/wiki/Power_(social_and_political)')
(1, u'State collapse', u'https://en.wikipedia.org/wiki/State_collapse')
(1, u'Speaking truth to power', u'https://en.wikipedia.org/wiki/Speaking_truth_to_power')
(1, u'Chronemics', u'https://en.wikipedia.org/wiki/Control_of_time_in_power_relationships')
(1, u'The Anatomy of Revolution', u'https://en.wikipedia.org/wiki/The_Anatomy_of_Revolution')
(1, u'Personal boundaries', u'https://en.wikipedia.org/wiki/Personal_boundaries')
(1, u'Discourse of power', u'https://en.wikipedia.org/wiki/Discourse_of_power')
(1, u'Cratology', u'https://en.wikipedia.org/wiki/Cratology')
(1, u'Amity-enmity complex', u'https://en.wikipedia.org/wiki/Amity-enmity_complex')
(1, u'Social control', u'https://en.wikipedia.org/wiki/Social_control')
(1, u'Veto', u'https://en.wikipedia.org/wiki/Veto')


In [60]:
display_articles(2)

(0, u'Need for achievement', u'https://en.wikipedia.org/wiki/Need_for_achievement')
(0, u'Social influence', u'https://en.wikipedia.org/wiki/Social_influence')
(1, u'Judge\u2013advisor system', u'https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system')
(1, u'Bystander effect', u'https://en.wikipedia.org/wiki/Bystander_effect')
(1, u'Mind shaping', u'https://en.wikipedia.org/wiki/Mind_shaping')
(1, u'Social proof', u'https://en.wikipedia.org/wiki/Social_proof')
(1, u'Authority bias', u'https://en.wikipedia.org/wiki/Authority_bias')
(1, u'Propaganda', u'https://en.wikipedia.org/wiki/Propaganda')
(1, u'Impression management', u'https://en.wikipedia.org/wiki/Impression_management')
(1, u'Goal orientation', u'https://en.wikipedia.org/wiki/Goal_orientation')
(1, u'Need for affiliation', u'https://en.wikipedia.org/wiki/Need_for_affiliation')
(1, u'Need for cognition', u'https://en.wikipedia.org/wiki/Need_for_cognition')
(1, u'Need theory', u'https://en.wikipedia.org/wiki/Need_theory')
(1,

In [61]:
display_articles(3)

(0, u'Hedonism', u'https://en.wikipedia.org/wiki/Hedonism')
(1, u'Pleasure principle (psychology)', u'https://en.wikipedia.org/wiki/Pleasure_principle_(psychology)')
(1, u'David Pearce (philosopher)', u'https://en.wikipedia.org/wiki/David_Pearce_(philosopher)')
(1, u'Cyrenaics', u'https://en.wikipedia.org/wiki/Cyrenaics')
(1, u'Michel Onfray', u'https://en.wikipedia.org/wiki/Michel_Onfray')
(1, u'Libertine', u'https://en.wikipedia.org/wiki/Libertine')
(1, u'Paradox of hedonism', u'https://en.wikipedia.org/wiki/Paradox_of_hedonism')
(1, u'Torbj\xf6rn T\xe4nnsj\xf6', u'https://en.wikipedia.org/wiki/Torbj%C3%B6rn_T%C3%A4nnsj%C3%B6')
(1, u'Hedonism Resorts', u'https://en.wikipedia.org/wiki/Hedonism_Resorts')
(1, u'Yangism', u'https://en.wikipedia.org/wiki/Yangism')
(1, u'Utilitarianism', u'https://en.wikipedia.org/wiki/Utilitarianism')
(1, u'Epicureanism', u'https://en.wikipedia.org/wiki/Epicureanism')
(1, u'Eudaimonia', u'https://en.wikipedia.org/wiki/Eudaimonia')
(1, u'Affectionism', u'h

In [62]:
display_articles(4)

(0, u'Stimulation', u'https://en.wikipedia.org/wiki/Stimulation')


In [63]:
display_articles(5)

(0, u'Freedom', u'https://en.wikipedia.org/wiki/Freedom')
(0, u'Independence', u'https://en.wikipedia.org/wiki/Independence')
(0, u'Creativity', u'https://en.wikipedia.org/wiki/Creativity')
(1, u'Computational creativity', u'https://en.wikipedia.org/wiki/Computational_creativity')
(1, u'Lists of active separatist movements', u'https://en.wikipedia.org/wiki/Lists_of_active_separatist_movements')
(1, u'United Nations list of Non-Self-Governing Territories', u'https://en.wikipedia.org/wiki/United_Nations_list_of_Non-Self-Governing_Territories')
(1, u'Invention', u'https://en.wikipedia.org/wiki/Invention')
(1, u'Learned industriousness', u'https://en.wikipedia.org/wiki/Learned_industriousness')
(1, u'Greatness', u'https://en.wikipedia.org/wiki/Greatness')
(1, u'Visual arts', u'https://en.wikipedia.org/wiki/Visual_arts')
(1, u'Heroic theory of invention and scientific development', u'https://en.wikipedia.org/wiki/Heroic_theory_of_invention_and_scientific_development')
(1, u'Brainstorming', 

In [64]:
display_articles(6)

(0, u'Universalism', u'https://en.wikipedia.org/wiki/Universalism')
(0, u'Social justice', u'https://en.wikipedia.org/wiki/Social_justice')
(0, u'Egalitarianism', u'https://en.wikipedia.org/wiki/Egalitarianism')
(0, u'Environmental protection', u'https://en.wikipedia.org/wiki/Environmental_protection')
(1, u'Sustainability', u'https://en.wikipedia.org/wiki/Sustainability')
(1, u'Participation (decision making)', u'https://en.wikipedia.org/wiki/Participation_(decision_making)')
(1, u'List of international environmental agreements', u'https://en.wikipedia.org/wiki/List_of_international_environmental_agreements')
(1, u'Biodiversity', u'https://en.wikipedia.org/wiki/Biodiversity')
(1, u'Environmental personhood', u'https://en.wikipedia.org/wiki/Environmental_personhood')
(1, u'Environmental organization', u'https://en.wikipedia.org/wiki/Environmental_organizations')
(1, u'Natural resource management', u'https://en.wikipedia.org/wiki/Natural_resource_management')
(1, u'Environmental law', u

In [65]:
display_articles(7)

(0, u'Altruism', u'https://en.wikipedia.org/wiki/Altruism')
(0, u'Loyalty', u'https://en.wikipedia.org/wiki/Loyalty')
(0, u'Responsibility', u'https://en.wikipedia.org/wiki/Responsibility')
(1, u'Earning to give', u'https://en.wikipedia.org/wiki/Earning_to_give')
(1, u'Prosocial behavior', u'https://en.wikipedia.org/wiki/Prosocial_behavior')
(1, u'Gene-centered view of evolution', u'https://en.wikipedia.org/wiki/Gene-centered_view_of_evolution')
(1, u'The Giving Pledge', u'https://en.wikipedia.org/wiki/Giving_Pledge')
(1, u'Charitable organization', u'https://en.wikipedia.org/wiki/Charitable_organization')
(1, u'Kin selection', u'https://en.wikipedia.org/wiki/Kin_selection')
(1, u'Group selection', u'https://en.wikipedia.org/wiki/Group_selection')
(1, u'Consideration', u'https://en.wikipedia.org/wiki/Consideration')
(1, u'Selfishness', u'https://en.wikipedia.org/wiki/Selfishness')
(1, u'Humanity (virtue)', u'https://en.wikipedia.org/wiki/Humanity_(virtue)')
(1, u'Altruria, California',

In [66]:
display_articles(8)

(0, u'Tradition', u'https://en.wikipedia.org/wiki/Tradition')
(0, u'Modesty', u'https://en.wikipedia.org/wiki/Modesty')
(1, u'Folklore', u'https://en.wikipedia.org/wiki/Folklore')
(1, u'Origin myth', u'https://en.wikipedia.org/wiki/Aition')


In [67]:
display_articles(9)

(0, u'Conformity', u'https://en.wikipedia.org/wiki/Conformity')
(0, u'Obedience (human behavior)', u'https://en.wikipedia.org/wiki/Obedience_(human_behavior)')
(0, u'Discipline', u'https://en.wikipedia.org/wiki/Discipline')
(1, u'Depersonalization', u'https://en.wikipedia.org/wiki/Depersonalization')
(1, u'Animal training', u'https://en.wikipedia.org/wiki/Animal_training')
(1, u'Superstition', u'https://en.wikipedia.org/wiki/Superstition')
(1, u'Compliance (psychology)', u'https://en.wikipedia.org/wiki/Compliance_(psychology)')
(1, u'Codependency', u'https://en.wikipedia.org/wiki/Codependency')
(1, u'Sycophancy', u'https://en.wikipedia.org/wiki/Sycophancy')
(1, u'Horse training', u'https://en.wikipedia.org/wiki/Horse_breaking')
(1, u'Filial piety', u'https://en.wikipedia.org/wiki/Filial_piety')
(1, u'Obedience training', u'https://en.wikipedia.org/wiki/Obedience_training')
(1, u'Civil disobedience', u'https://en.wikipedia.org/wiki/Civil_disobedience')
(1, u'Countersignaling', u'https:/

In [68]:
display_articles(10)

(0, u'Security', u'https://en.wikipedia.org/wiki/Security')
(0, u'Social order', u'https://en.wikipedia.org/wiki/Social_order')
(0, u'Cleanliness', u'https://en.wikipedia.org/wiki/Cleanliness')
(1, u'Safety', u'https://en.wikipedia.org/wiki/Safety')
(1, u'Security increase', u'https://en.wikipedia.org/wiki/Security_increase')
(1, u'Social norm', u'https://en.wikipedia.org/wiki/Norm_(sociology)')
(1, u'Social stratification', u'https://en.wikipedia.org/wiki/Social_hierarchy')
(1, u'Anti-social behaviour', u'https://en.wikipedia.org/wiki/Anti-social_behaviour')
(1, u'Risk', u'https://en.wikipedia.org/wiki/Security_risk')
(1, u"Marx's theory of history", u'https://en.wikipedia.org/wiki/Marx%27s_theory_of_history')
(1, u'Peace', u'https://en.wikipedia.org/wiki/Peace')
(1, u'Green cleaning', u'https://en.wikipedia.org/wiki/Green_cleaning')
(1, u'Environmental remediation', u'https://en.wikipedia.org/wiki/Environmental_remediation')
(1, u'Asepsis', u'https://en.wikipedia.org/wiki/Aseptic_tec

## 5. Analyzing a specific corpus based on a theme

In [69]:
def get_title(Corpus, theme_id):
    """Return the title of the first article in `Corpus` tagged with `theme_id`.

    Each article is expected to be a mapping with 'theme.id' and 'title'
    entries. An empty string is returned when no article matches.
    """
    matching_titles = (
        article['title'] for article in Corpus if article['theme.id'] == theme_id
    )
    return next(matching_titles, '')

## 6.0 Selecting the specific theme (a sub-corpus).

In [74]:
## For a different sub-corpus use a corresponding theme ID.
current_theme_id = 3

In [75]:
current_title = get_title(OM_Corpus, current_theme_id)

In [76]:
# Derive the output file-name stem from the theme title: capitalize every
# space-separated word and join the words with underscores.
title_words = current_title.split(" ")
output_fname = "_".join(word.capitalize() for word in title_words)
print(current_title, "::", output_fname)

(u'Hedonism', '::', u'Hedonism')


In [77]:
# Concatenate the text of every article of the selected theme, separated by spaces.
theme_texts = [page['text'] for page in OM_Corpus if page['theme.id'] == current_theme_id]
input_text = " ".join(theme_texts)

In [17]:
pp.pprint(input_text)

u'Do it yourself \n For other uses see \n Do it yourself disambiguation \n "DIY" redirects here For other uses see \n DIY disambiguation \n This article has multiple issues \n Please help \n improve it \n or discuss these issues on the \n talk page \n Learn how and when to remove these template messages \n This article \n possibly contains \n original research \n Please \n improve it \n by \n verifying \n the claims made and adding \n inline citations \n Statements consisting only of original research should be removed \n November \n Learn how and when to remove this template message \n This article \n needs additional or better citations for \n verification \n Please help \n improve this article \n by \n adding citations to reliable sources \n Unsourced material may be challenged and removed \n September \n Learn how and when to remove this template message \n Learn how and when to remove this template message \n Part of a series on \n Individualism \n Topics and concepts \n Autonomy 

In [78]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words,current_title)

(39891, u'Hedonism')


### 6.1 Computing frequency distributions of each token, i.e. word, term, punctuation, etc.

In [79]:
input_freq_dist = FreqDist(tokenized)

In [20]:
input_freq_dist.most_common(10)

[(u'\n', 3787),
 (u'the', 1248),
 (u'and', 777),
 (u'of', 771),
 (u'to', 664),
 (u'a', 660),
 (u'in', 565),
 (u'is', 302),
 (u'as', 277),
 (u'for', 257)]

### 6.2 Removing punctuation and stopwords from the input corpus

In [80]:
# Remove the stop words first, then the tokenizer's split characters,
# from the input distribution.
for unwanted_tokens in (STOP_WORDS, tokenizer.CHARACTERS_TO_SPLIT):
    for unwanted in unwanted_tokens:
        if unwanted in input_freq_dist:
            del input_freq_dist[unwanted]

# Total token count and the most frequent tokens remaining after cleaning:
n_input = sum(input_freq_dist.values())
input_freq_dist.most_common(10)

[(u'pleasure', 215),
 (u's', 206),
 (u'utilitarianism', 177),
 (u'"', 169),
 (u'happiness', 152),
 (u'philosophy', 130),
 (u'hedonism', 122),
 (u'moral', 110),
 (u'life', 108),
 (u'mill', 95)]

### 6.3 Removing all numbered words

This is an example of post-processing for cleaning purposes. The pre-processing, that is, data cleaning/preparation during or right after harvesting, should be improved further so that such steps are not needed at this stage.

In [81]:
# Matches any single lowercase ASCII letter.
pattern_letters = re.compile('[a-z]')


def has_letters(x):
    """Return True when `x` contains at least one lowercase ASCII letter, else False."""
    found = pattern_letters.search(x)
    return found is not None

In [82]:
# Keep only the tokens that contain at least one lowercase ASCII letter
# (drops e.g. purely numeric tokens).
reduced = dict(
    (token, count) for token, count in input_freq_dist.items() if has_letters(token)
)
print("Reduction due to all number matches: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

('Reduction due to all number matches: ', 77)


### 6.4 Removing single character words


In [83]:
# Keep only the tokens that are longer than one character.
reduced = dict(
    (token, count) for token, count in input_freq_dist.items() if len(token) > 1
)
print("Reduction due to single characters: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

('Reduction due to single characters: ', 20)


### 6.5 Removing rare words from input distribution

In [84]:
# Keep only the tokens that occur more than 5 times in the input corpus.
reduced = dict(
    (token, count) for token, count in input_freq_dist.items() if count > 5
)
print("Reduction due to rare occurances: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

('Reduction due to rare occurances: ', 5010)


## 7. Comparing input vs English corpus volumes

### 7.1 Total words (after cleaning the stopwords) 

In [85]:
print(n_input, n_english)

(20508, 544168)


### 7.2 Number of unique words (after cleaning stopwords and rare words)

In [86]:
# Number of distinct tokens left in the input and in the reference distribution.
n_unique_word_input = len(input_freq_dist)
n_unique_word_brown = len(english_freq_dist)
n_unique_word_input, n_unique_word_brown

(728, 49598)

### 7.3 Cleaned set of input words/terms

List of the words in the current corpus, for visual inspection. Such inspections will be used to improve both tokenization and filtering.

In [87]:
pp.pprint(sorted(input_freq_dist.items(), key=lambda x:x[1], reverse=True))

[(u'pleasure', 215),
 (u'utilitarianism', 177),
 (u'happiness', 152),
 (u'philosophy', 130),
 (u'hedonism', 122),
 (u'moral', 110),
 (u'life', 108),
 (u'mill', 95),
 (u'good', 90),
 (u'de', 77),
 (u'human', 77),
 (u'eudaimonia', 77),
 (u'utility', 75),
 (u'ethics', 70),
 (u'john', 70),
 (u'virtue', 69),
 (u'theory', 69),
 (u'pain', 66),
 (u'bentham', 66),
 (u'people', 65),
 (u'action', 63),
 (u'la', 61),
 (u'person', 60),
 (u'will', 58),
 (u'principle', 56),
 (u'university', 55),
 (u'psychological', 52),
 (u'press', 50),
 (u'actions', 48),
 (u'epicurus', 47),
 (u'self', 44),
 (u'suffering', 42),
 (u'act', 42),
 (u'even', 41),
 (u'pleasures', 41),
 (u'view', 39),
 (u'onfray', 38),
 (u'utilitarian', 38),
 (u'school', 36),
 (u'aristippus', 35),
 (u'ethical', 35),
 (u'two', 35),
 (u'book', 35),
 (u'others', 34),
 (u'argues', 34),
 (u'however', 34),
 (u'general', 34),
 (u'thought', 34),
 (u'"the', 34),
 (u'rule', 34),
 (u'social', 33),
 (u'according', 33),
 (u'things', 33),
 (u'world', 33),

### 7.4 Set of terms/words that occur in both corpora.

In [88]:
len(input_freq_dist.keys())

728

In [89]:
# Words present in both the input distribution and the reference distribution.
shared_vocabulary = set(input_freq_dist.keys()) & set(english_freq_dist.keys())
common_words = list(shared_vocabulary)
print(len(common_words))

640


In [31]:
pp.pprint(common_words)

[u'concept',
 u'global',
 u'focus',
 u'founded',
 u'code',
 u'higher',
 u'children',
 u'issues',
 u'poorly',
 u'layers',
 u'removal',
 u'environment',
 u'topic',
 u'program',
 u'include',
 u'resources',
 u'activities',
 u'results',
 u'every',
 u'school',
 u'level',
 u'skills',
 u'notes',
 u'companies',
 u'gun',
 u'large',
 u'small',
 u'radio',
 u'added',
 u'revolution',
 u'trend',
 u'direct',
 u'cost',
 u'video',
 u'makers',
 u'machines',
 u'even',
 u'established',
 u'errors',
 u'business',
 u'consumers',
 u'section',
 u'current',
 u'indian',
 u'template',
 u'capital',
 u'new',
 u'firms',
 u'method',
 u'movement',
 u'full',
 u'reporter',
 u'men',
 u'handyman',
 u'french',
 u'water',
 u'growing',
 u'deposition',
 u'objects',
 u'focused',
 u'groups',
 u'others',
 u'active',
 u'along',
 u'change',
 u'box',
 u'great',
 u'technical',
 u'involved',
 u'commonly',
 u'products',
 u'social',
 u'usually',
 u'military',
 u'anarchist',
 u'changes',
 u'maker',
 u'via',
 u'love',
 u'architectural',
 

### 7.5 Set of terms/words that occur in the sample but not in the reference corpus.

This specific set will be incorporated later below. 

In [90]:
# Words (with their input frequencies) that appear in the input distribution
# but not in the reference distribution.
specific_vocabulary = set(input_freq_dist.keys()) - set(english_freq_dist.keys())
input_specifics = {word: input_freq_dist[word] for word in specific_vocabulary}

In [91]:
print(len(input_specifics))

88


In [92]:
pp.pprint(input_specifics)

{u'"a': 11,
 u'"the': 34,
 u'"utilitarianism': 7,
 u'ajita': 6,
 u'altruistic': 15,
 u'anarchism': 7,
 u'annas': 6,
 u'aponia': 7,
 u'aret': 11,
 u'aristippus': 35,
 u'ataraxia': 7,
 u'atheism': 8,
 u'atheist': 14,
 u'autrement': 7,
 u'behaviorism': 6,
 u'bioethics': 6,
 u'calculus': 8,
 u'clarendon': 6,
 u'consequentialism': 16,
 u'crvka': 7,
 u'cyrenaic': 12,
 u'cyrenaics': 25,
 u'cyrene': 14,
 u'demandingness': 7,
 u'edinburgh': 7,
 u'egoism': 29,
 u'egoist': 8,
 u'epicureanism': 21,
 u'etymology': 8,
 u'eudaimon': 14,
 u'eudaimonia': 77,
 u'feldman': 15,
 u'flammarion': 7,
 u'galile': 28,
 u'gassendi': 8,
 u'grasset': 28,
 u'guisn': 7,
 u'harsanyi': 11,
 u'hdoniste': 8,
 u'hedone': 6,
 u'hedonic': 16,
 u'hedonist': 8,
 u'hegesias': 8,
 u'hermarchus': 7,
 u'histoire': 6,
 u'hutcheson': 8,
 u'jeremy': 30,
 u'jstor': 11,
 u'julien': 6,
 u'kesakambali': 6,
 u'lampsacus': 6,
 u'libertinism': 6,
 u'manifeste': 7,
 u'manifesto': 7,
 u'mccloskey': 6,
 u'metrodorus': 6,
 u'mettrie': 6,
 u'm

## 8. Stemming

In [93]:
# Map every input word, and separately every common word, to its Porter stem.
stemmer = PorterStemmer()
input_wset_stems = dict((term, stemmer.stem(term)) for term in input_freq_dist.keys())
common_wset_stems = dict((term, stemmer.stem(term)) for term in common_words)
pp.pprint(common_wset_stems)

{u'18th': u'18th',
 u'able': u'abl',
 u'absence': u'absenc',
 u'accordance': u'accord',
 u'according': u'accord',
 u'account': u'account',
 u'achieve': u'achiev',
 u'achieving': u'achiev',
 u'act': u'act',
 u'action': u'action',
 u'actions': u'action',
 u'activities': u'activ',
 u'activity': u'activ',
 u'acts': u'act',
 u'actual': u'actual',
 u'adams': u'adam',
 u'affairs': u'affair',
 u'against': u'against',
 u'agree': u'agre',
 u'aim': u'aim',
 u'aims': u'aim',
 u'allow': u'allow',
 u'alone': u'alon',
 u'although': u'although',
 u'altruism': u'altruism',
 u'always': u'alway',
 u'american': u'american',
 u'among': u'among',
 u'amount': u'amount',
 u'ancient': u'ancient',
 u'animal': u'anim',
 u'animals': u'anim',
 u'another': u'anoth',
 u'answer': u'answer',
 u'anti': u'anti',
 u'anything': u'anyth',
 u'applied': u'appli',
 u'approach': u'approach',
 u'argue': u'argu',
 u'argued': u'argu',
 u'argues': u'argu',
 u'argument': u'argument',
 u'arguments': u'argument',
 u'aristotle': u'ari

## 9. Handling input specific term set

### 9.1 Identifying matching stems with common words. 

Note that the frequency counts are transferred accordingly.

In [94]:
# Fold each input-specific word into a common word that shares its stem:
# the specific word's count is added to that common word's frequency.
# Words without such a match are kept in the specific set.
specifics = {}
for k, v in input_specifics.items():
    stem = input_wset_stems[k]
    # First common word (in dict iteration order) carrying the same stem, if any.
    match = next((w for w, s in common_wset_stems.items() if s == stem), None)
    if match is None:
        specifics[k] = v
    else:
        input_freq_dist[match] += v
# Removing the words with matching stems from the specific set.
print("Reduction due to stemm matches: ", len(input_specifics) - len(specifics))
input_specifics = specifics

('Reduction due to stemm matches: ', 7)


### 9.2 Removing open-maker specific terms.

In [95]:
# Load the curated open-maker specific terms; the file holds one entry per line.
with open("data/specifics_openmaker.txt", "r") as f:
    # Strip every line and skip blank ones, so CRLF line endings, stray
    # spaces or an empty file cannot yield bogus terms.
    SPECIFICS_OPENMAKER = {line.strip() for line in f if line.strip()}
# Map each term's Porter stem back to the term it came from.
om_specific_stems = {stemmer.stem(k): k for k in SPECIFICS_OPENMAKER}
pp.pprint(om_specific_stems)

{'3-d': '3-d',
 '3d': '3d',
 u'abat': 'abatement',
 u'afford': 'affordable',
 'agenda21': 'agenda21',
 u'anarch': 'anarchism',
 u'autonom': 'autonomous',
 'biodiesel': 'biodiesel',
 u'biodivers': 'biodiversity',
 'biofuel': 'biofuel',
 u'bioga': 'biogas',
 u'biomass': 'biomass',
 u'biospher': 'biosphere',
 u'bricolag': 'bricolage',
 'brundtland': 'brundtland',
 'c2c': 'c2c',
 'cad': 'cad',
 u'cap-and-trad': 'cap-and-trade',
 u'carfre': 'carfree',
 'cdm': 'cdm',
 'christensen': 'christensen',
 u'co-creat': 'co-creation',
 'co-develop': 'co-develop',
 u'co-invent': 'co-invention',
 'co-inventor': 'co-inventor',
 u'coextinct': 'coextinction',
 u'cognit': 'cognition',
 u'commons-bas': 'commons-based',
 u'computer-aid': 'computer-aided',
 u'conferenc': 'conferencing',
 'consortium': 'consortium',
 u'constraint': 'constraints',
 'construct': 'construct',
 'copyleft': 'copyleft',
 'copyright': 'copyright',
 u'cradle-to-cradl': 'cradle-to-cradle',
 u'crowdsourc': 'crowdsourcing',
 u'crowdwork'

In [96]:
# Split the remaining input-specific words into two groups:
#   * specific_wset_stems_selected: stem -> summed count, for words whose stem
#     is listed in the curated open-maker stems;
#   * specific_wset_dirty: word -> count, for the other words occurring more
#     than 10 times (kept for manual inspection).
specific_wset_dirty = {}
specific_wset_stems_selected = {}
for k, v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in om_specific_stems:
        # The dict is keyed by stem, so accumulate by stem. Testing the surface
        # word `k` instead made words sharing a stem overwrite each other.
        specific_wset_stems_selected[stem] = specific_wset_stems_selected.get(stem, 0) + v
        continue
    if v > 10:
        specific_wset_dirty[k] = v

input_specifics = specific_wset_dirty

In [39]:
# The set of stems to be added to the set with makerness counts.

pp.pprint(specific_wset_stems_selected)

{u'3d': 143,
 u'anarch': 12,
 u'bricolag': 44,
 u'cad': 9,
 u'diy': 51,
 u'edupunk': 15,
 u'greenoman': 13,
 u'hackerspac': 25,
 u'internet': 19,
 u'kludg': 46,
 u'kluge': 32,
 u'laser': 20,
 u'mit': 6,
 u'onlin': 17,
 u'prosum': 20,
 u'prosumpt': 10,
 u'prototyp': 8,
 u'softwar': 33,
 u'sustain': 9,
 u'websit': 10}


### 9.3 Remaining frequent input specifics
The manual checking can help to determine what should go into "specifics_openmaker.txt"

In [97]:
print(len(input_specifics))
pp.pprint(input_specifics)

30
{u'"a': 11,
 u'"the': 34,
 u'altruistic': 15,
 u'aret': 11,
 u'aristippus': 35,
 u'atheist': 14,
 u'consequentialism': 16,
 u'cyrenaic': 12,
 u'cyrenaics': 25,
 u'cyrene': 14,
 u'egoism': 29,
 u'epicureanism': 21,
 u'eudaimon': 14,
 u'eudaimonia': 77,
 u'feldman': 15,
 u'galile': 28,
 u'grasset': 28,
 u'harsanyi': 11,
 u'jeremy': 30,
 u'jstor': 11,
 u'michel': 17,
 u'onfray': 38,
 u'paley': 16,
 u'sidgwick': 14,
 u'socrates': 24,
 u'tnnsj': 15,
 u'transhumanism': 12,
 u'yangism': 22,
 u'yangists': 12,
 u'zhu': 11}


In [98]:
pp.pprint(input_specifics.keys())

[u'"a',
 u'onfray',
 u'altruistic',
 u'egoism',
 u'galile',
 u'eudaimonia',
 u'jeremy',
 u'aristippus',
 u'aret',
 u'harsanyi',
 u'cyrenaic',
 u'paley',
 u'cyrenaics',
 u'consequentialism',
 u'transhumanism',
 u'eudaimon',
 u'sidgwick',
 u'zhu',
 u'tnnsj',
 u'yangists',
 u'atheist',
 u'grasset',
 u'yangism',
 u'socrates',
 u'jstor',
 u'cyrene',
 u'feldman',
 u'"the',
 u'michel',
 u'epicureanism']


In [99]:
# Group the remaining specific words by stem: stem -> list of (word, count) pairs.
specific_wset_stems = {}
for term, count in input_specifics.items():
    specific_wset_stems.setdefault(stemmer.stem(term), []).append((term, count))
pp.pprint(specific_wset_stems)

{u'"a': [(u'"a', 11)],
 u'"the': [(u'"the', 34)],
 u'altruist': [(u'altruistic', 15)],
 u'aret': [(u'aret', 11)],
 u'aristippu': [(u'aristippus', 35)],
 u'atheist': [(u'atheist', 14)],
 u'consequenti': [(u'consequentialism', 16)],
 u'cyren': [(u'cyrene', 14)],
 u'cyrena': [(u'cyrenaic', 12), (u'cyrenaics', 25)],
 u'egoism': [(u'egoism', 29)],
 u'epicurean': [(u'epicureanism', 21)],
 u'eudaimon': [(u'eudaimon', 14)],
 u'eudaimonia': [(u'eudaimonia', 77)],
 u'feldman': [(u'feldman', 15)],
 u'galil': [(u'galile', 28)],
 u'grasset': [(u'grasset', 28)],
 u'harsanyi': [(u'harsanyi', 11)],
 u'jeremi': [(u'jeremy', 30)],
 u'jstor': [(u'jstor', 11)],
 u'michel': [(u'michel', 17)],
 u'onfray': [(u'onfray', 38)],
 u'paley': [(u'paley', 16)],
 u'sidgwick': [(u'sidgwick', 14)],
 u'socrat': [(u'socrates', 24)],
 u'tnnsj': [(u'tnnsj', 15)],
 u'transhuman': [(u'transhumanism', 12)],
 u'yangism': [(u'yangism', 22)],
 u'yangist': [(u'yangists', 12)],
 u'zhu': [(u'zhu', 11)]}


## 10. Computing representation power of common words.

In [100]:
# Score every common word by the natural log of the ratio between its relative
# frequency in the input corpus and its relative frequency in the reference
# corpus. Results are collected in three parallel lists.
word = []
freq = []
score = []
# Float totals, so the divisions below are true divisions.
nEng = 1.0 * n_english
nInp = 1.0 * n_input
for w in common_words:
    # Consider only words longer than one character.
    if len(w) <= 1:
        continue
    f = input_freq_dist[w]
    print(w, f, english_freq_dist[w])
    s = log((f / nInp) / (english_freq_dist[w] / nEng))
    word.append(w)
    freq.append(f)
    score.append(s)

(u'limited', 7, 106)
(u'code', 6, 39)
(u'consider', 11, 127)
(u'chinese', 7, 56)
(u'demand', 6, 102)
(u'desirable', 14, 36)
(u'results', 9, 149)
(u'founded', 7, 20)
(u'concept', 20, 85)
(u'controversial', 6, 12)
(u'welfare', 8, 53)
(u'go', 6, 626)
(u'follow', 8, 97)
(u'mill', 95, 11)
(u'religious', 7, 165)
(u'children', 6, 355)
(u'whose', 10, 251)
(u'everything', 11, 185)
(u'concerned', 6, 135)
(u'young', 7, 385)
(u'unpleasant', 6, 15)
(u'promoting', 6, 13)
(u'imperative', 8, 10)
(u'include', 9, 113)
(u'worth', 8, 94)
(u'hume', 9, 2)
(u'virtuous', 16, 6)
(u'laws', 7, 88)
(u'far', 8, 426)
(u'choice', 10, 113)
(u'every', 17, 491)
(u'gratification', 7, 4)
(u'word', 15, 274)
(u'difference', 9, 148)
(u'materialism', 6, 7)
(u'school', 36, 493)
(u'judaism', 9, 2)
(u'level', 16, 213)
(u'notes', 10, 57)
(u'excellence', 6, 15)
(u'jamaica', 11, 2)
(u'des', 18, 9)
(u'thinkers', 10, 6)
(u'sensation', 12, 14)
(u'past', 7, 281)
(u'likely', 13, 151)
(u'believes', 6, 43)
(u'even', 41, 1170)
(u'will', 5

In [44]:
# Assemble the common-word table (word, stem, score, term frequency, type),
# ordered from the highest to the lowest score.
common_terms = pd.Series(word)
df_common = (
    pd.DataFrame({
        'Word': common_terms,
        'Stem': common_terms.apply(stemmer.stem),
        'Score': pd.Series(score),
        'Tf': pd.Series(freq),
        'Type': len(common_terms) * ['common'],
    })
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
df_common.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,6.624507,handyman,46,common,handyman
1,6.484745,handymen,20,common,handymen
2,6.28222,printer,49,common,printer
3,6.197063,oman,15,common,oman
4,6.12807,anti,14,common,anti
5,5.791597,addit,30,common,additive
6,5.434922,citat,7,common,citations
7,5.434922,anarchist,7,common,anarchist
8,5.434922,model,7,common,modeling
9,5.434922,1960,7,common,1960s


## 11. Computing makerness of specific terms

In [101]:
# Summary statistics of the common-word scores; they parameterize the scoring
# of the open-maker specific stems below.
# Look 'Score' up by name: positional iloc[0, 0] only hits the score when the
# dict-built columns happen to be ordered alphabetically.
max_score = df_common['Score'].max()
threshold_score = 1.0
# Mean score and mean term frequency over the words scoring above the threshold.
above_threshold = df_common['Score'] > threshold_score
mean_w = df_common.loc[above_threshold, 'Score'].mean()
mean_f = df_common.loc[above_threshold, 'Tf'].mean()
print(max_score, threshold_score, mean_w, mean_f)

(6.6245065252214532, 1.0, 2.3054787693982646, 13.838323353293413)


In [102]:
def compute_speficif_score(f, maxw=10, minw=1, mean_w=3.0, mean_f=25.0):
    """Scale a frequency into a score and clamp it to [minw, maxw].

    The raw score is `(f / mean_f) * mean_w`. It is first capped at `maxw`
    and the result is then raised to at least `minw`.
    """
    scaled = (f / mean_f) * mean_w
    capped = min(scaled, maxw)
    return max(capped, minw)
compute_speficif_score(mean_w, max_score, threshold_score, mean_w, mean_f)

1.0

In [103]:
# Score every selected open-maker stem from its accumulated count, using the
# statistics of the common words. Results are three parallel lists.
stem = list(specific_wset_stems_selected.keys())
freq = [specific_wset_stems_selected[name] for name in stem]
score = [
    compute_speficif_score(count, max_score, threshold_score, mean_w, mean_f)
    for count in freq
]

In [48]:
# Assemble the specific-stem table (stem, score, term frequency, type),
# ordered from the highest to the lowest score.
specific_scores = pd.Series(score)
df_exclusive = (
    pd.DataFrame({
        'Stem': pd.Series(stem),
        'Score': specific_scores,
        'Tf': pd.Series(freq),
        'Type': len(specific_scores) * ['specific'],
    })
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
df_exclusive.head(20)

Unnamed: 0,Score,Stem,Tf,Type
0,6.624507,3d,143,specific
1,6.624507,bricolag,44,specific
2,6.624507,kludg,46,specific
3,6.624507,diy,51,specific
4,5.497834,softwar,33,specific
5,5.331233,kluge,32,specific
6,4.165025,hackerspac,25,specific
7,3.33202,prosum,20,specific
8,3.33202,laser,20,specific
9,3.165419,internet,19,specific


In [104]:
# Stack the common-word and the specific-stem tables and rank them by score.
# pd.concat replaces DataFrame.append, which is removed in pandas 2.0.
df_makerness = (
    pd.concat([df_common, df_exclusive], ignore_index=True)
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
df_makerness.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,6.624507,handyman,46,common,handyman
1,6.624507,3d,143,specific,
2,6.624507,kludg,46,specific,
3,6.624507,diy,51,specific,
4,6.624507,bricolag,44,specific,
5,6.484745,handymen,20,common,handymen
6,6.28222,printer,49,common,printer
7,6.197063,oman,15,common,oman
8,6.12807,anti,14,common,anti
9,5.791597,addit,30,common,additive


In [50]:
df_makerness.head(100)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,6.624507,handyman,46,common,handyman
1,6.624507,3d,143,specific,
2,6.624507,kludg,46,specific,
3,6.624507,diy,51,specific,
4,6.624507,bricolag,44,specific,
5,6.484745,handymen,20,common,handymen
6,6.282220,printer,49,common,printer
7,6.197063,oman,15,common,oman
8,6.128070,anti,14,common,anti
9,5.791597,addit,30,common,additive


In [105]:
# Write the ranked table to ./output/makerness_<theme>.csv.
OUTPUT_FOLDER = "./output/"
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
# Let pandas open the target itself: the previous open(..., 'w') wrapper opened
# the same path a second time and its handle was never used.
df_makerness.to_csv(csvfile_name)

In [106]:
print(csvfile_name)

./output/makerness_Hedonism.csv
