# Discovery and Representation of Open Making Related Terms

This notebook sketches the initial exercise on discovering the open making related keywords. The input text is harvested via a Web crawler that identifies and crawls semantically related wikipedia articles.   

In [6]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from numpy import log, mean
import json, csv, re
import pprint as pp

import pandas as pd

## 1. Loading a reference English language corpus

In [1]:
#nltk.download()
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [2]:
with open("data/stopwords_standard.txt", "r") as f:
    STOP_WORDS_STANDARD = set(f.read().strip().split("\n"))
print(STOP_WORDS_STANDARD)

{"they've", 'could', 'yourselves', "he's", 'for', "weren't", "wouldn't", 'which', 'me', 'below', "doesn't", "isn't", 'its', 'having', 'too', 'this', 'know', 'there', 'them', "they'd", 'no', 'above', 'be', 'her', 'few', "aren't", "won't", 'further', 'what', 'his', "they'll", 'a', 'he', 'only', 'on', 'being', 'other', 'whom', "can't", "i'll", 'same', "don't", "let's", "shan't", "he'll", 'were', 'during', 'has', 'before', "we'll", 'as', 'by', 'off', 'your', "you're", "why's", 'is', 'under', 'such', "he'd", 'www', 'but', "i've", 'each', "couldn't", 'r', 'these', 'we', 'out', 'where', 'when', 'been', 'some', 'you', "we've", "wasn't", 'i', 'so', "you've", 'again', 'http', 'to', "what's", 'why', 'my', "you'd", 'than', 'theirs', 'com', 'while', "that's", "there's", 'after', "didn't", 'does', 'all', 'nor', 'over', 'was', 'had', 'did', 'him', "she'll", 'then', 'until', "here's", 'between', 'yourself', 'if', "we'd", 'once', 'hers', 'itself', 'and', 'ought', 'am', 'she', "haven't", "hasn't", 'our'

### 2.2 Open-making related stop words

To be extended

In [3]:
with open("data/stopwords_openmaker.txt", "r") as f:
    STOP_WORDS_OPENMAKER = set(f.read().strip().split("\n"))
print(STOP_WORDS_OPENMAKER)

{'15th', 'html', 'wikipedia', 'doi', 'well', '9th', 'first', 'etc', '12th', 'vi', 'also', 'britannica', '10th', 'na', 'one', 'wikipedias', 'doc', '13th', 'almost', 'eg', 'isbn', 'ad', '4th', 'may', 'randd', 'iii', '5th', '6th', '11th', 'von', 'second', '1st', 'pp', 'third', '7th', 'many', 'vol', 'pdf', 'org', 'encyclopedia', '2nd', '3rd', '8th', 'tt', 'ii', 'often', 'iv', 'bc', '14th', 'txt'}


## 3. Removing stop words from the reference English corpus

In [4]:
# merging the two list together
STOP_WORDS = STOP_WORDS_STANDARD.union(STOP_WORDS_OPENMAKER)
print(STOP_WORDS)

{'15th', 'could', 'well', '9th', "he's", 'for', "weren't", "wouldn't", 'which', 'me', '10th', 'this', 'above', 'her', "aren't", "won't", 'further', 'what', "they'll", 'isbn', 'randd', 'a', 'only', '5th', '6th', 'being', "can't", 'same', "let's", 'were', "he'll", 'during', 'off', 'by', "you're", 'your', 'iv', '14th', 'is', 'such', 'www', 'but', "couldn't", 'these', 'vi', 'we', 'out', 'where', 'wikipedias', 'so', 'eg', "you've", 'iii', "what's", "you'd", 'than', 'com', '7th', 'many', "that's", 'after', 'vol', 'had', 'did', 'him', 'txt', "here's", 'yourself', 'if', 'first', 'wikipedia', 'once', 'hers', 'itself', 'britannica', 'and', "haven't", 'almost', 'very', 'the', "she'd", 'not', "where's", "mustn't", 'himself', 'from', "it's", 'like', '3rd', 'about', "i'd", 'do', 'themselves', "who's", 'one', "hadn't", 'doing', 'up', '13th', "when's", "you'll", 'ad', "they're", "how's", 'ours ', 'most', 'get', 'often', 'or', 'bc', "i'm", 'more', 'in', "they've", 'html', 'yourselves', '12th', 'below',

In [7]:
# load english words from the Brown corpus removing stop words.
english_freq_dist = FreqDist([w.lower() for w in nltk.corpus.brown.words()
                              if w not in STOP_WORDS])
for stopword in STOP_WORDS:
    if stopword in english_freq_dist:
        del english_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in english_freq_dist:
        del english_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_english = sum(english_freq_dist.values())
english_freq_dist.most_common(10)

[('``', 8837),
 ("''", 8789),
 ('--', 3432),
 ('will', 2245),
 ('said', 1961),
 ('new', 1635),
 ('time', 1598),
 ('two', 1412),
 ('now', 1314),
 ('man', 1207)]

## 4. Loading the input Open Maker corpus

In [7]:
# load the harvested text from wikipedia.
with open("data/corpuses/achievement.json", "r") as f: OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [9]:
# The total number of wiki articles used:
print(len(OM_Corpus))

14


In [10]:
# Column names of the the corpus.
OM_Corpus[0].keys()

dict_keys(['theme', 'theme.id', 'document.id', 'title', 'url', 'depth', 'text'])

In [10]:
pp.pprint(OM_Corpus[0]['theme'])
pp.pprint(OM_Corpus[0]['theme.id'])

'achievement'
2


In [11]:
def display_articles(corpus, tid):
    articles = [article for article in corpus if article['theme.id'] == tid]
    for article in articles:
        print(article['document.id'],article['theme.id'], article['theme'], article['depth'], article['url'])

In [12]:
display_articles(OM_Corpus, 2)

1 2 achievement 0 https://en.wikipedia.org/wiki/Need_for_achievement
2 2 achievement 0 https://en.wikipedia.org/wiki/Social_influence
3 2 achievement 1 https://en.wikipedia.org/wiki/Goal_orientation
4 2 achievement 1 https://en.wikipedia.org/wiki/Need_theory
5 2 achievement 1 https://en.wikipedia.org/wiki/Propaganda
6 2 achievement 1 https://en.wikipedia.org/wiki/Mind_shaping
7 2 achievement 1 https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system
8 2 achievement 1 https://en.wikipedia.org/wiki/Impression_management
9 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_power
10 2 achievement 1 https://en.wikipedia.org/wiki/Social_proof
11 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_cognition
12 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_affiliation
13 2 achievement 1 https://en.wikipedia.org/wiki/Bystander_effect
14 2 achievement 1 https://en.wikipedia.org/wiki/Authority_bias


## 5. Analyzing and cleaning a specific corpus based on a theme

In [13]:
def get_theme(Corpus, theme_id):
    title = ''
    for article in Corpus:
        if article['theme.id'] == theme_id:
            title = article['theme']
            break
    return title

### 5.0 Selecting the specific theme (a sub-corpus).

In [14]:
## For a different sub-corpus use a corresponding theme ID.
current_theme_id = 2

In [15]:
current_theme = get_theme(OM_Corpus, current_theme_id)

In [16]:
output_fname = "_".join([word.capitalize() for word in current_theme.split(" ")])
print(current_theme, "::", output_fname)

achievement :: Achievement


In [17]:
display_articles(OM_Corpus, current_theme_id)

1 2 achievement 0 https://en.wikipedia.org/wiki/Need_for_achievement
2 2 achievement 0 https://en.wikipedia.org/wiki/Social_influence
3 2 achievement 1 https://en.wikipedia.org/wiki/Goal_orientation
4 2 achievement 1 https://en.wikipedia.org/wiki/Need_theory
5 2 achievement 1 https://en.wikipedia.org/wiki/Propaganda
6 2 achievement 1 https://en.wikipedia.org/wiki/Mind_shaping
7 2 achievement 1 https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system
8 2 achievement 1 https://en.wikipedia.org/wiki/Impression_management
9 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_power
10 2 achievement 1 https://en.wikipedia.org/wiki/Social_proof
11 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_cognition
12 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_affiliation
13 2 achievement 1 https://en.wikipedia.org/wiki/Bystander_effect
14 2 achievement 1 https://en.wikipedia.org/wiki/Authority_bias


In [18]:
def filter_corpus(corpus, remove_list= []):
    return([p for p in corpus if p['document.id'] not in remove_list])

In [19]:
fcorpus = filter_corpus(OM_Corpus, remove_list = [7,13])
display_articles(fcorpus, current_theme_id)

1 2 achievement 0 https://en.wikipedia.org/wiki/Need_for_achievement
2 2 achievement 0 https://en.wikipedia.org/wiki/Social_influence
3 2 achievement 1 https://en.wikipedia.org/wiki/Goal_orientation
4 2 achievement 1 https://en.wikipedia.org/wiki/Need_theory
5 2 achievement 1 https://en.wikipedia.org/wiki/Propaganda
6 2 achievement 1 https://en.wikipedia.org/wiki/Mind_shaping
8 2 achievement 1 https://en.wikipedia.org/wiki/Impression_management
9 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_power
10 2 achievement 1 https://en.wikipedia.org/wiki/Social_proof
11 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_cognition
12 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_affiliation
14 2 achievement 1 https://en.wikipedia.org/wiki/Authority_bias


In [20]:
def merge_documents(corpus, theme_id, filters = []):
    docs = [page for page in corpus if page['theme.id'] == theme_id]
    docs_filtered = [page['text'] for page in corpus if page['document.id'] not in filters]
    text = "\n".join(docs_filtered)
    return(text)

In [21]:
input_text = merge_documents(fcorpus, current_theme_id)

In [22]:
pp.pprint(input_text)

('Need for achievement \n'
 ' Need for achievement \n'
 ' N-Ach \n'
 " refers to an individual's desire for significant accomplishment mastering "
 'of skills control or high standards The term was first used by \n'
 ' Henry Murray \n'
 ' and associated with a range of actions These include "intense prolonged and '
 'repeated efforts to accomplish something difficult To work with singleness '
 'of purpose towards a high and distant \n'
 ' goal \n'
 ' To have the determination to win" The concept of N-Ach was subsequently '
 'popularised by the psychologist \n'
 ' David McClelland \n'
 ' This personality trait is characterized by an enduring and consistent '
 'concern with setting and meeting high standards of achievement This need is '
 'influenced by internal drive for action intrinsic motivation and the '
 'pressure exerted by the expectations of others extrinsic motivation Measured '
 'with the \n'
 ' thematic apperception test \n'
 ' TAT need for achievement motivates an individual

 'it his own Conversion may also refer to individual members of a group '
 'changing from their initial and varied opinions to adopt the opinions of '
 'others which may differ from their original opinions The resulting group '
 'position may be a hybrid of various aspects of individual initial opinions '
 'or it may be an alternative independent of the initial positions reached '
 'through consensus \n'
 ' What appears to be conformity may in fact be congruence Congruence occurs '
 "when an individual's behavior belief or thinking is already aligned with "
 'that of the others and no change occurs \n'
 ' In situations where conformity including compliance conversion and '
 'congruence is absent there are non-conformity processes such as independence '
 'and anti-conformity Independence also referred to as dissent involves an '
 'individual either through their actions or lack of action or through the '
 'public expression of their beliefs or thinking being aligned with their '
 'perso

 'above and beyond cognitive ability \n'
 ' Based on this research goal orientation rather than cognitive ability '
 'serves as useful tool for practitioners to use to predict job performance \n'
 ' Need for achievement \n'
 ' Need for achievement \n'
 ' Need for achievement refers to the degree to which an individual "maintains '
 'high standards" and "aspires to accomplish difficult tasks" \n'
 ' Goal orientation dimensions have been conceptualized as manifestations of '
 "Atkinson's need for achievement and need to avoid failure "
 'competence-relevant motives \n'
 ' In a meta-analysis by Payne et al \n'
 ' the authors found that need for achievement was positively correlated with '
 'LGO negatively associated with APGO and unrelated to PPGO Another '
 'interesting finding by these authors was that need for achievement '
 'correlated more strongly with LGO than the trait conscientiousness Although '
 'LGO and need for achievement were found to be strongly related the findings '
 'de

 'propagandists i e the use of emotionally provocative imagery to distort '
 "facts Workplace propaganda is suggested to use 'distorted data' to overrule "
 'emotion For example workplace propaganda may provide rationales for '
 'ideologically driven pay cuts \n'
 ' Techniques \n'
 ' Further information \n'
 ' Propaganda techniques \n'
 ' Anti-capitalist \n'
 ' propaganda \n'
 ' Common media for transmitting propaganda messages include news reports '
 'government reports historical revision \n'
 ' junk science \n'
 ' books leaflets \n'
 ' movies \n'
 ' radio television and posters Some propaganda campaigns follow a strategic '
 'transmission pattern to \n'
 ' indoctrinate \n'
 ' the target group This may begin with a simple transmission such as a '
 'leaflet or advertisement dropped from a plane or an advertisement Generally '
 'these messages will contain directions on how to obtain more information via '
 'a web site hot line radio program etc as it is seen also for selling '
 'purpo

 ' were circulated during the late 1930s and contained depictions of Jews as '
 'devils child molesters and other morally charged figures Slogans such as '
 '"Judas the Jew betrayed Jesus the German to the Jews" were recited in '
 'class \n'
 ' The following is an example of a propagandistic math problem recommended by '
 'the National Socialist Essence of Education "The Jews are aliens in '
 'Germanyin there were inhabitants in the German Reich of whom 75% were Jews '
 '" \n'
 ' Ace military \n'
 ' Black propaganda \n'
 ' Cartographic propaganda \n'
 ' Crowd manipulation \n'
 ' Disinformation \n'
 ' Edith Cavell First World War propaganda \n'
 ' Fake news \n'
 ' Fake news website \n'
 ' Mind games \n'
 ' Misinformation \n'
 ' Moral panic \n'
 ' Music and political warfare \n'
 ' Nazi propaganda \n'
 ' Overview of 21st century propaganda \n'
 ' Perception management \n'
 ' Politainment \n'
 ' Political warfare \n'
 ' Post-truth politics \n'
 ' Category Propaganda by country \n'
 ' Prop

 ' "The Machiavellian Boss" \n'
 ' Psychology Today \n'
 ' ^ \n'
 ' Leary Kowalski \n'
 ' ^ \n'
 ' Schlenker \n'
 ' p \n'
 ' ^ \n'
 ' a \n'
 ' b \n'
 ' Schlenker \n'
 ' p \n'
 ' ^ \n'
 ' Schlenker \n'
 ' p \n'
 ' ^ \n'
 ' Schlenker \n'
 ' p \n'
 ' ^ \n'
 ' Moffitt Kimberly \n'
 ' "Social Interactions Definition & Types" \n'
 ' ^ \n'
 ' Brown Jonathon \n'
 ' "CHAPTER SELF-PRESENTATION" \n'
 ' PDF \n'
 ' ^ \n'
 ' a \n'
 ' b \n'
 ' Rosenfeld Paul Giacalone Robert A Riordan Catherine A 1994-03-01 \n'
 ' "Impression Management Theory and Diversity Lessons for Organizational '
 'Behavior" \n'
 ' American Behavioral Scientist \n'
 ' doi \n'
 ' 1177/0002764294037005002 \n'
 ' ISSN \n'
 ' 0002-7642 \n'
 ' ^ \n'
 ' a \n'
 ' b \n'
 ' Norris Ashley \n'
 ' "Impression Management Considering Cultural Social and Spiritual Factors" \n'
 ' ^ \n'
 ' a \n'
 ' b \n'
 ' c \n'
 ' d \n'
 ' Lewin Simon Reeves Scott 2011-05-01 "Enacting \'team\' and \'teamwork\' '
 "using Goffman's theory of impression managem

 'negative outcomes with the result often depending on whether an individual '
 'also reports a strong sense of responsibility When combined with a low score '
 'on a measure of responsibility a high nPow score predicts higher rates of \n'
 ' externalizing \n'
 ' self-destructive \n'
 ' behavior such as binge-drinking and physical aggression Men with this '
 'combination of personality traits are more likely to divorce separate or '
 'physically abuse their spouses However this association disappears for '
 'individuals with average or high responsibility scores who are '
 'disproportionately likely to report positive outcomes like taking on social '
 'leadership roles \n'
 ' For society \n'
 ' As with individual outcomes whether a high need for power results in '
 "positive or negative outcomes is influenced by the individual's other traits "
 'particularly responsibility and empathy An argumentative group member may '
 'prevent \n'
 ' groupthink \n'
 ' or they may intimidate other gr

 ' self-esteem \n'
 ' masculine sex-role attitudes and \n'
 ' absorption \n'
 ' NFC is negatively related to \n'
 ' social anxiety \n'
 ' more strongly in females than males \n'
 ' It has been speculated that people who more carefully analyse their world '
 'feel a greater sense of mastery and hence greater self-esteem although it is '
 'also possible that higher self-esteem may lead to greater motivation to '
 'engage in thinking \n'
 ' NFC may be related to masculine sex-role due to the stereotype associating '
 'masculinity with rationality \n'
 ' Regarding absorption people high in NFC may find it easier to devote their '
 'attentional processes exclusively to intellectual tasks \n'
 ' Regarding social anxiety it is possible that greater attention to cognitive '
 'activity may be associated with reduced attention to social cues associated '
 'with negative evaluation \n'
 ' Consumers \n'
 ' Research has shown that high-need for cognition consumers prefer '
 'advertising that featur

In [23]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words,current_theme)

29696 achievement


### 5.1 Computing frequency distributions of each token, i.e word, term, pancuation, etc.

In [24]:
input_freq_dist = FreqDist(tokenized)

In [25]:
input_freq_dist.most_common(10)

[('\n', 2452),
 ('the', 1277),
 ('of', 895),
 ('to', 879),
 ('and', 813),
 ('a', 600),
 ('in', 593),
 ('that', 369),
 ('for', 336),
 ('is', 330)]

### 5.2 Removing punctuation and stopwords from the input corpus

In [26]:
for stopword in STOP_WORDS:
    if stopword in input_freq_dist:
        del input_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in input_freq_dist:
        del input_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_input = sum(input_freq_dist.values())
input_freq_dist.most_common(10)

[('social', 202),
 ('s', 173),
 ('need', 171),
 ('propaganda', 146),
 ('people', 139),
 ('goal', 132),
 ('orientation', 103),
 ('self', 94),
 ('high', 93),
 ('others', 85)]

### 5.3 Removing all numbered words

This is an example case for post proceesing in terms of cleaning. The pre-processing, that is data cleaning/preperation during or right after harvesting should be further improved to avoid such processes at this stage.

In [27]:
pattern_letters = re.compile('[a-z]')
def has_letters(x):
    return(pattern_letters.search(x) is not None)

In [28]:
reduced = {k:v for k,v in input_freq_dist.items() if has_letters(k)}
print("Reduction due to all number matches: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to all number matches:  94


### 5.4 Removing single character words


In [29]:
reduced = {k:v for k,v in input_freq_dist.items() if len(k) > 1}
print("Reduction due to single characters: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to single characters:  19


### 5.5 Removing rare words from input distribution

In [30]:
reduced = {k:v for k,v in input_freq_dist.items() if v > 5}
print("Reduction due to rare occurances: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to rare occurances:  3957


## 6. Comparing input vs English corpus volumes

### 6.1 Total words (after cleaning the stopwords) 

In [31]:
print(n_input, n_english)

16206 544168


### 6.2 Number of unique words (after cleaning stopwords and rare words)

In [32]:
n_unique_word_input = len(input_freq_dist.items())
n_unique_word_brown = len(english_freq_dist.items())
n_unique_word_input, n_unique_word_brown

(588, 49598)

### 6.3 Cleaned set of input words/terms

List of words in the corpus in case, for a visual inspection. Such inspections will be used both to improve tokenization as well as filtering.

In [33]:
pp.pprint(sorted(input_freq_dist.items(), key=lambda x:x[1], reverse=True))

[('social', 202),
 ('need', 171),
 ('propaganda', 146),
 ('people', 139),
 ('goal', 132),
 ('orientation', 103),
 ('self', 94),
 ('high', 93),
 ('others', 85),
 ('performance', 85),
 ('management', 81),
 ('influence', 80),
 ('individual', 74),
 ('achievement', 72),
 ('individuals', 72),
 ('learning', 72),
 ('impression', 71),
 ('person', 67),
 ('theory', 62),
 ('information', 62),
 ('personality', 54),
 ('found', 53),
 ('will', 52),
 ('group', 48),
 ('power', 46),
 ('research', 45),
 ('cognition', 45),
 ('used', 42),
 ('work', 41),
 ('communication', 41),
 ('mcclelland', 40),
 ('behavior', 40),
 ('use', 40),
 ('example', 39),
 ('goals', 38),
 ('affiliation', 37),
 ('situations', 37),
 ('likely', 37),
 ('corporate', 36),
 ('war', 34),
 ('proof', 33),
 ('public', 32),
 ('low', 30),
 ('psychology', 30),
 ('cognitive', 30),
 ('control', 29),
 ('related', 29),
 ('new', 29),
 ('situation', 29),
 ('two', 29),
 ('study', 29),
 ('nfc', 29),
 ('needs', 28),
 ('authority', 28),
 ('media', 28),
 (

### 6.4 Set of terms/words that occure in both corpus.

In [34]:
len(input_freq_dist.keys())

588

In [35]:
common_words = [w for w in set(input_freq_dist.keys()) & set(english_freq_dist.keys())]
print(len(common_words))

549


In [36]:
pp.pprint(sorted(common_words))

['20th',
 'abilities',
 'ability',
 'academic',
 'accept',
 'acceptance',
 'accepted',
 'according',
 'achieve',
 'achievement',
 'act',
 'action',
 'actions',
 'activities',
 'activity',
 'actually',
 'addition',
 'adopt',
 'advertising',
 'affect',
 'affiliation',
 'against',
 'al',
 'although',
 'ambiguous',
 'american',
 'among',
 'analysis',
 'another',
 'antecedents',
 'anti',
 'appearance',
 'approach',
 'appropriate',
 'argued',
 'argues',
 'article',
 'asked',
 'aspects',
 'associated',
 'attempt',
 'attention',
 'attitude',
 'attitudes',
 'attractive',
 'audience',
 'audiences',
 'authority',
 'authors',
 'avoid',
 'avoidance',
 'based',
 'become',
 'behavior',
 'behaviors',
 'behaviour',
 'belief',
 'beliefs',
 'believe',
 'believed',
 'best',
 'better',
 'bias',
 'biases',
 'big',
 'black',
 'book',
 'brand',
 'bring',
 'called',
 'cause',
 'causes',
 'centered',
 'century',
 'certain',
 'change',
 'channels',
 'characteristics',
 'children',
 'civil',
 'cognitive',
 'cohen

### 6.5 Set of terms/words that occur in the sample but not in the reference corpus.

This specific set will be incorporated later below. 

In [37]:
input_specifics = dict()
for w in set(input_freq_dist.keys()) - set(english_freq_dist.keys()):
    input_specifics[w] = input_freq_dist[w]

In [38]:
print(len(input_specifics))

39


In [39]:
pp.pprint(sorted(input_specifics))

['"a',
 '"the',
 'ach',
 'apgo',
 'apperception',
 'cacioppo',
 'cialdini',
 'cmc',
 'cognition',
 'cognition"',
 'conceptualized',
 'constructs',
 'copycat',
 'dweck',
 'eison',
 'goffman',
 'im',
 'internalization',
 'interprofessional',
 'kelman',
 'lgo',
 'mcclelland',
 'mediated',
 'meta',
 'metacognition',
 'metacognitive',
 'milgram',
 'motivational',
 'networking',
 'nfc',
 'nicholls',
 'npow',
 'online',
 'openness',
 'reactance',
 'schlenker',
 'strategies',
 'vandewalle',
 'workplace']


## 7. Stemming

In [40]:
stemmer = PorterStemmer()
input_wset_stems = {k: stemmer.stem(k) for k in input_freq_dist.keys()}
common_wset_stems = {k: stemmer.stem(k) for k in common_words}
pp.pprint(common_wset_stems)

{'20th': '20th',
 'abilities': 'abil',
 'ability': 'abil',
 'academic': 'academ',
 'accept': 'accept',
 'acceptance': 'accept',
 'accepted': 'accept',
 'according': 'accord',
 'achieve': 'achiev',
 'achievement': 'achiev',
 'act': 'act',
 'action': 'action',
 'actions': 'action',
 'activities': 'activ',
 'activity': 'activ',
 'actually': 'actual',
 'addition': 'addit',
 'adopt': 'adopt',
 'advertising': 'advertis',
 'affect': 'affect',
 'affiliation': 'affili',
 'against': 'against',
 'al': 'al',
 'although': 'although',
 'ambiguous': 'ambigu',
 'american': 'american',
 'among': 'among',
 'analysis': 'analysi',
 'another': 'anoth',
 'antecedents': 'anteced',
 'anti': 'anti',
 'appearance': 'appear',
 'approach': 'approach',
 'appropriate': 'appropri',
 'argued': 'argu',
 'argues': 'argu',
 'article': 'articl',
 'asked': 'ask',
 'aspects': 'aspect',
 'associated': 'associ',
 'attempt': 'attempt',
 'attention': 'attent',
 'attitude': 'attitud',
 'attitudes': 'attitud',
 'attractive': 'at

 'seen': 'seen',
 'self': 'self',
 'sense': 'sens',
 'serve': 'serv',
 'set': 'set',
 'setting': 'set',
 'show': 'show',
 'showing': 'show',
 'shown': 'shown',
 'shows': 'show',
 'similar': 'similar',
 'since': 'sinc',
 'sites': 'site',
 'situation': 'situat',
 'situations': 'situat',
 'skills': 'skill',
 'social': 'social',
 'society': 'societi',
 'someone': 'someon',
 'source': 'sourc',
 'sources': 'sourc',
 'soviet': 'soviet',
 'specific': 'specif',
 'standards': 'standard',
 'state': 'state',
 'stated': 'state',
 'states': 'state',
 'status': 'statu',
 'stories': 'stori',
 'strategy': 'strategi',
 'strength': 'strength',
 'strong': 'strong',
 'strongly': 'strongli',
 'structure': 'structur',
 'students': 'student',
 'studied': 'studi',
 'studies': 'studi',
 'study': 'studi',
 'subject': 'subject',
 'subjects': 'subject',
 'success': 'success',
 'successful': 'success',
 'suggested': 'suggest',
 'suggests': 'suggest',
 'surrounding': 'surround',
 'system': 'system',
 'take': 'take',

## 8. Handling input specific term set

### 8.1 Identfying matching stems with common words. 

Note that the frequency counts are transferred accordingly.

In [41]:
specifics = {}
for k,v in input_specifics.items():
    stem = input_wset_stems[k]
    words = [w for w,s in common_wset_stems.items() if s == stem]
    if words:
        w = words[0]
        input_freq_dist[w] += input_specifics[k]
        continue
    specifics[k] = v
# Removing the words with matching stems from the specific set.
print("Reduction due to stemm matches: ", len(input_specifics) - len(specifics))
input_specifics = specifics

Reduction due to stemm matches:  6


### 8.2 Removing open-maker specific terms.

In [42]:
with open("data/specifics_openmaker.txt", "r") as f:
    SPECIFICS_OPENMAKER = set(f.read().strip().split("\n"))
om_specific_stems = {stemmer.stem(k):k for k in SPECIFICS_OPENMAKER}
pp.pprint(om_specific_stems)

{'3-d': '3-d',
 '3d': '3d',
 'abat': 'abatement',
 'afford': 'affordable',
 'agenda21': 'agenda21',
 'anarch': 'anarchism',
 'autonom': 'autonomous',
 'biodiesel': 'biodiesel',
 'biodivers': 'biodiversity',
 'biofuel': 'biofuel',
 'bioga': 'biogas',
 'biomass': 'biomass',
 'biospher': 'biosphere',
 'bricolag': 'bricolage',
 'brundtland': 'brundtland',
 'c2c': 'c2c',
 'cad': 'cad',
 'cap-and-trad': 'cap-and-trade',
 'carfre': 'carfree',
 'cdm': 'cdm',
 'christensen': 'christensen',
 'co-creat': 'co-creation',
 'co-develop': 'co-develop',
 'co-invent': 'co-invention',
 'co-inventor': 'co-inventor',
 'coextinct': 'coextinction',
 'cognit': 'cognition',
 'commons-bas': 'commons-based',
 'computer-aid': 'computer-aided',
 'conferenc': 'conferencing',
 'consortium': 'consortium',
 'constraint': 'constraints',
 'construct': 'construct',
 'copyleft': 'copyleft',
 'copyright': 'copyright',
 'cradle-to-cradl': 'cradle-to-cradle',
 'crowdsourc': 'crowdsourcing',
 'crowdwork': 'crowdworker',
 'cuv

In [43]:
specific_wset_dirty = {}
specific_wset_stems_selected = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in om_specific_stems.keys():
        if k in specific_wset_stems_selected.keys():
            specific_wset_stems_selected[stem] += v
        else:
            specific_wset_stems_selected[stem] = v
        continue
    if v > 10:
        specific_wset_dirty[k] = v
        
input_specifics = specific_wset_dirty

In [44]:
# The set of stems to be added to the set with makerness counts.

pp.pprint(specific_wset_stems_selected)

{'onlin': 8}


### 8.3 Remaining frequent input specifics
The manual checking can help to determine what should go into "specifics_openmaker.txt"

In [45]:
print(len(input_specifics))
pp.pprint(sorted(input_specifics))

10
['"the',
 'cmc',
 'conceptualized',
 'goffman',
 'lgo',
 'mcclelland',
 'nfc',
 'openness',
 'vandewalle',
 'workplace']


In [46]:
specific_wset_stems = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in specific_wset_stems.keys():
        specific_wset_stems[stem].append((k,v))
    else:
        specific_wset_stems[stem] = [(k,v)]
pp.pprint(specific_wset_stems)

{'"the': [('"the', 14)],
 'cmc': [('cmc', 12)],
 'conceptu': [('conceptualized', 12)],
 'goffman': [('goffman', 16)],
 'lgo': [('lgo', 15)],
 'mcclelland': [('mcclelland', 40)],
 'nfc': [('nfc', 29)],
 'open': [('openness', 12)],
 'vandewal': [('vandewalle', 11)],
 'workplac': [('workplace', 11)]}


## 9. Computing representation power of common words.

In [47]:
# combine
word = []
freq = []
score = []
# common_words = [w[0] for w in common_words]
nEng = 1.0 * n_english
nInp = 1.0 * n_input
for w in common_words:
    # Consider only words whose charcater length is larger than 1
    if len(w) > 1:
        # Log likelihood scores are computed:
        print(w, input_freq_dist[w], english_freq_dist[w])
        s = log((input_freq_dist[w] / nInp) / (english_freq_dist[w] / nEng))
        f = input_freq_dist[w]
        word.append(w)
        freq.append(f)
        score.append(s)

negative 21 53
human 18 299
impact 9 67
colleagues 9 23
creating 7 29
attitudes 8 48
responsibility 8 118
take 9 610
gender 6 2
show 10 288
development 7 334
without 6 583
civil 6 91
thoughts 6 54
corporate 36 19
strategy 22 22
success 13 93
status 7 97
big 12 360
types 17 116
new 29 1635
good 6 806
channels 7 23
want 8 328
informational 10 2
lack 9 110
learning 72 60
cultures 7 12
proof 33 40
explain 6 64
construct 23 12
view 9 186
known 9 245
surrounding 8 27
efforts 8 127
high 93 497
play 7 200
presence 9 76
false 13 29
maintain 6 60
content 10 53
importance 13 108
objective 6 91
control 29 223
desire 14 79
self 94 39
school 11 493
soviet 6 129
receive 6 76
avoid 23 58
satisfaction 8 28
best 13 351
communication 41 67
used 42 611
information 62 269
media 28 13
causes 6 58
against 10 627
potential 11 67
perceptions 7 9
showing 6 61
motivated 21 9
front 9 221
competence 16 18
act 9 283
activities 11 115
group 48 390
communist 6 97
either 19 284
person 67 174
term 25 79
time 23 1598
or

In [48]:
m = pd.Series(score)
f = pd.Series(freq)
k = pd.Series(word)
stem = k.apply(stemmer.stem)
scoring = len(k) * ['common']
df_common = pd.DataFrame({'Word':k, 'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_common.sort_values(by='Score', ascending=False, inplace = True)
df_common.reset_index(inplace=True, drop=True)
df_common.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,7.138217,cognit,75,common,cognitive
1,6.152934,anti,14,common,anti
2,5.998783,orient,12,common,orientations
3,5.7385,affili,37,common,affiliation
4,5.711101,behavior,9,common,behaviors
5,5.711101,tat,9,common,tat
6,5.593318,norm,8,common,normative
7,5.459787,themat,7,common,thematic
8,5.410996,trait,20,common,trait
9,5.376017,orient,103,common,orientation


### 9.1 Computing makerness of specific terms

In [49]:
max_score = df_common.iloc[0,0]
threshold_score = 1.0
mean_w = df_common.Score[df_common.Score > threshold_score].mean()
mean_f = df_common.Tf[df_common.Score > threshold_score].mean()
print(max_score, threshold_score, mean_w, mean_f)

7.13821741165 1.0 2.4332084572479884 18.131195335276967


In [50]:
def compute_speficif_score(f, maxw=10, minw=1, mean_w=3.0, mean_f=25.0):
    return(max(min((f / mean_f) * mean_w, maxw), minw))
compute_speficif_score(mean_w, max_score, threshold_score, mean_w, mean_f)

1.0

In [51]:
stem = []
freq = []
score = []
for k, v in specific_wset_stems_selected.items():
    s = compute_speficif_score(v, max_score, threshold_score, mean_w, mean_f)
    stem.append(k)
    freq.append(v)
    score.append(s)

## 10. Tabulating the results and generating the output file

In [52]:
m = pd.Series(score)
f = pd.Series(freq)
stem = pd.Series(stem)
scoring = len(m) * ['specific']
df_exclusive= pd.DataFrame({'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_exclusive.sort_values(by='Score', ascending=False, inplace = True)
df_exclusive.reset_index(inplace=True, drop=True)
df_exclusive.head(20)

Unnamed: 0,Score,Stem,Tf,Type
0,1.073601,onlin,8,specific


In [53]:
df_makerness = df_common.append(df_exclusive, ignore_index=True)
df_makerness.sort_values(by='Score', ascending=False, inplace = True)
df_makerness.reset_index(inplace=True, drop=True)
df_makerness.head(10)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,7.138217,cognit,75,common,cognitive
1,6.152934,anti,14,common,anti
2,5.998783,orient,12,common,orientations
3,5.7385,affili,37,common,affiliation
4,5.711101,behavior,9,common,behaviors
5,5.711101,tat,9,common,tat
6,5.593318,norm,8,common,normative
7,5.459787,themat,7,common,thematic
8,5.410996,trait,20,common,trait
9,5.376017,orient,103,common,orientation


In [54]:
df_makerness.tail(10)

Unnamed: 0,Score,Stem,Tf,Type,Word
540,-0.926157,men,9,common,men
541,-0.930912,much,11,common,much
542,-0.985933,even,13,common,even
543,-1.0089,must,11,common,must
544,-1.038244,american,6,common,american
545,-1.04,great,7,common,great
546,-1.062551,without,6,common,without
547,-1.112729,right,6,common,right
548,-1.314437,made,9,common,made
549,-1.386448,good,6,common,good


In [55]:
OUTPUT_FOLDER = "./output/"
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
with open(csvfile_name, 'w') as csvfile:
    #thewriter = csv.writer(csvfile, delimiter=',')
    #for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True): thewriter.writerow([k,v[0],v[1]])
    df_makerness.to_csv(csvfile_name)

### 10.1 Outputfile name for the theme

In [56]:
print(csvfile_name)

./output/makerness_Achievement.csv


In [52]:
%connect_info

{
  "stdin_port": 57010, 
  "ip": "127.0.0.1", 
  "control_port": 57011, 
  "hb_port": 57012, 
  "signature_scheme": "hmac-sha256", 
  "key": "4fccddc9-093c3731ac31185888272232", 
  "kernel_name": "", 
  "shell_port": 57008, 
  "transport": "tcp", 
  "iopub_port": 57009
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-c0b883ea-4350-4fd0-9746-b97ca85b048e.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.
