# Discovery and Representation of Open Making Related Terms

This notebook sketches the initial exercise on discovering the open making related keywords. The input text is harvested via a Web crawler that identifies and crawls semantically related wikipedia articles.   

In [1]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from numpy import log, mean
import json, csv, re
import pprint as pp

import pandas as pd

## 1. Loading a reference English language corpus

In [2]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [3]:
with open("data/stopwords_standard.txt", "r") as f:
    STOP_WORDS_STANDARD = set(f.read().strip().split("\n"))
print(STOP_WORDS_STANDARD)

{'doing', "doesn't", "they'll", 'would', 'that', 'www', "shouldn't", 'over', 'or', "you're", 'while', 'a', "didn't", 'itself', 'how', "he'll", 'for', 'too', 'me', 'after', "he'd", "where's", 'again', 'in', 'such', 'no', 'having', 'because', 'been', "you'll", 'its', 'get', 'like', 'here', 'our', 'nor', "haven't", 'hers', "i've", 'has', 'any', "hasn't", "that's", 'do', 'through', "hadn't", 'being', 'it', 'about', 'which', 'com', 'your', 'yourself', 'those', 'with', 'so', 'very', 'as', 'by', 'than', 'off', 'were', "i'm", 'under', "she'd", "i'd", 'during', 'myself', "wouldn't", 'be', 'just', 'on', 'are', "there's", 'until', 'of', 'all', "she'll", 'http', "shan't", 'to', 'at', "he's", 'some', "let's", 'should', 'have', 'he', 'was', 'not', "isn't", 'could', 'out', 'above', 'cannot', 'ours ', 'whom', 'r', 'when', 'my', 'ourselves', 'each', "you've", 'this', 'who', "it's", "won't", 'and', 'down', 'they', 'where', 'further', 'same', 'there', 'what', 'own', "they'd", "we've", 'does', 'the', 'you

### 2.2 Open-making related stop words

In [4]:
with open("data/stopwords_openmaker.txt", "r") as f:
    STOP_WORDS_OPENMAKER = set(f.read().strip().split("\n"))
print(STOP_WORDS_OPENMAKER)

{'iv', 'wikipedia', '6th', '5th', 'bc', 'iii', '12th', '8th', 'often', 'one', 'wikipedias', '14th', 'also', 'well', 'vi', 'eg', 'vol', 'na', 'von', 'ad', 'randd', 'third', 'first', 'etc', 'ii', '1st', 'org', 'second', '7th', 'may', '11th', 'many', '10th', '13th', 'pp', 'tt', 'encyclopedia', 'almost', 'pdf', 'html', 'britannica', 'doi', '15th', 'isbn', '3rd', '4th', 'doc', '9th', '2nd', 'txt'}


## 3. Removing stop words from the reference English corpus

In [5]:
# merging the two list together
STOP_WORDS = STOP_WORDS_STANDARD.union(STOP_WORDS_OPENMAKER)
print(STOP_WORDS)

{'doing', 'iii', '8th', 'would', 'www', 'over', 'eg', 'ad', 'a', 'itself', 'how', 'too', 'me', "where's", 'again', 'having', 'because', 'here', 'our', 'doc', "haven't", 'hers', '6th', 'has', "hasn't", 'do', 'often', 'vi', 'it', 'about', 'your', 'com', 'yourself', 'those', 'with', 'first', 'as', 'org', '10th', "i'm", 'pp', 'under', 'pdf', 'html', 'during', "wouldn't", 'be', 'just', 'are', 'until', 'of', "she'll", "shan't", 'bc', "he's", "let's", 'vol', 'have', 'he', 'not', 'above', 'ours ', 'ii', 'r', 'my', 'and', 'down', '13th', 'further', 'same', 'what', 'yours', 'own', "they'd", 'the', "aren't", 'did', 'themselves', 'we', 'had', 'then', 'why', 'only', 'one', 'i', 'wikipedias', 'also', 'into', "she's", 'him', "couldn't", 'both', "don't", 'ought', 'most', 'from', "how's", 'below', 'himself', "they're", 'their', 'theirs', 'once', '7th', "you'd", "why's", 'can', 'other', "we're", 'if', 'an', "mustn't", 'between', "doesn't", '5th', "they'll", 'that', "shouldn't", 'well', "you're", 'or', '

In [6]:
# load english words from the Brown corpus removing stop words.
english_freq_dist = FreqDist([w.lower() for w in nltk.corpus.brown.words()
                              if w not in STOP_WORDS])
for stopword in STOP_WORDS:
    if stopword in english_freq_dist:
        del english_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in english_freq_dist:
        del english_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_english = sum(english_freq_dist.values())
english_freq_dist.most_common(10)

[('``', 8837),
 ("''", 8789),
 ('--', 3432),
 ('will', 2245),
 ('said', 1961),
 ('new', 1635),
 ('time', 1598),
 ('two', 1412),
 ('now', 1314),
 ('man', 1207)]

## 4. Loading the input Open Maker corpus

In [7]:
# load the harvested text from wikipedia.
with open("data/corpuses/achievement.json", "r") as f: OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [8]:
# The total number of wiki articles used:
print(len(OM_Corpus))

14


In [9]:
# Column names of the the corpus.
OM_Corpus[0].keys()

dict_keys(['theme', 'theme.id', 'document.id', 'title', 'url', 'depth', 'text'])

In [10]:
pp.pprint(OM_Corpus[0]['theme'])
pp.pprint(OM_Corpus[0]['theme.id'])

'achievement'
2


In [11]:
def display_articles(corpus, tid):
    articles = [article for article in corpus if article['theme.id'] == tid]
    for article in articles:
        print(article['document.id'],article['theme.id'], article['theme'], article['depth'], article['url'])

In [12]:
display_articles(OM_Corpus, 2)

1 2 achievement 0 https://en.wikipedia.org/wiki/Need_for_achievement
2 2 achievement 0 https://en.wikipedia.org/wiki/Social_influence
3 2 achievement 1 https://en.wikipedia.org/wiki/Goal_orientation
4 2 achievement 1 https://en.wikipedia.org/wiki/Need_theory
5 2 achievement 1 https://en.wikipedia.org/wiki/Propaganda
6 2 achievement 1 https://en.wikipedia.org/wiki/Mind_shaping
7 2 achievement 1 https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system
8 2 achievement 1 https://en.wikipedia.org/wiki/Impression_management
9 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_power
10 2 achievement 1 https://en.wikipedia.org/wiki/Social_proof
11 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_cognition
12 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_affiliation
13 2 achievement 1 https://en.wikipedia.org/wiki/Bystander_effect
14 2 achievement 1 https://en.wikipedia.org/wiki/Authority_bias


## 5. Analyzing and cleaning a specific corpus based on a theme

In [13]:
def get_theme(Corpus, theme_id):
    title = ''
    for article in Corpus:
        if article['theme.id'] == theme_id:
            title = article['theme']
            break
    return title

### 5.0 Selecting the specific theme (a sub-corpus).

In [14]:
## For a different sub-corpus use a corresponding theme ID.
current_theme_id = 2

In [15]:
current_theme = get_theme(OM_Corpus, current_theme_id)

In [16]:
output_fname = "_".join([word.capitalize() for word in current_theme.split(" ")])
print(current_theme, "::", output_fname)

achievement :: Achievement


In [17]:
display_articles(OM_Corpus, current_theme_id)

1 2 achievement 0 https://en.wikipedia.org/wiki/Need_for_achievement
2 2 achievement 0 https://en.wikipedia.org/wiki/Social_influence
3 2 achievement 1 https://en.wikipedia.org/wiki/Goal_orientation
4 2 achievement 1 https://en.wikipedia.org/wiki/Need_theory
5 2 achievement 1 https://en.wikipedia.org/wiki/Propaganda
6 2 achievement 1 https://en.wikipedia.org/wiki/Mind_shaping
7 2 achievement 1 https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system
8 2 achievement 1 https://en.wikipedia.org/wiki/Impression_management
9 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_power
10 2 achievement 1 https://en.wikipedia.org/wiki/Social_proof
11 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_cognition
12 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_affiliation
13 2 achievement 1 https://en.wikipedia.org/wiki/Bystander_effect
14 2 achievement 1 https://en.wikipedia.org/wiki/Authority_bias


In [18]:
def filter_corpus(corpus, remove_list= []):
    return([p for p in corpus if p['document.id'] not in remove_list])

In [19]:
fcorpus = filter_corpus(OM_Corpus, remove_list = [3,4,5])
display_articles(fcorpus, current_theme_id)

1 2 achievement 0 https://en.wikipedia.org/wiki/Need_for_achievement
2 2 achievement 0 https://en.wikipedia.org/wiki/Social_influence
6 2 achievement 1 https://en.wikipedia.org/wiki/Mind_shaping
7 2 achievement 1 https://en.wikipedia.org/wiki/Judge%E2%80%93advisor_system
8 2 achievement 1 https://en.wikipedia.org/wiki/Impression_management
9 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_power
10 2 achievement 1 https://en.wikipedia.org/wiki/Social_proof
11 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_cognition
12 2 achievement 1 https://en.wikipedia.org/wiki/Need_for_affiliation
13 2 achievement 1 https://en.wikipedia.org/wiki/Bystander_effect
14 2 achievement 1 https://en.wikipedia.org/wiki/Authority_bias


In [20]:
def merge_documents(corpus, theme_id, filters = []):
    docs = [page for page in corpus if page['theme.id'] == theme_id]
    docs_filtered = [page['text'] for page in corpus if page['document.id'] not in filters]
    text = "\n".join(docs_filtered)
    return(text)

In [21]:
input_text = merge_documents(fcorpus, current_theme_id)

In [22]:
pp.pprint(input_text)

('Need for achievement \n'
 ' Need for achievement \n'
 ' N-Ach \n'
 " refers to an individual's desire for significant accomplishment mastering "
 'of skills control or high standards The term was first used by \n'
 ' Henry Murray \n'
 ' and associated with a range of actions These include "intense prolonged and '
 'repeated efforts to accomplish something difficult To work with singleness '
 'of purpose towards a high and distant \n'
 ' goal \n'
 ' To have the determination to win" The concept of N-Ach was subsequently '
 'popularised by the psychologist \n'
 ' David McClelland \n'
 ' This personality trait is characterized by an enduring and consistent '
 'concern with setting and meeting high standards of achievement This need is '
 'influenced by internal drive for action intrinsic motivation and the '
 'pressure exerted by the expectations of others extrinsic motivation Measured '
 'with the \n'
 ' thematic apperception test \n'
 ' TAT need for achievement motivates an individual

 'by people or groups that are influential to the individual The individual '
 'accepts the influence because the content of the influence accepted is '
 "intrinsically rewarding It is congruent with the individual's value system "
 'and according to Kelman the "reward" of internalization is "the content of '
 'the new behavior" \n'
 ' Conformity \n'
 ' Conformity \n'
 ' Conformity is a type of social influence involving a change in behavior '
 'belief or thinking to align with those of others or with normative standards '
 'It is the most common and pervasive form of social influence \n'
 ' Social psychology \n'
 ' research in conformity tends to distinguish between two varieties \n'
 ' informational conformity \n'
 ' also called \n'
 ' social proof \n'
 ' or "internalization" in Kelman\'s terms and \n'
 ' normative conformity \n'
 ' "compliance" in Kelman\'s terms \n'
 ' In the case of \n'
 ' peer pressure \n'
 ' a person is convinced to do something that they might not want to do su

 'an icon or is most "popular" within a group This person has the most '
 "influence over others For example in a child's school life people who seem "
 'to control the perceptions of the students at school are most powerful in '
 'having a social influence over other children \n'
 ' Culture \n'
 ' Culture \n'
 ' appears to play a role in the willingness of an individual to conform to '
 'the standards of a group \n'
 ' Stanley Milgram \n'
 ' found that conformity was higher in \n'
 ' Norway \n'
 ' than in \n'
 ' France \n'
 " This has been attributed to Norway's longstanding tradition of social "
 "responsibility compared to France's cultural focus on individualism Japan "
 'likewise has a collectivist culture and thus a higher propensity to '
 'conformity However a \n'
 ' Asch-style study \n'
 ' found that when alienated Japanese students were more susceptible to \n'
 ' anticonformity \n'
 ' giving answers that were \n'
 ' incorrect \n'
 ' even when the group had collaborated on \n'


 'or cheatersfellow humans who undermine social life by deception theft or '
 'other non-cooperative behavior \n'
 ' There are many methods behind self-presentation including \n'
 ' self disclosure \n'
 ' identifying what makes you "you" to another person managing appearances '
 "trying to fit in ingratiation aligning actions making one's actions seem "
 'appealing or understandable and alter-casting imposing identities on other '
 'people These self-presentation methods can also be used on the corporate '
 'level as impression management \n'
 ' Self-presentation \n'
 ' Self-presentation is conveying information about oneself or an image of '
 'oneself to others There are two types and motivations of self-presentation \n'
 " presentation meant to match one's own self-image and \n"
 ' presentation meant to match audience expectations and preferences \n'
 ' Self-presentation is expressive Individuals construct an image of '
 'themselves to claim personal identity and present themselves i

 'have of themselves shape and are shaped by social interactions \n'
 ' Our self-concept develops from social experience early in life \n'
 ' Schlenker further suggests that children anticipate the effect that their '
 'behaviours will have on others and how others will evaluate them They '
 'control the impressions they might form on others and in doing so they '
 'control the outcomes they obtain from social interactions \n'
 ' Social identity \n'
 ' refers to how people are defined and regarded in social interactions \n'
 ' Individuals use impression management strategies to influence the social '
 'identity they project to others \n'
 ' The identity that people establish influences their behaviour in front of '
 "others others' treatment of them and the outcomes they receive Therefore in "
 'their attempts to influence the impressions others form of themselves a '
 'person plays an important role in affecting his social outcomes \n'
 ' Social interaction is the process by which we 

 "present In McClelland's research he found that people who had need for "
 'affiliation were often unpopular tried to avoid interpersonal conflicts '
 'because they have levels of anxiety about if others will accept them \n'
 ' Will to power\n'
 'Social proof \n'
 ' This article is about behavior imitation to reflect correct social behavior '
 'For imitation to gain approval see \n'
 ' normative social influence \n'
 ' Social proof \n'
 ' also known as \n'
 ' informational social influence \n'
 ' is a \n'
 ' psychological \n'
 ' and \n'
 ' social \n'
 ' phenomenon where people assume the actions of others in an attempt to '
 'reflect correct behavior in a given situation \n'
 ' Social proof is considered prominent in ambiguous social situations where '
 'people are unable to determine the appropriate \n'
 ' mode of behavior \n'
 ' and is driven by the assumption that the surrounding people possess more '
 'knowledge about the current situation \n'
 ' The effects of \n'
 ' social influ

 'Instagram and YouTube The number of followers fans views likes favorites and '
 'even comments that a user has made positively affects how other users '
 'perceive them A user on Twitter with a million followers is perceived as '
 'more trustworthy and reputable than a similar user with a thousand followers '
 'resulting in faster growth of followers and higher engagement and '
 'click-through-rates \n'
 ' Although these fake followers will never help meet business objectives or '
 'generate sales directly \n'
 ' An entire multimillion-dollar industry known as \n'
 ' ghost followers \n'
 ' exist for the sole purpose of increasing social proof on social media \n'
 ' The environment \n'
 ' Social norms are often not clearly articulated for sustainable or '
 'pro-environmental conduct \n'
 ' A good example is the consumption of bottled water Many people are '
 'generally unaware of the negative environmental consequences associated with '
 'the production and consumption of bottled wate

 ' David McClelland \n'
 " and describes a person's \n"
 ' need \n'
 ' to feel a sense of involvement and "belonging" within a \n'
 ' social group \n'
 " McClellend's thinking was strongly influenced by the pioneering work of \n"
 ' Henry Murray \n'
 ' who first identified underlying psychological human needs and \n'
 ' motivational \n'
 ' processes It was Murray who set out a taxonomy of needs including '
 'achievement \n'
 ' power \n'
 ' and affiliationand placed these in the context of an integrated '
 'motivational model People with a high need for affiliation require warm \n'
 ' interpersonal relationships \n'
 ' and approval from those with whom they have regular contact Having a strong '
 'bond with others make a person feel as if they are a part of something '
 'important that creates a powerful impact People who place high emphasis on '
 'affiliation tend to be supportive team members but may be less effective '
 'in \n'
 ' leadership positions \n'
 ' A person who takes part i

In [23]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words,current_theme)

27680 achievement


### 5.1 Computing frequency distributions of each token, i.e word, term, pancuation, etc.

In [24]:
input_freq_dist = FreqDist(tokenized)

In [25]:
input_freq_dist.most_common(10)

[('\n', 2005),
 ('the', 1253),
 ('of', 862),
 ('to', 851),
 ('and', 669),
 ('a', 591),
 ('in', 585),
 ('that', 349),
 ('is', 317),
 ('for', 312)]

### 5.2 Removing punctuation and stopwords from the input corpus

In [26]:
for stopword in STOP_WORDS:
    if stopword in input_freq_dist:
        del input_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in input_freq_dist:
        del input_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_input = sum(input_freq_dist.values())
input_freq_dist.most_common(10)

[('social', 219),
 ('s', 152),
 ('people', 148),
 ('need', 141),
 ('person', 93),
 ('others', 91),
 ('advice', 83),
 ('self', 82),
 ('decision', 80),
 ('group', 79)]

### 5.3 Removing all numbered words

This is an example case for post proceesing in terms of cleaning. The pre-processing, that is data cleaning/preperation during or right after harvesting should be further improved to avoid such processes at this stage.

In [27]:
pattern_letters = re.compile('[a-z]')
def has_letters(x):
    return(pattern_letters.search(x) is not None)

In [28]:
reduced = {k:v for k,v in input_freq_dist.items() if has_letters(k)}
print("Reduction due to all number matches: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to all number matches:  76


### 5.4 Removing single character words


In [29]:
reduced = {k:v for k,v in input_freq_dist.items() if len(k) > 1}
print("Reduction due to single characters: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to single characters:  17


### 5.5 Removing rare words from input distribution

In [30]:
reduced = {k:v for k,v in input_freq_dist.items() if v > 5}
print("Reduction due to rare occurances: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to rare occurances:  3517


## 6. Comparing input vs English corpus volumes

### 6.1 Total words (after cleaning the stopwords) 

In [31]:
print(n_input, n_english)

14880 544168


### 6.2 Number of unique words (after cleaning stopwords and rare words)

In [32]:
n_unique_word_input = len(input_freq_dist.items())
n_unique_word_brown = len(english_freq_dist.items())
n_unique_word_input, n_unique_word_brown

(561, 49598)

### 6.3 Cleaned set of input words/terms

List of words in the corpus in case, for a visual inspection. Such inspections will be used both to improve tokenization as well as filtering.

In [33]:
pp.pprint(sorted(input_freq_dist.items(), key=lambda x:x[1], reverse=True))

[('social', 219),
 ('people', 148),
 ('need', 141),
 ('person', 93),
 ('others', 91),
 ('advice', 83),
 ('self', 82),
 ('decision', 80),
 ('group', 79),
 ('influence', 77),
 ('high', 75),
 ('management', 75),
 ('individual', 71),
 ('impression', 71),
 ('research', 62),
 ('bystander', 60),
 ('situations', 55),
 ('behavior', 53),
 ('likely', 53),
 ('effect', 51),
 ('will', 50),
 ('help', 50),
 ('achievement', 49),
 ('theory', 49),
 ('individuals', 49),
 ('information', 49),
 ('bystanders', 49),
 ('situation', 46),
 ('cognition', 45),
 ('personality', 43),
 ('study', 42),
 ('power', 39),
 ('example', 39),
 ('making', 38),
 ('mcclelland', 37),
 ('psychology', 35),
 ('proof', 34),
 ('used', 32),
 ('low', 32),
 ('found', 32),
 ('judge', 32),
 ('advisor', 32),
 ('communication', 32),
 ('work', 31),
 ('corporate', 31),
 ('jas', 30),
 ('affiliation', 29),
 ('use', 29),
 ('judges', 29),
 ('nfc', 29),
 ('authority', 28),
 ('control', 27),
 ('groups', 27),
 ('responsibility', 27),
 ('conformity', 

### 6.4 Set of terms/words that occure in both corpus.

In [34]:
len(input_freq_dist.keys())

561

In [35]:
common_words = [w for w in set(input_freq_dist.keys()) & set(english_freq_dist.keys())]
print(len(common_words))

526


In [36]:
pp.pprint(sorted(common_words))

['ability',
 'accept',
 'acceptance',
 'accepted',
 'according',
 'account',
 'accurate',
 'achievement',
 'act',
 'action',
 'actions',
 'activities',
 'actually',
 'addition',
 'advice',
 'advisor',
 'advisors',
 'affect',
 'affiliation',
 'african',
 'against',
 'al',
 'alone',
 'although',
 'ambiguity',
 'ambiguous',
 'among',
 'amount',
 'analysis',
 'another',
 'applied',
 'appropriate',
 'argued',
 'around',
 'article',
 'asked',
 'aspects',
 'assistance',
 'associated',
 'association',
 'attempt',
 'attention',
 'attitude',
 'attitudes',
 'attractive',
 'audience',
 'authority',
 'awareness',
 'based',
 'basic',
 'become',
 'behavior',
 'behaviors',
 'behaviour',
 'belief',
 'beliefs',
 'believe',
 'believed',
 'best',
 'better',
 'beyond',
 'bias',
 'book',
 'brand',
 'bystander',
 'called',
 'case',
 'cases',
 'cause',
 'certain',
 'change',
 'channels',
 'characteristics',
 'chat',
 'children',
 'clearly',
 'cognitive',
 'cohen',
 'cohesive',
 'cohesiveness',
 'college',
 'c

### 6.5 Set of terms/words that occure in the sample but not in the reference corpus.

This specific set will be incorporated later below. 

In [37]:
input_specifics = dict()
for w in set(input_freq_dist.keys()) - set(english_freq_dist.keys()):
    input_specifics[w] = input_freq_dist[w]

In [38]:
print(len(input_specifics))

35


In [39]:
pp.pprint(sorted(input_specifics))

['"the',
 'ach',
 'apperception',
 'bystanders',
 'cacioppo',
 'cialdini',
 'cmc',
 'cognition',
 'cognition"',
 'constructs',
 'copycat',
 'darley',
 'genovese',
 'goffman',
 'im',
 'inputs',
 'internalization',
 'interprofessional',
 'jas',
 'judgeadvisor',
 'kelman',
 'latan',
 'mcclelland',
 'mediated',
 'milgram',
 'motivational',
 'networking',
 'nfc',
 'npow',
 'online',
 'openness',
 'reactance',
 'schlenker',
 'strategies',
 'zack']


## 7. Stemming

In [40]:
stemmer = PorterStemmer()
input_wset_stems = {k: stemmer.stem(k) for k in input_freq_dist.keys()}
common_wset_stems = {k: stemmer.stem(k) for k in common_words}
pp.pprint(common_wset_stems)

{'ability': 'abil',
 'accept': 'accept',
 'acceptance': 'accept',
 'accepted': 'accept',
 'according': 'accord',
 'account': 'account',
 'accurate': 'accur',
 'achievement': 'achiev',
 'act': 'act',
 'action': 'action',
 'actions': 'action',
 'activities': 'activ',
 'actually': 'actual',
 'addition': 'addit',
 'advice': 'advic',
 'advisor': 'advisor',
 'advisors': 'advisor',
 'affect': 'affect',
 'affiliation': 'affili',
 'african': 'african',
 'against': 'against',
 'al': 'al',
 'alone': 'alon',
 'although': 'although',
 'ambiguity': 'ambigu',
 'ambiguous': 'ambigu',
 'among': 'among',
 'amount': 'amount',
 'analysis': 'analysi',
 'another': 'anoth',
 'applied': 'appli',
 'appropriate': 'appropri',
 'argued': 'argu',
 'around': 'around',
 'article': 'articl',
 'asked': 'ask',
 'aspects': 'aspect',
 'assistance': 'assist',
 'associated': 'associ',
 'association': 'associ',
 'attempt': 'attempt',
 'attention': 'attent',
 'attitude': 'attitud',
 'attitudes': 'attitud',
 'attractive': 'at

 'showed': 'show',
 'showing': 'show',
 'shown': 'shown',
 'shows': 'show',
 'significant': 'signific',
 'similar': 'similar',
 'similarity': 'similar',
 'sites': 'site',
 'situation': 'situat',
 'situations': 'situat',
 'size': 'size',
 'smoke': 'smoke',
 'social': 'social',
 'society': 'societi',
 'someone': 'someon',
 'sometimes': 'sometim',
 'sources': 'sourc',
 'specific': 'specif',
 'specifically': 'specif',
 'standards': 'standard',
 'states': 'state',
 'status': 'statu',
 'still': 'still',
 'stories': 'stori',
 'story': 'stori',
 'strangers': 'stranger',
 'strength': 'strength',
 'strong': 'strong',
 'strongly': 'strongli',
 'structure': 'structur',
 'students': 'student',
 'studied': 'studi',
 'studies': 'studi',
 'study': 'studi',
 'style': 'style',
 'subject': 'subject',
 'subjects': 'subject',
 'success': 'success',
 'suggested': 'suggest',
 'suggests': 'suggest',
 'support': 'support',
 'surrounding': 'surround',
 'system': 'system',
 'systems': 'system',
 'take': 'take',


## 8. Handling input specific term set

### 8.1 Identfying matching stems with common words. 

Note that the frequency counts are transferred accordingly.

In [41]:
specifics = {}
for k,v in input_specifics.items():
    stem = input_wset_stems[k]
    words = [w for w,s in common_wset_stems.items() if s == stem]
    if words:
        w = words[0]
        input_freq_dist[w] += input_specifics[k]
        continue
    specifics[k] = v
# Removing the words with matching stems from the specific set.
print("Reduction due to stemm matches: ", len(input_specifics) - len(specifics))
input_specifics = specifics

Reduction due to stemm matches:  5


### 8.2 Removing open-maker specific terms.

In [42]:
with open("data/specifics_openmaker.txt", "r") as f:
    SPECIFICS_OPENMAKER = set(f.read().strip().split("\n"))
om_specific_stems = {stemmer.stem(k):k for k in SPECIFICS_OPENMAKER}
pp.pprint(om_specific_stems)

{'3-d': '3-d',
 '3d': '3d',
 'abat': 'abatement',
 'afford': 'affordable',
 'agenda21': 'agenda21',
 'anarch': 'anarchism',
 'autonom': 'autonomous',
 'biodiesel': 'biodiesel',
 'biodivers': 'biodiversity',
 'biofuel': 'biofuel',
 'bioga': 'biogas',
 'biomass': 'biomass',
 'biospher': 'biosphere',
 'bricolag': 'bricolage',
 'brundtland': 'brundtland',
 'c2c': 'c2c',
 'cad': 'cad',
 'cap-and-trad': 'cap-and-trade',
 'carfre': 'carfree',
 'cdm': 'cdm',
 'christensen': 'christensen',
 'co-creat': 'co-creation',
 'co-develop': 'co-develop',
 'co-invent': 'co-invention',
 'co-inventor': 'co-inventor',
 'coextinct': 'coextinction',
 'cognit': 'cognition',
 'commons-bas': 'commons-based',
 'computer-aid': 'computer-aided',
 'conferenc': 'conferencing',
 'consortium': 'consortium',
 'constraint': 'constraints',
 'construct': 'construct',
 'copyleft': 'copyleft',
 'copyright': 'copyright',
 'cradle-to-cradl': 'cradle-to-cradle',
 'crowdsourc': 'crowdsourcing',
 'crowdwork': 'crowdworker',
 'cuv

In [43]:
specific_wset_dirty = {}
specific_wset_stems_selected = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in om_specific_stems.keys():
        if k in specific_wset_stems_selected.keys():
            specific_wset_stems_selected[stem] += v
        else:
            specific_wset_stems_selected[stem] = v
        continue
    if v > 10:
        specific_wset_dirty[k] = v
        
input_specifics = specific_wset_dirty

In [44]:
# The set of stems to be added to the set with makerness counts.

pp.pprint(specific_wset_stems_selected)

{'construct': 7, 'onlin': 11}


### 8.3 Remaining frequent input specifics
The manual checking can help to determine what should go into "specifics_openmaker.txt"

In [45]:
print(len(input_specifics))
pp.pprint(sorted(input_specifics))

10
['"the',
 'cmc',
 'darley',
 'genovese',
 'goffman',
 'jas',
 'latan',
 'mcclelland',
 'nfc',
 'openness']


In [46]:
specific_wset_stems = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in specific_wset_stems.keys():
        specific_wset_stems[stem].append((k,v))
    else:
        specific_wset_stems[stem] = [(k,v)]
pp.pprint(specific_wset_stems)

{'"the': [('"the', 12)],
 'cmc': [('cmc', 12)],
 'darley': [('darley', 11)],
 'genoves': [('genovese', 11)],
 'goffman': [('goffman', 16)],
 'ja': [('jas', 30)],
 'latan': [('latan', 12)],
 'mcclelland': [('mcclelland', 37)],
 'nfc': [('nfc', 29)],
 'open': [('openness', 11)]}


## 9. Computing representation power of common words.

In [47]:
# combine
word = []
freq = []
score = []
# common_words = [w[0] for w in common_words]
nEng = 1.0 * n_english
nInp = 1.0 * n_input
for w in common_words:
    # Consider only words whose charcater length is larger than 1
    if len(w) > 1:
        # Log likelihood scores are computed:
        print(w, input_freq_dist[w], english_freq_dist[w])
        s = log((input_freq_dist[w] / nInp) / (english_freq_dist[w] / nEng))
        f = input_freq_dist[w]
        word.append(w)
        freq.append(f)
        score.append(s)

beliefs 8 23
presentation 26 33
satisfaction 8 28
important 19 369
performance 14 122
scale 10 60
chat 8 5
long 6 752
motivation 22 11
computer 11 13
engagement 8 22
organizations 7 61
asked 17 398
means 9 310
maintain 6 60
tat 9 1
showed 8 141
order 13 376
action 11 291
confidence 10 56
fear 7 127
conducted 7 55
norm 10 10
level 12 213
four 17 360
tests 6 61
experiments 11 66
input 17 20
observed 6 74
regarding 6 40
believe 8 200
murray 20 8
children 15 355
used 32 611
reactions 9 42
results 16 149
relationship 19 88
cohen 7 3
result 11 244
motivated 8 9
go 6 626
type 19 200
interactions 12 3
trust 11 52
phenomenon 10 35
amount 12 172
times 7 300
relevant 10 23
later 7 397
tend 22 43
score 6 66
conform 11 10
make 19 794
human 17 299
strength 9 137
tasks 13 29
levels 8 69
leadership 7 92
rather 16 373
seen 10 279
need 141 360
motives 8 20
henry 6 83
opinions 18 44
corporate 31 19
management 75 91
subjects 15 81
time 22 1598
cause 6 130
understanding 7 121
members 14 325
trait 7 3
exper

In [48]:
m = pd.Series(score)
f = pd.Series(freq)
k = pd.Series(word)
stem = k.apply(stemmer.stem)
scoring = len(k) * ['common']
df_common = pd.DataFrame({'Word':k, 'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_common.sort_values(by='Score', ascending=False, inplace = True)
df_common.reset_index(inplace=True, drop=True)
df_common.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,8.290588,bystand,109,common,bystander
1,7.064976,advisor,32,common,advisor
2,7.000437,cognit,60,common,cognitive
3,5.901825,discount,10,common,discounting
4,5.796465,tat,9,common,tat
5,5.796465,norm,9,common,normative
6,5.580241,affili,29,common,affiliation
7,5.54515,themat,7,common,thematic
8,5.54515,empathi,7,common,empathy
9,5.54515,egocentr,7,common,egocentric


### 9.1 Computing makerness of specific terms

In [49]:
max_score = df_common.iloc[0,0]
threshold_score = 1.0
mean_w = df_common.Score[df_common.Score > threshold_score].mean()
mean_f = df_common.Tf[df_common.Score > threshold_score].mean()
print(max_score, threshold_score, mean_w, mean_f)

8.29058787553 1.0 2.5279661650034853 17.294460641399418


In [50]:
def compute_speficif_score(f, maxw=10, minw=1, mean_w=3.0, mean_f=25.0):
    return(max(min((f / mean_f) * mean_w, maxw), minw))
compute_speficif_score(mean_w, max_score, threshold_score, mean_w, mean_f)

1.0

In [51]:
stem = []
freq = []
score = []
for k, v in specific_wset_stems_selected.items():
    s = compute_speficif_score(v, max_score, threshold_score, mean_w, mean_f)
    stem.append(k)
    freq.append(v)
    score.append(s)

## 10. Tabulating the results and generating the output file

In [52]:
m = pd.Series(score)
f = pd.Series(freq)
stem = pd.Series(stem)
scoring = len(m) * ['specific']
df_exclusive= pd.DataFrame({'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_exclusive.sort_values(by='Score', ascending=False, inplace = True)
df_exclusive.reset_index(inplace=True, drop=True)
df_exclusive.head(20)

Unnamed: 0,Score,Stem,Tf,Type
0,1.607892,onlin,11,specific
1,1.023204,construct,7,specific


In [53]:
df_makerness = df_common.append(df_exclusive, ignore_index=True)
df_makerness.sort_values(by='Score', ascending=False, inplace = True)
df_makerness.reset_index(inplace=True, drop=True)
df_makerness.head(10)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,8.290588,bystand,109,common,bystander
1,7.064976,advisor,32,common,advisor
2,7.000437,cognit,60,common,cognitive
3,5.901825,discount,10,common,discounting
4,5.796465,tat,9,common,tat
5,5.796465,norm,9,common,normative
6,5.580241,affili,29,common,affiliation
7,5.54515,themat,7,common,thematic
8,5.54515,empathi,7,common,empathy
9,5.54515,egocentr,7,common,egocentric


In [54]:
df_makerness.tail(10)

Unnamed: 0,Score,Stem,Tf,Type,Word
518,-0.840793,men,9,common,men
519,-0.895796,against,7,common,against
520,-0.977188,without,6,common,without
521,-1.048351,go,6,common,go
522,-1.05472,come,6,common,come
523,-1.092108,new,15,common,new
524,-1.116705,still,7,common,still
525,-1.124207,must,9,common,must
526,-1.231737,long,6,common,long
527,-1.480388,made,7,common,made


In [55]:
OUTPUT_FOLDER = "./output/"
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
with open(csvfile_name, 'w') as csvfile:
    #thewriter = csv.writer(csvfile, delimiter=',')
    #for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True): thewriter.writerow([k,v[0],v[1]])
    df_makerness.to_csv(csvfile_name)

### 10.1 Outputfile name for the theme

In [56]:
print(csvfile_name)

./output/makerness_Achievement.csv
