# Discovery and Representation of Open Making Related Terms

This notebook sketches the initial exercise on discovering the open making related keywords. The input text is harvested via a Web crawler that identifies and crawls semantically related wikipedia articles.   

In [1]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from numpy import log, mean
import json, csv, re
import pprint as pp

import pandas as pd

## 1. Loading a reference English language corpus

In [2]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [3]:
with open("data/stopwords_standard.txt", "r") as f:
    STOP_WORDS_STANDARD = set(f.read().strip().split("\n"))
print(STOP_WORDS_STANDARD)

{"i'm", 'this', 'has', 'only', "hasn't", 'more', 'there', "aren't", 'other', 'the', 'does', 'do', "don't", "here's", 'of', 'before', 'com', 'could', 'been', "wouldn't", 'on', "that's", 'nor', 'r', "she's", "she'd", 'him', 'i', 'is', "who's", 'no', 'be', 'under', 'a', 'cannot', "can't", "weren't", 'out', 'and', 'when', 'we', 'herself', 'all', 'at', 'by', 'our', 'in', 'here', 'you', 'himself', "i've", 'themselves', 'where', "why's", 'into', "it's", 'until', 'those', 'yours', "how's", "i'd", 'just', 'it', 'http', 'own', 'then', 'an', 'ourselves', 'some', "doesn't", 'did', "haven't", 'but', 'while', 'over', 'about', 'below', "there's", 'what', "mustn't", 'www', 'have', 'off', "she'll", 'so', 'too', 'who', 'very', 'such', "i'll", 'once', 'than', 'were', 'with', 'same', 'your', 'if', 'them', "you've", "let's", 'would', 'my', "shan't", 'how', "you'd", 'to', "you're", "couldn't", "wasn't", "didn't", 'whom', 'during', 'being', 'they', "he's", 'because', 'that', 'know', 'having', 'me', 'why', 'h

### 2.2 Open-making related stop words

In [4]:
with open("data/stopwords_openmaker.txt", "r") as f:
    STOP_WORDS_OPENMAKER = set(f.read().strip().split("\n"))
print(STOP_WORDS_OPENMAKER)

{'etc', 'britannica', 'almost', 'org', '10th', 'third', '8th', 'isbn', 'may', '4th', 'na', 'one', 'iv', 'doi', 'eg', 'also', 'wikipedia', 'von', '11th', '13th', 'second', '2nd', 'vol', 'first', 'ii', 'html', 'iii', 'often', '5th', '7th', '1st', '3rd', '6th', 'tt', 'randd', 'encyclopedia', 'ad', 'pp', 'pdf', '12th', 'many', 'wikipedias', 'bc', 'doc', 'well', '14th', '15th', '9th', 'txt', 'vi'}


## 3. Removing stop words from the reference English corpus

In [5]:
# merging the two list together
STOP_WORDS = STOP_WORDS_STANDARD.union(STOP_WORDS_OPENMAKER)
print(STOP_WORDS)

{"i'm", 'this', 'org', "hasn't", 'more', "aren't", '4th', 'one', 'do', 'of', 'eg', 'before', 'could', 'been', "she's", 'him', "who's", 'no', "weren't", 'out', 'and', 'pp', '14th', 'all', 'at', 'by', 'our', 'himself', 'where', 'themselves', "why's", "it's", '10th', '8th', 'may', 'those', 'iv', "how's", 'just', 'it', 'http', 'then', 'an', 'some', "doesn't", '2nd', 'vol', "haven't", 'while', 'ii', 'over', '1st', 'www', '3rd', 'have', 'off', "she'll", '6th', "i'll", 'once', 'wikipedias', 'than', 'were', 'well', 'with', 'same', 'your', 'if', 'them', 'britannica', 'would', 'my', 'to', "couldn't", "wasn't", 'whom', 'during', 'being', "he's", 'because', 'that', 'why', 'both', 'after', '5th', 'few', "what's", "you'll", 'any', 'ours ', "we'll", 'for', 'was', 'had', 'her', "he'll", 'are', 'theirs', 'yourself', "we've", 'etc', "they'll", 'hers', "they're", 'above', 'can', 'also', 'most', 'get', 'second', "where's", "they'd", 'myself', 'like', "won't", 'often', 'between', 'randd', 'ad', 'doc', 'am'

In [6]:
# load english words from the Brown corpus removing stop words.
english_freq_dist = FreqDist([w.lower() for w in nltk.corpus.brown.words()
                              if w not in STOP_WORDS])
for stopword in STOP_WORDS:
    if stopword in english_freq_dist:
        del english_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in english_freq_dist:
        del english_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_english = sum(english_freq_dist.values())
english_freq_dist.most_common(10)

[('``', 8837),
 ("''", 8789),
 ('--', 3432),
 ('will', 2245),
 ('said', 1961),
 ('new', 1635),
 ('time', 1598),
 ('two', 1412),
 ('now', 1314),
 ('man', 1207)]

## 4. Loading the input Open Maker corpus

In [7]:
# load the harvested text from wikipedia.
with open("data/wikipedia.json", "r") as f: OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [8]:
# The total number of wiki articles used:
print(len(OM_Corpus))

152


In [9]:
# Column names of the the corpus.
OM_Corpus[0].keys()

dict_keys(['theme.id', 'title', 'url', 'depth', 'text'])

In [10]:
def display_articles(tid):
    articles = [article for article in OM_Corpus if article['theme.id'] == tid]
    for article in articles:
        print(article['depth'],article['title'], article['url'])

In [11]:
display_articles(0)

0 Do it yourself https://en.wikipedia.org/wiki/Do_it_yourself
1 Edupunk https://en.wikipedia.org/wiki/Edupunk
1 Prosumer https://en.wikipedia.org/wiki/Prosumer
1 How-to https://en.wikipedia.org/wiki/How-to
1 Kludge https://en.wikipedia.org/wiki/Kludge
1 Bricolage https://en.wikipedia.org/wiki/Bricolage
1 Junk box https://en.wikipedia.org/wiki/Junk_box
1 Number 8 wire https://en.wikipedia.org/wiki/Number_8_wire
1 Ready-to-assemble furniture https://en.wikipedia.org/wiki/Ready-to-assemble_furniture
1 Open design https://en.wikipedia.org/wiki/Open_Design
1 Hackerspace https://en.wikipedia.org/wiki/Hackerspace
1 Instructables https://en.wikipedia.org/wiki/Instructables
1 Handyman https://en.wikipedia.org/wiki/Handyman
1 Circuit bending https://en.wikipedia.org/wiki/Circuit_bending
1 Project GreenWorld International https://en.wikipedia.org/wiki/Project_GreenOman
1 3D printing https://en.wikipedia.org/wiki/3D_printing


In [12]:
display_articles(1)

0 Open design https://en.wikipedia.org/wiki/Open_design
1 Knowledge commons https://en.wikipedia.org/wiki/Knowledge_commons
1 Open Source Ecology https://en.wikipedia.org/wiki/Open_Source_Ecology
1 Computer-aided design https://en.wikipedia.org/wiki/Computer-aided_design
1 Open Source Initiative https://en.wikipedia.org/wiki/Open_Source_Initiative
1 Open Architecture Network https://en.wikipedia.org/wiki/Open_Architecture_Network
1 Open-source architecture https://en.wikipedia.org/wiki/Open-source_architecture
1 Commons-based peer production https://en.wikipedia.org/wiki/Commons-based_peer_production
1 Open standard https://en.wikipedia.org/wiki/Open_standard
1 OpenCores https://en.wikipedia.org/wiki/OpenCores
1 Co-creation https://en.wikipedia.org/wiki/Co-creation
1 OpenBTS https://en.wikipedia.org/wiki/OpenBTS
1 Open manufacturing https://en.wikipedia.org/wiki/Open_manufacturing
1 Open-source hardware https://en.wikipedia.org/wiki/Open-source_hardware
1 Open source appropriate techno

In [13]:
display_articles(2)

0 Sustainability https://en.wikipedia.org/wiki/Sustainability
1 Sustainability standards and certification https://en.wikipedia.org/wiki/Sustainability_standards_and_certification
1 Appropriate technology https://en.wikipedia.org/wiki/Appropriate_technology
1 Sustainable development https://en.wikipedia.org/wiki/Sustainable_development
1 Environmental issue https://en.wikipedia.org/wiki/Environmental_issue
1 World Cities Summit https://en.wikipedia.org/wiki/World_Cities_Summit
1 Ecopsychology https://en.wikipedia.org/wiki/Ecopsychology
1 Book:Sustainability https://en.wikipedia.org/wiki/Book:Sustainability
1 Sustainable design https://en.wikipedia.org/wiki/Sustainable_design
1 Circles of Sustainability https://en.wikipedia.org/wiki/Circles_of_Sustainability
1 Sustainability science https://en.wikipedia.org/wiki/Sustainability_science
1 Sustainable living https://en.wikipedia.org/wiki/Sustainable_living
1 Index of sustainability articles https://en.wikipedia.org/wiki/List_of_sustainabil

In [14]:
display_articles(3)

0 Maker culture https://en.wikipedia.org/wiki/Maker_culture
1 Modular design https://en.wikipedia.org/wiki/Modular_design
1 Open-source car https://en.wikipedia.org/wiki/Open-source_car
1 Electric vehicle conversion https://en.wikipedia.org/wiki/Electric_vehicle_conversion
1 Thingiverse https://en.wikipedia.org/wiki/Thingiverse
1 Fab lab https://en.wikipedia.org/wiki/Fab_Lab_(fabrication_laboratory)
1 SparkFun Electronics https://en.wikipedia.org/wiki/SparkFun
1 RepRap project https://en.wikipedia.org/wiki/RepRap
1 Distributed manufacturing https://en.wikipedia.org/wiki/Distributed_manufacturing
1 Craft production https://en.wikipedia.org/wiki/Craft_production
1 Autonomous building https://en.wikipedia.org/wiki/Autonomous_building
1 Open-source hardware https://en.wikipedia.org/wiki/Open_source_hardware
1 Kit car https://en.wikipedia.org/wiki/Kit_car


In [15]:
display_articles(4)

0 Innovation https://en.wikipedia.org/wiki/Innovation
1 Competitive intelligence https://en.wikipedia.org/wiki/Creative_competitive_intelligence
1 Multiple discovery https://en.wikipedia.org/wiki/Multiple_discovery
1 UNDP Innovation Facility https://en.wikipedia.org/wiki/UNDP_Innovation_Facility
1 Open Innovations (event) https://en.wikipedia.org/wiki/Open_Innovations_(Forum_and_Technology_Show)
1 Trans-cultural diffusion https://en.wikipedia.org/wiki/Diffusion_(anthropology)
1 Individual capital https://en.wikipedia.org/wiki/Individual_capital
1 Innovation system https://en.wikipedia.org/wiki/Innovation_system
1 Public domain https://en.wikipedia.org/wiki/Public_domain
1 Ingenuity https://en.wikipedia.org/wiki/Ingenuity
1 Sustainable Development Goals https://en.wikipedia.org/wiki/Sustainable_Development_Goals
1 Participatory design https://en.wikipedia.org/wiki/Participatory_design
1 Innovation management https://en.wikipedia.org/wiki/Innovation_management
1 Information revolution ht

In [16]:
display_articles(5)

0 Collaboration https://en.wikipedia.org/wiki/Collaboration
1 Wikinomics https://en.wikipedia.org/wiki/Wikinomics
1 Collaborative editing https://en.wikipedia.org/wiki/Collaborative_editing
1 Telepresence https://en.wikipedia.org/wiki/Telepresence
1 Knowledge management https://en.wikipedia.org/wiki/Knowledge_management
1 The Culture of Collaboration https://en.wikipedia.org/wiki/The_Culture_of_Collaboration
1 Collaborative governance https://en.wikipedia.org/wiki/Collaborative_governance
1 Community film https://en.wikipedia.org/wiki/Community_film
1 Collaborative innovation network https://en.wikipedia.org/wiki/Collaborative_innovation_network
1 Design thinking https://en.wikipedia.org/wiki/Design_thinking
1 Role-based collaboration https://en.wikipedia.org/wiki/Role-based_collaboration
1 Intranet portal https://en.wikipedia.org/wiki/Intranet_portal
1 Critical thinking https://en.wikipedia.org/wiki/Critical_thinking
1 Facilitation (business) https://en.wikipedia.org/wiki/Facilitation

## 5. Analyzing a specific corpus based on a theme

In [17]:
def get_title(Corpus, theme_id):
    title = ''
    for article in Corpus:
        if article['theme.id'] == theme_id:
            title = article['title']
            break
    return title

## 6.0 Selecting the specific theme (a sub-corpus).

In [18]:
## For a different sub-corpus use a corresponding theme ID.
current_theme_id = 3

In [19]:
current_title = get_title(OM_Corpus, current_theme_id)

In [20]:
output_fname = "_".join([word.capitalize() for word in current_title.split(" ")])
print(current_title, "::", output_fname)

Maker culture :: Maker_Culture


In [21]:
input_text = " ".join([page['text'] for page in OM_Corpus if page['theme.id'] == current_theme_id])

In [22]:
pp.pprint(input_text)

('Maker culture \n'
 ' Silicon Valley \n'
 ' billboard \n'
 ' The \n'
 ' maker culture \n'
 ' is a contemporary \n'
 ' culture \n'
 ' or \n'
 ' subculture \n'
 ' representing a technology-based extension of \n'
 ' DIY culture \n'
 ' citation needed \n'
 ' that intersects with \n'
 ' hacker culture \n'
 ' which is less concerned with physical objects as it focuses on software and '
 'revels in the creation of new devices as well as \n'
 ' tinkering \n'
 ' with existing ones The maker culture in general supports \n'
 ' open-source hardware \n'
 ' Typical interests enjoyed by the maker culture include engineering-oriented '
 'pursuits such as \n'
 ' electronics \n'
 ' robotics \n'
 ' 3-D printing \n'
 ' and the use of \n'
 ' Computer Numeric Control \n'
 ' tools as well as more traditional activities such as \n'
 ' metalworking \n'
 ' woodworking \n'
 ' and mainly its predecessor the traditional \n'
 ' arts and crafts \n'
 ' The subculture stresses a cut-and-paste approach to standardized

In [23]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words,current_title)

16950 Maker culture


### 6.1 Computing frequency distributions of each token, i.e word, term, pancuation, etc.

In [24]:
input_freq_dist = FreqDist(tokenized)

In [25]:
input_freq_dist.most_common(10)

[('\n', 2081),
 ('the', 785),
 ('and', 458),
 ('of', 436),
 ('to', 364),
 ('a', 325),
 ('in', 304),
 ('as', 163),
 ('is', 161),
 ('"', 161)]

### 6.2 Removing punctuation and stopwords from the input corpus

In [26]:
for stopword in STOP_WORDS:
    if stopword in input_freq_dist:
        del input_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in input_freq_dist:
        del input_freq_dist[punctuation]

# Re-control most common words after cleaning:
n_input = sum(input_freq_dist.values())
input_freq_dist.most_common(10)

[('hardware', 134),
 ('open', 111),
 ('vehicle', 86),
 ('open-source', 76),
 ('reprap', 69),
 ('design', 69),
 ('maker', 64),
 ('source', 64),
 ('electric', 59),
 ('manufacturing', 45)]

### 6.3 Removing all numbered words

This is an example case for post proceesing in terms of cleaning. The pre-processing, that is data cleaning/preperation during or right after harvesting should be further improved to avoid such processes at this stage.

In [27]:
pattern_letters = re.compile('[a-z]')
def has_letters(x):
    return(pattern_letters.search(x) is not None)

In [28]:
reduced = {k:v for k,v in input_freq_dist.items() if has_letters(k)}
print("Reduction due to all number matches: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to all number matches:  24


### 6.4 Removing single character words


In [29]:
reduced = {k:v for k,v in input_freq_dist.items() if len(k) > 1}
print("Reduction due to single characters: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to single characters:  18


### 6.5 Removing rare words from input distribution

In [30]:
reduced = {k:v for k,v in input_freq_dist.items() if v > 5}
print("Reduction due to rare occurances: ", len(input_freq_dist) - len(reduced))
input_freq_dist = reduced

Reduction due to rare occurances:  3052


## 7. Comparing input vs English corpus volumes

### 7.1 Total words (after cleaning the stopwords) 

In [31]:
print(n_input, n_english)

9166 544563


### 7.2 Number of unique words (after cleaning stopwords and rare words)

In [32]:
n_unique_word_input = len(input_freq_dist.items())
n_unique_word_brown = len(english_freq_dist.items())
n_unique_word_input, n_unique_word_brown

(287, 49600)

### 7.3 Cleaned set of input words/terms

List of words in the corpus in case, for a visual inspection. Such inspections will be used both to improve tokenization as well as filtering.

In [33]:
pp.pprint(sorted(input_freq_dist.items(), key=lambda x:x[1], reverse=True))

[('hardware', 134),
 ('open', 111),
 ('vehicle', 86),
 ('open-source', 76),
 ('reprap', 69),
 ('design', 69),
 ('maker', 64),
 ('source', 64),
 ('electric', 59),
 ('manufacturing', 45),
 ('production', 44),
 ('car', 44),
 ('modular', 43),
 ('conversion', 42),
 ('license', 39),
 ('3d', 38),
 ('printing', 34),
 ('vehicles', 34),
 ('free', 33),
 ('project', 32),
 ('software', 31),
 ('batteries', 31),
 ('used', 30),
 ('craft', 29),
 ('culture', 28),
 ('using', 28),
 ('distributed', 28),
 ('make', 27),
 ('lab', 27),
 ('electronics', 26),
 ('use', 26),
 ('include', 25),
 ('designs', 24),
 ('fab', 24),
 ('sparkfun', 24),
 ('new', 23),
 ('3-d', 23),
 ('will', 23),
 ('system', 23),
 ('community', 22),
 ('products', 22),
 ('components', 22),
 ('converted', 22),
 ('development', 21),
 ('process', 21),
 ('parts', 21),
 ('battery', 21),
 ('power', 20),
 ('labs', 20),
 ('tools', 19),
 ('making', 19),
 ('technology', 19),
 ('licenses', 19),
 ('equipment', 18),
 ('pearce', 18),
 ('projects', 17),
 ('c

### 7.4 Set of terms/words that occure in both corpus.

In [34]:
common_words = [w for w in input_freq_dist.keys() & english_freq_dist.keys()]
print(len(common_words))

252


In [35]:
pp.pprint(common_words)

['personal',
 'addition',
 'january',
 'printed',
 'around',
 'evolution',
 'manufacturing',
 'several',
 'solar',
 'less',
 'people',
 'project',
 'support',
 'world',
 'distributed',
 'milling',
 'another',
 'term',
 'type',
 'objects',
 'applications',
 'organization',
 'mechanical',
 'buildings',
 'machine',
 'made',
 'build',
 'states',
 'electric',
 'making',
 'design',
 'area',
 'cloud',
 'models',
 'point',
 'system',
 'social',
 'platform',
 'office',
 'used',
 'small',
 'research',
 'vehicle',
 'model',
 'law',
 'example',
 'room',
 'machines',
 'parts',
 'use',
 'costs',
 'computer',
 'road',
 'rise',
 'program',
 'business',
 'new',
 'need',
 'various',
 'designed',
 'approach',
 'plastic',
 'must',
 'within',
 'mass',
 'freedom',
 'trucks',
 'trademark',
 'market',
 'economy',
 'hybrid',
 'development',
 'existing',
 'concept',
 'place',
 'typically',
 'materials',
 'possible',
 'shared',
 'number',
 'now',
 'computers',
 'allow',
 'public',
 'site',
 'original',
 'califor

### 7.5 Set of terms/words that occure in the sample but not in the reference corpus.

This specific set will be incorporated later below. 

In [36]:
input_specifics = dict()
for w in set(input_freq_dist.keys() - english_freq_dist.keys()):
    input_specifics[w] = input_freq_dist[w]

In [37]:
print(len(input_specifics))

35


## 8. Stemming

In [38]:
stemmer = PorterStemmer()
input_wset_stems = {k: stemmer.stem(k) for k in input_freq_dist.keys()}
common_wset_stems = {k: stemmer.stem(k) for k in common_words}
pp.pprint(common_wset_stems)

{'ability': 'abil',
 'access': 'access',
 'added': 'ad',
 'adding': 'ad',
 'addition': 'addit',
 'adrian': 'adrian',
 'allow': 'allow',
 'another': 'anoth',
 'applications': 'applic',
 'approach': 'approach',
 'area': 'area',
 'areas': 'area',
 'around': 'around',
 'available': 'avail',
 'batteries': 'batteri',
 'battery': 'batteri',
 'better': 'better',
 'bicycle': 'bicycl',
 'boards': 'board',
 'build': 'build',
 'building': 'build',
 'buildings': 'build',
 'built': 'built',
 'business': 'busi',
 'california': 'california',
 'called': 'call',
 'car': 'car',
 'cars': 'car',
 'center': 'center',
 'certain': 'certain',
 'circuit': 'circuit',
 'closed': 'close',
 'cloud': 'cloud',
 'code': 'code',
 'commercial': 'commerci',
 'common': 'common',
 'communities': 'commun',
 'community': 'commun',
 'companies': 'compani',
 'company': 'compani',
 'component': 'compon',
 'components': 'compon',
 'computer': 'comput',
 'computers': 'comput',
 'concept': 'concept',
 'construction': 'construct',


## 9. Handling input specific term set

### 9.1 Identfying matching stems with common words. 

Note that the frequency counts are transferred accordingly.

In [39]:
specifics = {}
for k,v in input_specifics.items():
    stem = input_wset_stems[k]
    words = [w for w,s in common_wset_stems.items() if s == stem]
    if words:
        w = words[0]
        input_freq_dist[w] += input_specifics[k]
        continue
    specifics[k] = v
# Removing the words with matching stems from the specific set.
print("Reduction due to stemm matches: ", len(input_specifics) - len(specifics))
input_specifics = specifics

Reduction due to stemm matches:  4


### 9.2 Removing open-maker specific terms.

In [40]:
with open("data/specifics_openmaker.txt", "r") as f:
    SPECIFICS_OPENMAKER = set(f.read().strip().split("\n"))
om_specific_stems = {stemmer.stem(k):k for k in SPECIFICS_OPENMAKER}
pp.pprint(om_specific_stems)

{'3-d': '3-d',
 '3d': '3d',
 'abat': 'abatement',
 'afford': 'affordable',
 'agenda21': 'agenda21',
 'anarch': 'anarchism',
 'autonom': 'autonomous',
 'biodiesel': 'biodiesel',
 'biodivers': 'biodiversity',
 'biofuel': 'biofuel',
 'bioga': 'biogas',
 'biomass': 'biomass',
 'biospher': 'biosphere',
 'bricolag': 'bricolage',
 'brundtland': 'brundtland',
 'c2c': 'c2c',
 'cad': 'cad',
 'cap-and-trad': 'cap-and-trade',
 'carfre': 'carfree',
 'cdm': 'cdm',
 'christensen': 'christensen',
 'co-creat': 'co-creation',
 'co-develop': 'co-develop',
 'co-invent': 'co-invention',
 'co-inventor': 'co-inventor',
 'coextinct': 'coextinction',
 'cognit': 'cognition',
 'commons-bas': 'commons-based',
 'computer-aid': 'computer-aided',
 'conferenc': 'conferencing',
 'consortium': 'consortium',
 'constraint': 'constraints',
 'construct': 'construct',
 'copyleft': 'copyleft',
 'copyright': 'copyright',
 'cradle-to-cradl': 'cradle-to-cradle',
 'crowdsourc': 'crowdsourcing',
 'crowdwork': 'crowdworker',
 'cuv

In [41]:
specific_wset_dirty = {}
specific_wset_stems_selected = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in om_specific_stems.keys():
        if k in specific_wset_stems_selected.keys():
            specific_wset_stems_selected[stem] += v
        else:
            specific_wset_stems_selected[stem] = v
        continue
    if v > 10:
        specific_wset_dirty[k] = v
        
input_specifics = specific_wset_dirty

In [42]:
# The set of stems to be added to the set with makerness counts.

pp.pprint(specific_wset_stems_selected)

{'3-d': 23,
 '3d': 38,
 'diy': 9,
 'fab': 24,
 'graphic': 6,
 'hackerspac': 6,
 'mit': 7,
 'open-sourc': 76,
 'pearc': 18,
 'prototyp': 7,
 'reprap': 6,
 'softwar': 31,
 'sparkfun': 24,
 'sustain': 10,
 'thingivers': 16,
 'websit': 6}


### 9.3 Remaining frequent input specifics
The manual checking can help to determine what should go into "specifics_openmaker.txt"

In [43]:
print(len(input_specifics))
pp.pprint(input_specifics)

2
{'ev': 12, 'racer': 11}


In [44]:
pp.pprint(input_specifics.keys())

dict_keys(['ev', 'racer'])


In [45]:
specific_wset_stems = {}
for k,v in input_specifics.items():
    stem = stemmer.stem(k)
    if stem in specific_wset_stems.keys():
        specific_wset_stems[stem].append((k,v))
    else:
        specific_wset_stems[stem] = [(k,v)]
pp.pprint(specific_wset_stems)

{'ev': [('ev', 12)], 'racer': [('racer', 11)]}


## 10. Computing representation power of common words.

In [46]:
# combine
word = []
freq = []
score = []
# common_words = [w[0] for w in common_words]
for w in common_words:
    # Consider only words whose charcater length is larger than 1
    if len(w) > 1:
        # Log likelihood scores are computed:
        s = log((input_freq_dist[w] / n_input) / (english_freq_dist[w] / n_english))
        f = input_freq_dist[w]
        word.append(w)
        freq.append(f)
        score.append(s)

In [47]:
m = pd.Series(score)
f = pd.Series(freq)
k = pd.Series(word)
stem = k.apply(stemmer.stem)
scoring = len(k) * ['raw']
df_common = pd.DataFrame({'Word':k, 'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_common.sort_values(by='Score', ascending=False, inplace = True)
df_common.reset_index(inplace=True, drop=True)
df_common.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,6.836018,lab,47,raw,lab
1,6.610211,modular,50,raw,modular
2,6.584427,hardwar,134,raw,hardware
3,6.419858,batteri,31,raw,batteries
4,6.318075,printer,28,raw,printer
5,6.281707,modul,9,raw,modules
6,6.099386,kit,15,raw,kit
7,5.876242,hybrid,6,raw,hybrid
8,5.758459,maker,64,raw,maker
9,5.237162,licens,19,raw,licenses


## Computing makerness of specific terms

In [48]:
max_score = df_common.iloc[0,0]
threshold_score = 1.0
mean_w = df_common.Score[df_common.Score > threshold_score].mean()
mean_f = df_common.Tf[df_common.Score > threshold_score].mean()
print(max_score, threshold_score, mean_w, mean_f)

6.83601796511 1.0 2.7227748860888403 16.106145251396647


In [49]:
def compute_speficif_score(f, maxw=10, minw=1, mean_w=3.0, mean_f=25.0):
    return(max(min((f / mean_f) * mean_w, maxw), minw))
compute_speficif_score(mean_w, max_score, threshold_score, mean_w, mean_f)

1.0

In [50]:
stem = []
freq = []
score = []
for k, v in specific_wset_stems_selected.items():
    s = compute_speficif_score(v, max_score, threshold_score, mean_w, mean_f)
    stem.append(k)
    freq.append(v)
    score.append(s)

In [51]:
m = pd.Series(score)
f = pd.Series(freq)
stem = pd.Series(stem)
scoring = len(m) * ['stem']
df_exclusive= pd.DataFrame({'Stem':stem, 'Score':m,'Tf':f, 'Type':scoring})
df_exclusive.sort_values(by='Score', ascending=False, inplace = True)
df_exclusive.reset_index(inplace=True, drop=True)
df_exclusive.head(20)

Unnamed: 0,Score,Stem,Tf,Type
0,6.836018,open-sourc,76,stem
1,6.423973,3d,38,stem
2,5.24061,softwar,31,stem
3,4.057246,fab,24,stem
4,4.057246,sparkfun,24,stem
5,3.888194,3-d,23,stem
6,3.042935,pearc,18,stem
7,2.704831,thingivers,16,stem
8,1.690519,sustain,10,stem
9,1.521467,diy,9,stem


In [52]:
df_makerness = df_common.append(df_exclusive, ignore_index=True)
df_makerness.sort_values(by='Score', ascending=False, inplace = True)
df_makerness.reset_index(inplace=True, drop=True)
df_makerness.head(20)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,6.836018,lab,47,raw,lab
1,6.836018,open-sourc,76,stem,
2,6.610211,modular,50,raw,modular
3,6.584427,hardwar,134,raw,hardware
4,6.423973,3d,38,stem,
5,6.419858,batteri,31,raw,batteries
6,6.318075,printer,28,raw,printer
7,6.281707,modul,9,raw,modules
8,6.099386,kit,15,raw,kit
9,5.876242,hybrid,6,raw,hybrid


In [53]:
df_makerness.head(100)

Unnamed: 0,Score,Stem,Tf,Type,Word
0,6.836018,lab,47,raw,lab
1,6.836018,open-sourc,76,stem,
2,6.610211,modular,50,raw,modular
3,6.584427,hardwar,134,raw,hardware
4,6.423973,3d,38,stem,
5,6.419858,batteri,31,raw,batteries
6,6.318075,printer,28,raw,printer
7,6.281707,modul,9,raw,modules
8,6.099386,kit,15,raw,kit
9,5.876242,hybrid,6,raw,hybrid


In [54]:
OUTPUT_FOLDER = "./output/"
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
with open(csvfile_name, 'w') as csvfile:
    #thewriter = csv.writer(csvfile, delimiter=',')
    #for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True): thewriter.writerow([k,v[0],v[1]])
    df_makerness.to_csv(csvfile_name)

In [55]:
print(csvfile_name)

./output/makerness_Maker_Culture.csv
