# Discovery and Representation of Open Making Related Terms

This notebook sketches the initial exercise on discovering the open making related keywords. The input text is harvested via a Web crawler that identifies and crawls semantically related wikipedia articles.   

In [1]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from math import log
import json, csv

## 1. Loading a reference English language corpus

In [2]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [3]:
with open("data/stopwords_standard.txt", "r") as f:
    STOP_WORDS_STANDARD = set(f.read().strip().split("\n"))
print(STOP_WORDS_STANDARD)

{'i', "i'm", 'each', "he'd", 'while', 'know', 'him', 'been', 'do', 'during', 'yourselves', 'our', 'all', 'me', "she's", 'are', 'be', "they've", 'once', 'them', 'only', "wasn't", 'hers', 'can', 'did', 'cannot', 'other', "we're", 'her', "when's", "shouldn't", 'and', "they're", "you'll", 'itself', "shan't", 'we', "couldn't", 'herself', 'myself', "haven't", 'that', 'were', 'between', 'he', "you're", 'have', 'there', 'does', "that's", 'what', 'before', 'more', 'a', 'am', 'this', "here's", 'of', 'would', "we've", 'yourself', "hasn't", "isn't", 'nor', "she'd", "won't", 'most', 'had', "aren't", 'could', "wouldn't", "can't", "he's", 'its', 'above', 'on', "where's", "hadn't", 'himself', 'here', 'out', 'below', 'from', "i'd", "who's", "he'll", 'ourselves', 'down', 'their', 'with', 'through', 'same', 'like', 'it', 'few', 'http', 'having', "they'd", 'in', 'than', 'the', 'get', "you'd", 'who', 'off', 'or', 'these', 'further', "i'll", 'no', 'should', 'too', 'not', 'such', 'until', 'whom', 'doing', 'r

### 2.2 Open-making related stop words

In [4]:
with open("data/stopwords_openmaker.txt", "r") as f:
    STOP_WORDS_OPENMAKER = set(f.read().strip().split("\n"))
print(STOP_WORDS_OPENMAKER)

{'well', 'one', 'many', 'often', 'may', 'also', 'almost'}


## 3. Removing stop words from the reference English corpus

In [5]:
# merging the two list together
STOP_WORDS = STOP_WORDS_STANDARD.union(STOP_WORDS_OPENMAKER)
print(STOP_WORDS)

{"i'm", 'each', 'him', 'during', 'our', "she's", "they've", 'once', 'them', 'only', "wasn't", 'hers', 'did', 'other', "shouldn't", 'and', "they're", "you'll", "shan't", 'that', 'between', 'does', 'what', 'before', 'a', 'am', 'would', "we've", 'yourself', "hasn't", "she'd", 'most', "aren't", "he's", 'on', 'himself', 'here', 'out', 'below', "he'll", 'their', 'same', 'like', 'it', 'having', "they'd", 'than', 'one', 'the', 'who', 'or', 'these', "i'll", 'no', 'should', 'whom', 'doing', 'but', 'she', "mustn't", 'almost', 'you', 'his', 'about', 'often', "don't", "she'll", "you've", 'by', 'how', 'then', "what's", 'an', "how's", 'if', 'those', 'over', 'where', 'some', 'when', 'under', 'both', 'your', 'very', 'ought', "why's", 'because', 'own', 'so', 'why', 'into', 'com', "we'll", 'is', 'i', 'while', "he'd", 'know', 'been', 'do', 'yourselves', 'all', 'me', 'are', 'be', 'can', 'cannot', "we're", 'her', "when's", 'itself', 'we', "couldn't", 'herself', 'myself', "haven't", 'were', 'he', "you're", '

In [6]:
# load english words from the Brown corpus removing stop words.
english_freq_dist = FreqDist([w.lower() for w in nltk.corpus.brown.words()
                              if w not in STOP_WORDS])

## 4. Removing the rare words.

Below we remove rare words and get total count. The code below keeps all words with a occurance frequency above 2. 

In [7]:
english_freq_dist = {k:v for k,v in english_freq_dist.items() if v > 2}

## 5. Loading the input Open Maker corpus

In [8]:
# load the harvested text from wikipedia.
with open("data/wikipedia.json", "r") as f: OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [9]:
# The total number of wiki articles used:
print(len(OM_Corpus))

152


In [10]:
# Column names of the the corpus.
OM_Corpus[0].keys()

dict_keys(['theme.id', 'title', 'url', 'depth', 'text'])

In [11]:
def display_articles(tid):
    articles = [article for article in OM_Corpus if article['theme.id'] == tid]
    for article in articles:
        print(article['depth'],article['title'], article['url'])

In [12]:
display_articles(0)

0 Do it yourself https://en.wikipedia.org/wiki/Do_it_yourself
1 Edupunk https://en.wikipedia.org/wiki/Edupunk
1 Prosumer https://en.wikipedia.org/wiki/Prosumer
1 How-to https://en.wikipedia.org/wiki/How-to
1 Kludge https://en.wikipedia.org/wiki/Kludge
1 Bricolage https://en.wikipedia.org/wiki/Bricolage
1 Junk box https://en.wikipedia.org/wiki/Junk_box
1 Number 8 wire https://en.wikipedia.org/wiki/Number_8_wire
1 Ready-to-assemble furniture https://en.wikipedia.org/wiki/Ready-to-assemble_furniture
1 Open design https://en.wikipedia.org/wiki/Open_Design
1 Hackerspace https://en.wikipedia.org/wiki/Hackerspace
1 Instructables https://en.wikipedia.org/wiki/Instructables
1 Handyman https://en.wikipedia.org/wiki/Handyman
1 Circuit bending https://en.wikipedia.org/wiki/Circuit_bending
1 Project GreenWorld International https://en.wikipedia.org/wiki/Project_GreenOman
1 3D printing https://en.wikipedia.org/wiki/3D_printing


In [13]:
display_articles(1)

0 Open design https://en.wikipedia.org/wiki/Open_design
1 Knowledge commons https://en.wikipedia.org/wiki/Knowledge_commons
1 Open Source Ecology https://en.wikipedia.org/wiki/Open_Source_Ecology
1 Computer-aided design https://en.wikipedia.org/wiki/Computer-aided_design
1 Open Source Initiative https://en.wikipedia.org/wiki/Open_Source_Initiative
1 Open Architecture Network https://en.wikipedia.org/wiki/Open_Architecture_Network
1 Open-source architecture https://en.wikipedia.org/wiki/Open-source_architecture
1 Commons-based peer production https://en.wikipedia.org/wiki/Commons-based_peer_production
1 Open standard https://en.wikipedia.org/wiki/Open_standard
1 OpenCores https://en.wikipedia.org/wiki/OpenCores
1 Co-creation https://en.wikipedia.org/wiki/Co-creation
1 OpenBTS https://en.wikipedia.org/wiki/OpenBTS
1 Open manufacturing https://en.wikipedia.org/wiki/Open_manufacturing
1 Open-source hardware https://en.wikipedia.org/wiki/Open-source_hardware
1 Open source appropriate techno

In [14]:
display_articles(2)

0 Sustainability https://en.wikipedia.org/wiki/Sustainability
1 Sustainability standards and certification https://en.wikipedia.org/wiki/Sustainability_standards_and_certification
1 Appropriate technology https://en.wikipedia.org/wiki/Appropriate_technology
1 Sustainable development https://en.wikipedia.org/wiki/Sustainable_development
1 Environmental issue https://en.wikipedia.org/wiki/Environmental_issue
1 World Cities Summit https://en.wikipedia.org/wiki/World_Cities_Summit
1 Ecopsychology https://en.wikipedia.org/wiki/Ecopsychology
1 Book:Sustainability https://en.wikipedia.org/wiki/Book:Sustainability
1 Sustainable design https://en.wikipedia.org/wiki/Sustainable_design
1 Circles of Sustainability https://en.wikipedia.org/wiki/Circles_of_Sustainability
1 Sustainability science https://en.wikipedia.org/wiki/Sustainability_science
1 Sustainable living https://en.wikipedia.org/wiki/Sustainable_living
1 Index of sustainability articles https://en.wikipedia.org/wiki/List_of_sustainabil

In [15]:
display_articles(3)

0 Maker culture https://en.wikipedia.org/wiki/Maker_culture
1 Modular design https://en.wikipedia.org/wiki/Modular_design
1 Open-source car https://en.wikipedia.org/wiki/Open-source_car
1 Electric vehicle conversion https://en.wikipedia.org/wiki/Electric_vehicle_conversion
1 Thingiverse https://en.wikipedia.org/wiki/Thingiverse
1 Fab lab https://en.wikipedia.org/wiki/Fab_Lab_(fabrication_laboratory)
1 SparkFun Electronics https://en.wikipedia.org/wiki/SparkFun
1 RepRap project https://en.wikipedia.org/wiki/RepRap
1 Distributed manufacturing https://en.wikipedia.org/wiki/Distributed_manufacturing
1 Craft production https://en.wikipedia.org/wiki/Craft_production
1 Autonomous building https://en.wikipedia.org/wiki/Autonomous_building
1 Open-source hardware https://en.wikipedia.org/wiki/Open_source_hardware
1 Kit car https://en.wikipedia.org/wiki/Kit_car


In [16]:
display_articles(4)

0 Innovation https://en.wikipedia.org/wiki/Innovation
1 Competitive intelligence https://en.wikipedia.org/wiki/Creative_competitive_intelligence
1 Multiple discovery https://en.wikipedia.org/wiki/Multiple_discovery
1 UNDP Innovation Facility https://en.wikipedia.org/wiki/UNDP_Innovation_Facility
1 Open Innovations (event) https://en.wikipedia.org/wiki/Open_Innovations_(Forum_and_Technology_Show)
1 Trans-cultural diffusion https://en.wikipedia.org/wiki/Diffusion_(anthropology)
1 Individual capital https://en.wikipedia.org/wiki/Individual_capital
1 Innovation system https://en.wikipedia.org/wiki/Innovation_system
1 Public domain https://en.wikipedia.org/wiki/Public_domain
1 Ingenuity https://en.wikipedia.org/wiki/Ingenuity
1 Sustainable Development Goals https://en.wikipedia.org/wiki/Sustainable_Development_Goals
1 Participatory design https://en.wikipedia.org/wiki/Participatory_design
1 Innovation management https://en.wikipedia.org/wiki/Innovation_management
1 Information revolution ht

In [17]:
display_articles(5)

0 Collaboration https://en.wikipedia.org/wiki/Collaboration
1 Wikinomics https://en.wikipedia.org/wiki/Wikinomics
1 Collaborative editing https://en.wikipedia.org/wiki/Collaborative_editing
1 Telepresence https://en.wikipedia.org/wiki/Telepresence
1 Knowledge management https://en.wikipedia.org/wiki/Knowledge_management
1 The Culture of Collaboration https://en.wikipedia.org/wiki/The_Culture_of_Collaboration
1 Collaborative governance https://en.wikipedia.org/wiki/Collaborative_governance
1 Community film https://en.wikipedia.org/wiki/Community_film
1 Collaborative innovation network https://en.wikipedia.org/wiki/Collaborative_innovation_network
1 Design thinking https://en.wikipedia.org/wiki/Design_thinking
1 Role-based collaboration https://en.wikipedia.org/wiki/Role-based_collaboration
1 Intranet portal https://en.wikipedia.org/wiki/Intranet_portal
1 Critical thinking https://en.wikipedia.org/wiki/Critical_thinking
1 Facilitation (business) https://en.wikipedia.org/wiki/Facilitation

## 6. Analyzing a specific corpus based on a theme

In [18]:
def get_title(Corpus, theme_id):
    title = ''
    for article in Corpus:
        if article['theme.id'] == theme_id:
            title = article['title']
            break
    return title

### 6.0 Selecting the specific theme (a sub-corpus).

In [40]:
## For a different sub-corpus use a corresponding theme ID.
current_theme_id = 0

In [41]:
current_title = get_title(OM_Corpus, current_theme_id)

In [42]:
output_fname = "_".join([word.capitalize() for word in current_title.split(" ")])
print(current_title, "::", output_fname)

Do it yourself :: Do_It_Yourself


In [43]:
# Note that theme.id: 0 corresponds to the the Do IT YOURSELF
input_text = " ".join([page['text'] for page in OM_Corpus if page['title'] == current_title])

In [44]:
print(input_text)

Do it yourself 
 For other uses see 
 Do it yourself disambiguation 
 "DIY" redirects here For other uses see 
 DIY disambiguation 
 This article has multiple issues 
 Please help 
 improve it 
 or discuss these issues on the 
 talk page 
 Learn how and when to remove these template messages 
 This article 
 possibly contains 
 original research 
 Please 
 improve it 
 by 
 verifying 
 the claims made and adding 
 inline citations 
 Statements consisting only of original research should be removed 
 November 
 Learn how and when to remove this template message 
 This article 
 needs additional or better citations for 
 verification 
 Please help 
 improve this article 
 by 
 adding citations to reliable sources 
 Unsourced material may be challenged and removed 
 September 
 Learn how and when to remove this template message 
 Learn how and when to remove this template message 
 Part of a series on 
 Individualism 
 Topics and concepts 
 Autonomy 
 Civil liberties 
 Do it yourself 
 Er

In [45]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words ,current_title)

2703 Do it yourself


### 6.1 Computing frequency distributions of each token, i.e word, term, pancuation, etc.

In [46]:
input_freq_dist = FreqDist(tokenized)

In [47]:
input_freq_dist.most_common(20)

[('\n', 472),
 ('the', 100),
 ('of', 72),
 ('and', 66),
 ('to', 59),
 ('a', 40),
 ('diy', 38),
 ('in', 38),
 ('"', 26),
 ('as', 21),
 ('on', 20),
 ('this', 17),
 ('or', 17),
 ('by', 16),
 ('with', 14),
 ('it', 13),
 ('how', 13),
 ('for', 12),
 ('is', 12),
 ('home', 11)]

### 6.2 Removing punctuation and stopwords from the input corpus

In [48]:
for stopword in STOP_WORDS:
    if stopword in input_freq_dist:
        del input_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in input_freq_dist:
        del input_freq_dist[punctuation]

# Re-control most common words after cleaning:
input_freq_dist.most_common(80)

[('diy', 38),
 ('home', 11),
 ('improvement', 9),
 ('building', 8),
 ('magazine', 8),
 ('various', 7),
 ('people', 6),
 ('projects', 6),
 ('old', 6),
 ('make', 6),
 ('tools', 6),
 ('books', 6),
 ('popular', 6),
 ('television', 6),
 ('learn', 5),
 ('anarchism', 5),
 ('social', 5),
 ('materials', 5),
 ('term', 5),
 ('wide', 5),
 ('punk', 5),
 ('music', 5),
 ('movement', 5),
 ('culture', 5),
 ('way', 5),
 ('house', 5),
 ('1970s', 5),
 ('using', 5),
 ('zines', 5),
 ('article', 4),
 ('help', 4),
 ('improve', 4),
 ('remove', 4),
 ('template', 4),
 ('material', 4),
 ('personal', 4),
 ('community', 4),
 ('do-it-yourself', 4),
 ('related', 4),
 ('used', 4),
 ('use', 4),
 ('websites', 4),
 ('early', 4),
 ('first', 4),
 ('catalog', 4),
 ('techniques', 4),
 ('design', 4),
 ('maker', 4),
 ('bands', 4),
 ('uses', 3),
 ('please', 3),
 ('research', 3),
 ('citations', 3),
 ('message', 3),
 ('better', 3),
 ('part', 3),
 ('topics', 3),
 ('free', 3),
 ('rights', 3),
 ('individual', 3),
 ('liberty', 3),
 (

### 6.3 Removing rare words from input distribution

In [49]:
input_freq_dist = {k:v for k,v in input_freq_dist.items() if v > 1}

## 7. Comparing input vs English corpus volumes

### 7.1 Total words (after cleaning) 

In [50]:
n_input = sum(input_freq_dist.values())
n_english = sum(english_freq_dist.values())
n_input, n_english

(688, 679519)

### 7.2 Unique words (after cleaning)

In [51]:
n_unique_word_input = len(input_freq_dist.items())
n_unique_word_brown = len(english_freq_dist.items())
n_unique_word_input, n_unique_word_brown

(224, 20591)

### 7.3 Cleaned set of input words/terms

List of words in the corpus in case, for a visual inspection. Such inspections will be used both to improve tokenization as well as filtering.

In [52]:
input_freq_dist

{"'": 2,
 '1960s': 2,
 '1970s': 5,
 'adding': 2,
 'albert': 2,
 'along': 3,
 'alternative': 3,
 'anarchism': 5,
 'around': 2,
 'arthur': 2,
 'article': 4,
 'articles': 3,
 'associated': 2,
 'bands': 4,
 'based': 3,
 'bc': 3,
 'began': 3,
 'bending': 2,
 'better': 3,
 'book': 2,
 'books': 6,
 'brand': 2,
 'broad': 2,
 'building': 8,
 'built': 2,
 'cable': 2,
 'california': 2,
 'came': 2,
 'catalog': 4,
 'century': 2,
 'channel': 2,
 'children': 2,
 'circuit': 2,
 'citations': 3,
 'clothes': 2,
 'common': 3,
 'community': 4,
 'computers': 3,
 'content': 3,
 'continues': 2,
 'craft': 3,
 'crafting': 3,
 'crafts': 2,
 'created': 3,
 'creating': 2,
 'culture': 5,
 'de': 2,
 'design': 4,
 'disambiguation': 2,
 'diy': 38,
 'do-it-yourself': 4,
 'e': 3,
 'early': 4,
 'earth': 2,
 'edition': 2,
 'electronics': 2,
 'encouraging': 2,
 'environmental': 2,
 'equipment': 3,
 'etc': 2,
 'extensive': 3,
 'faire': 2,
 'fashion': 3,
 'featuring': 2,
 'feminism': 3,
 'fiberglass': 2,
 'first': 4,
 'focus

### 7.4 Set of terms/words that occure in both corpus.

In [53]:
common_words = [w for w in input_freq_dist.keys() & english_freq_dist.keys()]
print(len(common_words))

191


In [54]:
for w in common_words: print(w)

albert
show
spurred
children
remove
without
motivations
part
north
range
launched
uses
first
previously
individualism
using
found
labels
network
around
community
kind
prose
professional
published
original
merchandise
late
sites
skills
l
whole
focusing
used
alternative
improve
please
make
game
living
traditional
building
grow
e
school
article
trend
scenes
along
created
people
environmental
message
electronics
terms
others
series
stewart
brand
music
radio
world
common
showing
needs
edition
removed
lack
musical
old
free
mechanics
featuring
based
quickly
projects
began
house
someone
better
tools
independent
skill
bending
california
web
means
'
life
encouraging
fashion
number
channel
multiple
earth
homes
large
maintenance
issues
growing
cable
use
site
circuit
low-cost
articles
followed
television
home
maker
topics
built
since
research
craft
de
john
items
publication
various
liberalism
do-it-yourself
wide
came
making
associated
rock
magazines
media
adding
design
liberty
learn
techniques
band

### 7.5 Set of terms/words that occure in the sample but not in the reference corpus.

TO BE EXAMINED: This specific set needs to be incorporated. In fact, it may capture specifity of the content to a great extend. We need to assign a mapping score for each words in this set.

In [55]:
input_specifics = dict()
for w in input_freq_dist.keys() - english_freq_dist.keys():
    input_specifics[w] = input_freq_dist[w]
    print(w)

knitting
zine
subculture
faire
1970s
disambiguation
feminism
bc
crafting
online
hometalk
punk
anarchism
websites
website
etc
friedrich
fiberglass
1960s
self-publishing
zines
libertarianism
instructables
recycle
diy
citations
informational
indie
reuse
home-improvement
how-to
michel
jeans


In [56]:
print(len(input_specifics))

33


## 8. Stemming (in case needed) 

In [57]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
for k,v in input_freq_dist.items():
    stemmed = stemmer.stem(k)
    if stemmed != k: print(k, "->", stemmed)

uses -> use
disambiguation -> disambigu
article -> articl
multiple -> multipl
issues -> issu
please -> pleas
improve -> improv
remove -> remov
template -> templat
original -> origin
adding -> ad
citations -> citat
removed -> remov
november -> novemb
message -> messag
needs -> need
material -> materi
series -> seri
individualism -> individu
topics -> topic
rights -> right
individual -> individu
liberty -> liberti
personal -> person
property -> properti
anarchism -> anarch
liberalism -> liber
feminism -> femin
libertarianism -> libertarian
tyranny -> tyranni
building -> build
modifying -> modifi
things -> thing
materials -> materi
various -> variou
motivations -> motiv
previously -> previous
community -> commun
associated -> associ
since -> sinc
improvement -> improv
maintenance -> mainten
people -> peopl
projects -> project
range -> rang
alternative -> altern
indie -> indi
scenes -> scene
related -> relat
crafts -> craft
culture -> cultur
others -> other
used -> use
subculture -> subcul

## 9. Computing representation power of common words.

In [58]:
# combine
makerness = {}
# common_words = [w[0] for w in common_words]
for w in common_words:
    # Consider only words whose charcater length is larger than 1
    if len(w) > 1:
        # Log likelihood scores are computed:
        score = log((input_freq_dist[w] / n_input) / (english_freq_dist[w] / n_english))
        makerness[w] = (score, input_freq_dist[w])

In [59]:
# Sorting by scores:
for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True): print(v[0],k,v[1])

7.183033708437422 catalog 4
6.672208084671431 do-it-yourself 4
6.672208084671431 template 4
6.489886527877476 labels 2
6.489886527877476 individualist 2
6.489886527877476 crafts 2
6.38452601221965 computers 3
6.202204455425695 featuring 2
6.202204455425695 low-cost 3
6.202204455425695 modifying 2
5.979060904111486 motivations 2
5.883750724307161 bands 4
5.796739347317531 spurred 2
5.796739347317531 focusing 2
5.796739347317531 stewart 2
5.796739347317531 web 2
5.796739347317531 maker 4
5.691378831659705 topics 3
5.642588667490273 merchandise 2
5.642588667490273 environmental 2
5.642588667490273 bending 2
5.642588667490273 cable 2
5.429014567192214 media 3
5.403696759207924 improvement 9
5.31123153153583 magazine 8
5.1906035437472156 tyranny 2
5.160750580597535 tools 6
5.103592166757586 individualism 2
4.949441486930327 prose 2
4.949441486930327 sunset 2
4.880448615443376 liberalism 2
4.858469708724601 craft 3
4.858469708724601 publications 3
4.815910094305805 sites 2
4.815910094305805 

In [60]:
OUTPUT_FOLDER = "./output/"
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
with open(csvfile_name, 'w') as csvfile:
    thewriter = csv.writer(csvfile, delimiter=',')
    for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True):
        thewriter.writerow([k,v[0],v[1]])