# Discovery and Representation of Open Making Related Terms

This notebook sketches the initial exercise on discovering the open making related keywords. The input text is harvested via a Web crawler that identifies and crawls semantically related wikipedia articles.   

In [1]:
from utils import tokenizer
import nltk
from nltk import FreqDist
from math import log
import json, csv

## 1. Loading a reference English language corpus

In [2]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

## 2. Stop words

### 2.1 Standard stop words

In [3]:
with open("data/stopwords_standard.txt", "r") as f:
    STOP_WORDS_STANDARD = set(f.read().strip().split("\n"))
print(STOP_WORDS_STANDARD)

{"isn't", 'they', "aren't", 'of', "what's", 'how', "it's", 'my', "he'd", "you'd", 'know', 'a', 'yourself', 'nor', "hasn't", 'are', "how's", 'an', "they're", 'yours', 'your', 'if', 'the', 'than', 'it', "that's", "couldn't", 'such', 'up', 'once', 'because', 'some', 'does', 'those', 'this', "we're", 'more', 'you', 'to', 'while', 'me', 'but', 'these', "she'd", 'where', "who's", 'out', 'ourselves', 'should', 'when', "they'd", 'few', 'was', 'what', 'all', "you're", 'each', 'under', 'we', 'who', 'and', 'is', "wasn't", 'very', 'at', "can't", 'no', 'would', 'with', 'after', 'can', 'as', 'theirs', "there's", 'his', 'being', "i've", "he'll", "i'd", "won't", "doesn't", 'having', "they've", 'hers', 'am', "where's", 'were', 'him', "why's", "we'll", 'again', 'in', 'myself', 'through', 'get', 'be', 'then', "weren't", 'she', "he's", 'which', 'down', 'just', 'only', 'both', 'has', 'any', 'here', 'off', 'about', 'into', 'most', "shan't", "wouldn't", 'herself', "we've", 'them', "let's", 'that', "they'll",

### 2.2 Open-making related stop words

In [4]:
with open("data/stopwords_openmaker.txt", "r") as f:
    STOP_WORDS_OPENMAKER = set(f.read().strip().split("\n"))
print(STOP_WORDS_OPENMAKER)

{'may', 'almost'}


## 3. Removing stop words from the reference English corpus

In [5]:
# merging the two list together
STOP_WORDS = STOP_WORDS_STANDARD.union(STOP_WORDS_OPENMAKER)
print(STOP_WORDS)

{"isn't", 'they', "aren't", "what's", "it's", 'my', "you'd", "he'd", 'know', 'yourself', 'nor', "they're", 'the', 'it', "couldn't", 'once', 'some', 'does', 'this', "we're", 'while', 'should', 'when', 'is', 'at', "can't", 'no', 'would', 'after', 'can', 'theirs', "there's", 'his', "he'll", "i've", "won't", "i'd", 'having', "they've", 'were', "why's", 'again', 'get', "weren't", 'only', 'both', 'any', 'here', 'most', 'them', "let's", 'http', 'have', "don't", "you'll", "shouldn't", 'before', 'on', "mustn't", "i'll", 'r', 'been', 'there', 'too', 'from', "we'd", 'so', 'below', 'ours ', "i'm", 'our', 'doing', 'its', 'her', 'he', 'i', "haven't", 'for', 'of', 'how', 'a', "hasn't", 'are', "how's", 'an', 'yours', 'your', 'if', 'than', "that's", 'such', 'up', 'because', 'those', 'more', 'you', 'to', 'me', 'but', 'these', "she'd", 'where', "who's", 'out', 'ourselves', "they'd", 'few', 'was', 'what', 'all', "you're", 'each', 'under', 'we', 'who', 'and', "wasn't", 'very', 'with', 'as', 'being', "doesn

In [6]:
# load english words from the Brown corpus removing stop words.
english_freq_dist = FreqDist([w.lower() for w in nltk.corpus.brown.words()
                              if w not in STOP_WORDS])

## 4. Removing the rare words.

Below we remove rare words and get total count. The code below keeps all words with a occurance frequency above 2. 

In [7]:
english_freq_dist = {k:v for k,v in english_freq_dist.items() if v > 2}

## 5. Loading the input Open Maker corpus

In [8]:
# load the harvested text from wikipedia.
with open("data/wikipedia.json", "r") as f: OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [9]:
# The toatl number of wiki articles used:
print(len(OM_Corpus))

152


In [10]:
# Column names of the the corpus.
OM_Corpus[0].keys()

dict_keys(['theme.id', 'title', 'url', 'depth', 'text'])

In [11]:
def display_pages(tid):
    meme = [page for page in OM_Corpus if page['theme.id'] == tid]
    for m in meme:
        print(m['depth'],m['title'], m['url'])

In [12]:
display_pages(0)

0 Do it yourself https://en.wikipedia.org/wiki/Do_it_yourself
1 Edupunk https://en.wikipedia.org/wiki/Edupunk
1 Prosumer https://en.wikipedia.org/wiki/Prosumer
1 How-to https://en.wikipedia.org/wiki/How-to
1 Kludge https://en.wikipedia.org/wiki/Kludge
1 Bricolage https://en.wikipedia.org/wiki/Bricolage
1 Junk box https://en.wikipedia.org/wiki/Junk_box
1 Number 8 wire https://en.wikipedia.org/wiki/Number_8_wire
1 Ready-to-assemble furniture https://en.wikipedia.org/wiki/Ready-to-assemble_furniture
1 Open design https://en.wikipedia.org/wiki/Open_Design
1 Hackerspace https://en.wikipedia.org/wiki/Hackerspace
1 Instructables https://en.wikipedia.org/wiki/Instructables
1 Handyman https://en.wikipedia.org/wiki/Handyman
1 Circuit bending https://en.wikipedia.org/wiki/Circuit_bending
1 Project GreenWorld International https://en.wikipedia.org/wiki/Project_GreenOman
1 3D printing https://en.wikipedia.org/wiki/3D_printing


In [13]:
display_pages(1)

0 Open design https://en.wikipedia.org/wiki/Open_design
1 Knowledge commons https://en.wikipedia.org/wiki/Knowledge_commons
1 Open Source Ecology https://en.wikipedia.org/wiki/Open_Source_Ecology
1 Computer-aided design https://en.wikipedia.org/wiki/Computer-aided_design
1 Open Source Initiative https://en.wikipedia.org/wiki/Open_Source_Initiative
1 Open Architecture Network https://en.wikipedia.org/wiki/Open_Architecture_Network
1 Open-source architecture https://en.wikipedia.org/wiki/Open-source_architecture
1 Commons-based peer production https://en.wikipedia.org/wiki/Commons-based_peer_production
1 Open standard https://en.wikipedia.org/wiki/Open_standard
1 OpenCores https://en.wikipedia.org/wiki/OpenCores
1 Co-creation https://en.wikipedia.org/wiki/Co-creation
1 OpenBTS https://en.wikipedia.org/wiki/OpenBTS
1 Open manufacturing https://en.wikipedia.org/wiki/Open_manufacturing
1 Open-source hardware https://en.wikipedia.org/wiki/Open-source_hardware
1 Open source appropriate techno

In [14]:
display_pages(2)

0 Sustainability https://en.wikipedia.org/wiki/Sustainability
1 Sustainability standards and certification https://en.wikipedia.org/wiki/Sustainability_standards_and_certification
1 Appropriate technology https://en.wikipedia.org/wiki/Appropriate_technology
1 Sustainable development https://en.wikipedia.org/wiki/Sustainable_development
1 Environmental issue https://en.wikipedia.org/wiki/Environmental_issue
1 World Cities Summit https://en.wikipedia.org/wiki/World_Cities_Summit
1 Ecopsychology https://en.wikipedia.org/wiki/Ecopsychology
1 Book:Sustainability https://en.wikipedia.org/wiki/Book:Sustainability
1 Sustainable design https://en.wikipedia.org/wiki/Sustainable_design
1 Circles of Sustainability https://en.wikipedia.org/wiki/Circles_of_Sustainability
1 Sustainability science https://en.wikipedia.org/wiki/Sustainability_science
1 Sustainable living https://en.wikipedia.org/wiki/Sustainable_living
1 Index of sustainability articles https://en.wikipedia.org/wiki/List_of_sustainabil

In [15]:
display_pages(3)

0 Maker culture https://en.wikipedia.org/wiki/Maker_culture
1 Modular design https://en.wikipedia.org/wiki/Modular_design
1 Open-source car https://en.wikipedia.org/wiki/Open-source_car
1 Electric vehicle conversion https://en.wikipedia.org/wiki/Electric_vehicle_conversion
1 Thingiverse https://en.wikipedia.org/wiki/Thingiverse
1 Fab lab https://en.wikipedia.org/wiki/Fab_Lab_(fabrication_laboratory)
1 SparkFun Electronics https://en.wikipedia.org/wiki/SparkFun
1 RepRap project https://en.wikipedia.org/wiki/RepRap
1 Distributed manufacturing https://en.wikipedia.org/wiki/Distributed_manufacturing
1 Craft production https://en.wikipedia.org/wiki/Craft_production
1 Autonomous building https://en.wikipedia.org/wiki/Autonomous_building
1 Open-source hardware https://en.wikipedia.org/wiki/Open_source_hardware
1 Kit car https://en.wikipedia.org/wiki/Kit_car


In [16]:
display_pages(4)

0 Innovation https://en.wikipedia.org/wiki/Innovation
1 Competitive intelligence https://en.wikipedia.org/wiki/Creative_competitive_intelligence
1 Multiple discovery https://en.wikipedia.org/wiki/Multiple_discovery
1 UNDP Innovation Facility https://en.wikipedia.org/wiki/UNDP_Innovation_Facility
1 Open Innovations (event) https://en.wikipedia.org/wiki/Open_Innovations_(Forum_and_Technology_Show)
1 Trans-cultural diffusion https://en.wikipedia.org/wiki/Diffusion_(anthropology)
1 Individual capital https://en.wikipedia.org/wiki/Individual_capital
1 Innovation system https://en.wikipedia.org/wiki/Innovation_system
1 Public domain https://en.wikipedia.org/wiki/Public_domain
1 Ingenuity https://en.wikipedia.org/wiki/Ingenuity
1 Sustainable Development Goals https://en.wikipedia.org/wiki/Sustainable_Development_Goals
1 Participatory design https://en.wikipedia.org/wiki/Participatory_design
1 Innovation management https://en.wikipedia.org/wiki/Innovation_management
1 Information revolution ht

In [17]:
display_pages(5)

0 Collaboration https://en.wikipedia.org/wiki/Collaboration
1 Wikinomics https://en.wikipedia.org/wiki/Wikinomics
1 Collaborative editing https://en.wikipedia.org/wiki/Collaborative_editing
1 Telepresence https://en.wikipedia.org/wiki/Telepresence
1 Knowledge management https://en.wikipedia.org/wiki/Knowledge_management
1 The Culture of Collaboration https://en.wikipedia.org/wiki/The_Culture_of_Collaboration
1 Collaborative governance https://en.wikipedia.org/wiki/Collaborative_governance
1 Community film https://en.wikipedia.org/wiki/Community_film
1 Collaborative innovation network https://en.wikipedia.org/wiki/Collaborative_innovation_network
1 Design thinking https://en.wikipedia.org/wiki/Design_thinking
1 Role-based collaboration https://en.wikipedia.org/wiki/Role-based_collaboration
1 Intranet portal https://en.wikipedia.org/wiki/Intranet_portal
1 Critical thinking https://en.wikipedia.org/wiki/Critical_thinking
1 Facilitation (business) https://en.wikipedia.org/wiki/Facilitation

## 6. Analyzing a specific theme via a theme

In [18]:
# Note that theme.id: 0 corresponds to the the Do IT YOURSELF
input_text = " ".join([page['text'] for page in OM_Corpus if page['theme.id'] == 0])

In [19]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
number_of_words = len(tokenized)
print(number_of_words)

30073


### 6.1 Computing frequency distributions of each token, i.e word, term, pancuation, etc.

In [20]:
input_freq_dist = FreqDist(tokenized)

In [21]:
input_freq_dist.most_common(20)

[('\n', 3787),
 ('the', 1257),
 ('and', 776),
 ('of', 771),
 ('a', 661),
 ('to', 642),
 ('in', 563),
 ('"', 429),
 ('is', 303),
 ('as', 276),
 ('for', 257),
 ('that', 224),
 ('or', 206),
 ('by', 186),
 ('with', 182),
 ('on', 156),
 ('are', 151),
 ('3d', 142),
 ('from', 129),
 ('it', 119)]

### 6.2 Removing punctuation and stopwords from the input corpus

In [22]:
for stopword in STOP_WORDS:
    if stopword in input_freq_dist:
        del input_freq_dist[stopword]
        
for punctuation in tokenizer.CHARACTERS_TO_SPLIT:
    if punctuation in input_freq_dist:
        del input_freq_dist[punctuation]

# Re-control most common words after cleaning:
input_freq_dist.most_common(20)

[('3d', 142),
 ('printing', 94),
 ('design', 75),
 ('used', 72),
 ('open', 65),
 ('also', 63),
 ('one', 58),
 ('new', 56),
 ('many', 55),
 ('kludge', 55),
 ('term', 53),
 ('diy', 52),
 ('manufacturing', 51),
 ('use', 50),
 ('project', 49),
 ('bricolage', 46),
 ('often', 45),
 ('work', 45),
 ('hackerspaces', 44),
 ('handyman', 43)]

### 6.3 Removing rare words from input distribution

In [23]:
input_freq_dist = {k:v for k,v in input_freq_dist.items() if v > 1}

## 7. Comparing input vs English corpus volumes

### 7.1 Total words (after cleaning) 

In [24]:
n_input = sum(input_freq_dist.values())
n_english = sum(english_freq_dist.values())
n_input, n_english

(12914, 685422)

### 7.2 Unique words (after cleaning)

In [25]:
n_unique_word_input = len(input_freq_dist.items())
n_unique_word_brown = len(english_freq_dist.items())
n_unique_word_input, n_unique_word_brown

(2386, 20591)

### 7.3 Cleaned set of input words/terms

List of words in the corpus in case, for a visual inspection. Such inspections will be used both to improve tokenization as well as filetring.

In [26]:
input_freq_dist

{'uses': 23,
 'see': 29,
 'disambiguation': 5,
 'diy': 52,
 'redirects': 3,
 'article': 21,
 'multiple': 13,
 'issues': 10,
 'please': 9,
 'help': 20,
 'improve': 15,
 'discuss': 4,
 'page': 2,
 'learn': 14,
 'remove': 11,
 'template': 10,
 'possibly': 4,
 'contains': 3,
 'original': 11,
 'research': 17,
 'verifying': 2,
 'claims': 5,
 'made': 19,
 'adding': 11,
 'inline': 2,
 'citations': 7,
 'statements': 2,
 'consisting': 3,
 'removed': 8,
 'november': 3,
 'message': 10,
 'needs': 8,
 'additional': 5,
 'better': 6,
 'verification': 3,
 'reliable': 3,
 'sources': 12,
 'unsourced': 3,
 'material': 27,
 'challenged': 3,
 'september': 2,
 'part': 23,
 'series': 11,
 'individualism': 4,
 'topics': 4,
 'concepts': 2,
 'autonomy': 2,
 'free': 29,
 'love': 6,
 'freethought': 2,
 'human': 12,
 'rights': 8,
 'individual': 13,
 'reclamation': 2,
 'liberty': 4,
 'negative': 3,
 'personal': 7,
 'property': 12,
 'positive': 4,
 'private': 4,
 'self-ownership': 2,
 'mile': 2,
 'armand': 2,
 'alber

### 7.4 Set of terms/words that occure in both corpus.

In [27]:
common_words = [w for w in input_freq_dist.keys() & english_freq_dist.keys()]
print(len(common_words))

1945


In [28]:
for w in common_words: print(w)

restricted
advertise
agency
continued
collection
defense
pieces
present
guns
accept
date
potential
psychological
institute
spare
claims
everyone
solve
war
regulations
food
cleaning
october
teaching
record
beam
distributed
successful
middle
origins
around
adopted
do-it-yourself
higher
meets
tech
perform
tangible
feedback
last
activity
liquid
interesting
supplying
fixture
later
computing
desired
men's
reduce
increase
distribution
vehicle
internal
fence
said
satisfy
hours
additive
creating
projects
march
aware
well
heritage
introducing
texts
unusual
libraries
disciplines
bring
quality
don
proof
language
solid
artistic
green
collective
achieve
condition
removal
service
awarded
applications
yiddish
widespread
seems
korea
alvin
however
publication
though
economies
cross-section
long
injury
adding
category
powerful
demands
sound
influence
features
within
words
carpentry
chain
economic
james
organizations
pattern
century
jet
deliver
fourth
stock
welcome
casting
example
modern
raymond
continues

### 7.5 Set of terms/words that occure in the sample but not in the reference corpus.

TO BE EXAMINED: This specific set needs to be incorporated. In fact, it may capture specifity of the content to a great extend. We need to assign a mapping score for each words in this set.

In [29]:
input_specifics = dict()
for w in input_freq_dist.keys() - english_freq_dist.keys():
    input_specifics[w] = input_freq_dist[w]
    print(w)

's
unintended
disambiguation
kerala
targeted
youtube
vernacular
threeding
feeder
feminist
josiah
rica
powertrain
websites
update
autonomous
patio
bc
inc
jugaad
sudev
day'
modeling
technologies
metalworking
bahrain
kluge
fiberglass
redirected
printers
timelapse
subtractive
tinker
etymologies
wiktionary
klooj
1970s
homebrewers
forums
handymen
vatakara
sprinkler
sub-cultures
weblog
word's
copyrightable
collaborative
unsourced
encyclopedic
workspace
2009-02-23
website
adjective
branding
homebrewing
granholm
futurologist
anthropologist
complementing
non-corporate
tiling
openbook
community-operated
genre
wikimedia
spence
gingrich
desktop
how-to
edupunk
top-down
jetting
customizing
costa
nigeria
cf
innovators
fdm
fasteners
rothbard
laptop
o'reilly
libertarian
programme
open-source
hackers
self-ownership
audio
solidified
anarchism
trash
professionalism
ceo
knockdown
infoanarchism
app
logo
retrieve
displacing
wipers
computer-aided
pronounce
imperative'
toolroom
actuator
granules
kludges
hirst
k

In [30]:
print(len(input_specifics))

441


## 8. Stemming (in case needed) 

In [31]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
for k,v in input_freq_dist.items():
    stemmed = stemmer.stem(k)
    if stemmed != k: print(k, "->", stemmed)

uses -> use
disambiguation -> disambigu
redirects -> redirect
article -> articl
multiple -> multipl
issues -> issu
please -> pleas
improve -> improv
remove -> remov
template -> templat
possibly -> possibl
contains -> contain
original -> origin
verifying -> verifi
claims -> claim
adding -> ad
inline -> inlin
citations -> citat
statements -> statement
consisting -> consist
removed -> remov
november -> novemb
message -> messag
needs -> need
additional -> addit
verification -> verif
reliable -> reliabl
sources -> sourc
unsourced -> unsourc
material -> materi
challenged -> challeng
september -> septemb
series -> seri
individualism -> individu
topics -> topic
concepts -> concept
autonomy -> autonomi
rights -> right
individual -> individu
reclamation -> reclam
liberty -> liberti
negative -> neg
personal -> person
property -> properti
positive -> posit
private -> privat
lysander -> lysand
henry -> henri
james -> jame
anarchism -> anarch
anarcho-capitalism -> anarcho-capit
liberalism -> liber
f

## 9. Computing representation power of common words.

In [32]:
# combine
makerness = {}
for w in common_words:
    # Consider only words whose charcater length is larger than 1
    if len(w) > 1:
        # Log likelihood scores are computed:
        score = log((input_freq_dist[w] / n_input) / (english_freq_dist[w] / n_english))
        makerness[w] = score

In [33]:
# Sorting by scores:
for k,v in sorted(makerness.items(), key=lambda x:x[1], reverse=True): print(k,v)

additive 6.240406254084403
printer 5.86884269765192
printing 5.624645737139878
digital 5.398839068406185
franchise 5.13487352257172
users 5.070335001434149
global 5.070335001434149
citation 5.001342129947197
bending 4.9702515428771665
deposition 4.819020573153243
do-it-yourself 4.760180073130309
manufacturing 4.725494515142419
computers 4.664869893325984
non-profit 4.664869893325984
hardware 4.664869893325984
junk 4.664869893325984
template 4.664869893325984
jargon 4.664869893325984
evolutionary 4.664869893325984
homeowners 4.664869893325984
layer 4.622310278907189
fabrication 4.600331372188413
bug 4.531338500701462
lab 4.48254833653203
zealand 4.48254833653203
portal 4.48254833653203
hack 4.48254833653203
coined 4.48254833653203
individualist 4.48254833653203
computer 4.451295793027925
media 4.451295793027925
commons 4.377187820874203
consumers 4.339447492891357
maker 4.320029407034255
circuit 4.270215701322035
prototype 4.2594047852178205
fused 4.2594047852178205
enthusiasts 4.259404

In [34]:
with open('makerness.csv', 'w') as csvfile:
    thewriter = csv.writer(csvfile, delimiter=',')
    for k,v in sorted(makerness.items(), key=lambda x:x[1], reverse=True):
        thewriter.writerow([k,v])