# Discovery and Representation of Open Making Related Terms

This notebook sketches an initial exercise in discovering keywords related to open making. The input text is harvested by a web crawler that identifies and crawls semantically related Wikipedia articles.

In [None]:
import csv
import json
import os
from math import log

import nltk
from nltk import FreqDist

from utils import tokenizer

## 1. Loading a reference English language corpus

In [None]:
# The Brown corpus serves as the reference English corpus in this notebook.
# NOTE(review): the corpus data presumably has to be installed locally first
# (e.g. nltk.download('brown')) -- confirm for your environment.
from nltk.corpus import brown
# List the category labels available in the corpus.
brown.categories()

## 2. Stop words

### 2.1 Standard stop words

In [None]:
# Load the standard stop words (one entry per line).
# Each line is stripped and blank lines are skipped, so neither an empty string
# nor a whitespace-padded word ends up in the set. The file is read as UTF-8
# explicitly so the result does not depend on the platform default encoding.
with open("data/stopwords_standard.txt", "r", encoding="utf-8") as f:
    STOP_WORDS_STANDARD = {line.strip() for line in f if line.strip()}
print(STOP_WORDS_STANDARD)

### 2.2 Open-making related stop words

In [None]:
# Load the open-making specific stop words (one entry per line).
# Each line is stripped and blank lines are skipped, so neither an empty string
# nor a whitespace-padded word ends up in the set. The file is read as UTF-8
# explicitly so the result does not depend on the platform default encoding.
with open("data/stopwords_openmaker.txt", "r", encoding="utf-8") as f:
    STOP_WORDS_OPENMAKER = {line.strip() for line in f if line.strip()}
print(STOP_WORDS_OPENMAKER)

## 3. Removing stop words from the reference English corpus

In [None]:
# Combine the standard and the open-making stop words into a single set.
STOP_WORDS = STOP_WORDS_STANDARD | STOP_WORDS_OPENMAKER
print(STOP_WORDS)

In [None]:
# Build the reference frequency distribution from the Brown corpus.
# Tokens are lower-cased *before* the stop-word test, so capitalised forms such
# as "The" are filtered out too (assumes the stop-word lists are stored in
# lower case -- TODO confirm against the data files).
english_freq_dist = FreqDist(
    token
    for token in (w.lower() for w in nltk.corpus.brown.words())
    if token not in STOP_WORDS
)

## 4. Removing the rare words.

Below we remove the rare words from the reference corpus. The code keeps only the words whose occurrence count is above 2. The total word count is computed afterwards, in section 7.1.

In [None]:
# Keep only the words that occur more than twice in the reference corpus.
english_freq_dist = dict(
    (word, count) for word, count in english_freq_dist.items() if count > 2
)

## 5. Loading the input Open Maker corpus

In [None]:
# Load the text harvested from Wikipedia.
# The JSON file is decoded as UTF-8 explicitly so that non-ASCII article text is
# read the same way on every platform (the default encoding is locale dependent).
with open("data/wikipedia.json", "r", encoding="utf-8") as f:
    OM_Corpus_text = f.read()
OM_Corpus = json.loads(OM_Corpus_text)

In [None]:
# Total number of Wikipedia articles in the loaded corpus.
print(len(OM_Corpus))

In [None]:
# Field names of the first article record in the corpus.
OM_Corpus[0].keys()

In [None]:
def display_articles(tid, corpus=None):
    """Print depth, title and URL of every article that belongs to a theme.

    Parameters
    ----------
    tid
        Theme identifier; compared with each article's 'theme.id' field.
    corpus : iterable of dict, optional
        Article records to search. Each matching record must provide the keys
        'depth', 'title' and 'url'. When omitted, the notebook-level
        ``OM_Corpus`` is used, which keeps existing ``display_articles(tid)``
        calls working unchanged.
    """
    if corpus is None:
        # Resolved at call time so the function picks up the loaded corpus.
        corpus = OM_Corpus
    for article in corpus:
        if article['theme.id'] == tid:
            print(article['depth'], article['title'], article['url'])

In [None]:
# List the articles of themes 0 to 5, one theme after the other, instead of
# repeating the same call in six separate cells.
for theme_id in range(6):
    print("--- theme.id:", theme_id, "---")
    display_articles(theme_id)

## 6. Analyzing a specific corpus based on a theme

In [None]:
def get_title(Corpus, theme_id):
    """Return the title of the first article in ``Corpus`` whose 'theme.id'
    equals ``theme_id``; return an empty string when no article matches."""
    matching_titles = (
        entry['title'] for entry in Corpus if entry['theme.id'] == theme_id
    )
    return next(matching_titles, '')

### 6.0 Selecting the specific theme (a sub-corpus).

In [None]:
## For a different sub-corpus use a corresponding theme ID.
# The value is compared with each article's 'theme.id' field in the cells below.
current_theme_id = 1

In [None]:
# Title of the first article of the selected theme ('' if no article matches).
current_title = get_title(OM_Corpus, current_theme_id)

In [None]:
# Derive the output file stem from the title: capitalise every space-separated
# word and join the words with underscores.
capitalized_words = map(str.capitalize, current_title.split(" "))
output_fname = "_".join(capitalized_words)
print(current_title, "::", output_fname)

In [None]:
# Concatenate the text of every page of the selected theme into one string,
# separated by single spaces.
# Note (from the original author): theme.id 0 corresponds to "Do It Yourself".
theme_texts = (
    page['text'] for page in OM_Corpus if page['theme.id'] == current_theme_id
)
input_text = " ".join(theme_texts)

In [None]:
# Show only the beginning of the text: printing the whole sub-corpus floods the
# notebook output and bloats the saved file. Raise the limit for a longer look.
PREVIEW_CHARS = 2000
print(input_text[:PREVIEW_CHARS])

In [None]:
# Tokenizing the input text:
tokenized = tokenizer.tokenize_words(input_text)
# Number of tokens returned by the tokenizer (stop words and punctuation are
# removed later, in section 6.2).
number_of_words = len(tokenized)
print(number_of_words ,current_title)

### 6.1 Computing frequency distributions of each token, i.e. word, term, punctuation, etc.

In [None]:
# Count how often each token occurs in the tokenized input text.
input_freq_dist = FreqDist(tokenized)

In [None]:
# The 20 most frequent tokens, before stop words and punctuation are removed.
input_freq_dist.most_common(20)

### 6.2 Removing punctuation and stopwords from the input corpus

In [None]:
# Drop every stop word and every split character (punctuation) from the input
# distribution. Entries are removed with `del` on purpose, so the distribution's
# own item-deletion logic is used.
tokens_to_drop = STOP_WORDS.union(tokenizer.CHARACTERS_TO_SPLIT)
for token in tokens_to_drop:
    if token in input_freq_dist:
        del input_freq_dist[token]

# Re-check the most common words after cleaning:
input_freq_dist.most_common(80)

### 6.3 Removing rare words from input distribution

In [None]:
# Keep only the words that occur more than once in the input corpus.
input_freq_dist = dict(
    (word, count) for word, count in input_freq_dist.items() if count > 1
)

## 7. Comparing input vs English corpus volumes

### 7.1 Total words (after cleaning) 

In [None]:
# Total word counts of both distributions after cleaning; they are used later to
# turn raw counts into relative frequencies when scoring.
n_input = sum(input_freq_dist.values())
n_english = sum(english_freq_dist.values())
n_input, n_english

### 7.2 Unique words (after cleaning)

In [None]:
# Vocabulary sizes: number of distinct words left in each distribution.
n_unique_word_input = len(input_freq_dist)
n_unique_word_brown = len(english_freq_dist)
n_unique_word_input, n_unique_word_brown

### 7.3 Cleaned set of input words/terms

The list of words in the current sub-corpus, shown for visual inspection. Such inspections will be used to improve both the tokenization and the filtering.

In [None]:
# Display the cleaned word -> count mapping for visual inspection.
input_freq_dist

### 7.4 Set of terms/words that occur in both corpora.

In [None]:
# Words present in both the input and the reference distribution.
# The intersection is sorted because set iteration order for strings changes
# between kernel runs; sorting makes the listing below and the tie order in the
# exported ranking reproducible.
common_words = sorted(input_freq_dist.keys() & english_freq_dist.keys())
print(len(common_words))

In [None]:
# Print the shared words, one per line.
for word in common_words:
    print(word)

### 7.5 Set of terms/words that occur in the sample but not in the reference corpus.

TO BE EXAMINED: This specific set needs to be incorporated. In fact, it may capture the specificity of the content to a great extent. We need to assign a mapping score to each word in this set.

In [None]:
# Words that occur in the input corpus but not in the reference corpus, mapped
# to their input counts. The set difference is sorted so that the printed list
# and the dict's insertion order are the same on every run (set iteration order
# for strings is not stable across kernel restarts).
specific_words = sorted(input_freq_dist.keys() - english_freq_dist.keys())
input_specifics = {w: input_freq_dist[w] for w in specific_words}
for w in specific_words:
    print(w)

In [None]:
# Number of words that occur only in the input corpus.
print(len(input_specifics))

## 8. Stemming (in case needed) 

In [None]:
from nltk.stem.porter import PorterStemmer

# Show every word whose Porter stem differs from the word itself; the
# distribution is only inspected here, not modified.
stemmer = PorterStemmer()
for word in input_freq_dist:
    stemmed = stemmer.stem(word)
    if stemmed != word:
        print(word, "->", stemmed)

## 9. Computing representation power of common words.

In [None]:
# For every shared word longer than one character, store a pair of
# (score, input count). The score is the natural log of the ratio between the
# word's relative frequency in the input corpus and its relative frequency in
# the reference corpus.
makerness = {
    w: (
        log((input_freq_dist[w] / n_input) / (english_freq_dist[w] / n_english)),
        input_freq_dist[w],
    )
    for w in common_words
    if len(w) > 1
}

In [None]:
# Sorting by scores:
for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True): print(v[0],k,v[1])

In [None]:
# Export the ranking as CSV rows of (word, score, input count), best score first.
OUTPUT_FOLDER = "./output/"
# Create the folder if needed so the export also works on a fresh checkout.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
csvfile_name = OUTPUT_FOLDER + "makerness_" + output_fname + ".csv"
# newline='' is what the csv module expects for files it writes (otherwise extra
# blank rows can appear on some platforms); UTF-8 keeps non-ASCII words writable
# regardless of the platform default encoding.
with open(csvfile_name, 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = csv.writer(csvfile, delimiter=',')
    for k,v in sorted(makerness.items(), key=lambda x:x[1][0], reverse=True):
        thewriter.writerow([k,v[0],v[1]])