 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Parse-RDF" data-toc-modified-id="Parse-RDF-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Parse RDF</a></span></li><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Helper-functions" data-toc-modified-id="Helper-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Helper functions</a></span></li><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Start-getting-information" data-toc-modified-id="Start-getting-information-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Start getting information</a></span><ul class="toc-item"><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Get-concepts" data-toc-modified-id="Get-concepts-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Get concepts</a></span></li><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Things-being-used-in/related-to-ML" data-toc-modified-id="Things-being-used-in/related-to-ML-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Things being used in/related to ML</a></span></li><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Concepts-within-all-the-pages-within-machine-learning" data-toc-modified-id="Concepts-within-all-the-pages-within-machine-learning-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Concepts within all the pages within machine learning</a></span></li></ul></li><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Final-structure" data-toc-modified-id="Final-structure-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Final structure</a></span><ul class="toc-item"><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Get-concept-counts" 
data-toc-modified-id="Get-concept-counts-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Get concept counts</a></span></li><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Filter-concepts" data-toc-modified-id="Filter-concepts-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Filter concepts</a></span></li><li><span><a href="http://localhost:8889/notebooks/Machine%20Learning%20Subject.ipynb#Construct-final-structure" data-toc-modified-id="Construct-final-structure-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Construct final structure</a></span></li></ul></li></ul></div>

In [83]:
import re
from bz2 import BZ2File as bzopen
from tqdm import tqdm
import pickle
import json
import operator
from collections import Counter

## Parse RDF

In [1]:
# Two example DBpedia statements, kept as a reference for the line formats:
#   1. a plain triple: <subject> <predicate> <object> .
#   2. a statement with a language-tagged literal object ("..."@en) followed by
#      an extra Wikipedia URL before the terminating " ."
# No later cell visible in this notebook reads `sample_lines`.
sample_lines = [
    '<http://dbpedia.org/resource/Timeline_of_machine_learning> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Machine_learning> .\n',
    '<http://dbpedia.org/resource/Category:World_War_II> <http://www.w3.org/2000/01/rdf-schema#label> "World War II"@en <http://en.wikipedia.org/wiki/Category:World_War_II?oldid=729127213> .\n'
]

## Helper functions

In [41]:
def search_file(filename, search_text):
    """Lazily yield the lines of a bz2-compressed file that contain `search_text`.

    The file is read line by line (with a tqdm progress bar), each raw line is
    decoded as UTF-8, and a plain substring test decides whether it is yielded.
    Yielded lines are returned exactly as decoded, i.e. nothing is stripped.

    Args:
        filename: Path of the .bz2 file to scan.
        search_text: Substring a line must contain to be yielded.

    Yields:
        str: Each matching line, in file order.
    """
    with bzopen(filename, 'r') as compressed:
        for raw_line in tqdm(compressed):
            text = raw_line.decode("utf-8")
            if search_text not in text:
                continue
            yield text

## Start getting information

In [2]:
# Category name in URI form (underscores instead of spaces, without the
# "Category:" prefix); get_concepts() formats it into 'Category:{}>'.
topic = 'Machine_learning'

### Get concepts

In [46]:
# Scratch check: print the regex-escaped form of the predicate URI; the same
# escaping style appears in the regex constants of the next cell.
print(re.escape('<http://purl.org/dc/terms/subject>'))

\<http\:\/\/purl\.org\/dc\/terms\/subject\>


In [47]:
# Resource URI at the very start of the line; group 1 is the resource name.
PARSE_CONCEPT_REGEX = r'^\<http\:\/\/dbpedia\.org\/resource\/([^>]+)\>'
# First purl.org/dc/terms URI anywhere on the line; group 1 is the term name.
PARSE_CONCEPT_TYPE_REGEX = r'\<http\:\/\/purl\.org\/dc\/terms\/([^>]+)\>'

def parse_concept(line):
    """Extract ``(concept, concept_type)`` from one statement line.

    ``concept`` is the resource name of the URI the line starts with, with
    underscores turned into spaces. ``concept_type`` is the last path segment
    of the first ``purl.org/dc/terms`` URI on the line (e.g. ``'subject'``).

    Args:
        line: One line of text such as
            ``'<http://dbpedia.org/resource/X_y> <http://purl.org/dc/terms/subject> ... .'``.

    Returns:
        tuple[str, str]: ``(concept, concept_type)``.

    Raises:
        IndexError: If either URI is missing from the line. The type is the
            one the previous ``re.findall(...)[0]`` implementation raised; the
            message now quotes the offending line.
    """
    # re.search stops at the first hit instead of collecting every match.
    concept_match = re.search(PARSE_CONCEPT_REGEX, line)
    type_match = re.search(PARSE_CONCEPT_TYPE_REGEX, line)
    if concept_match is None or type_match is None:
        raise IndexError(
            'cannot parse concept from line: {!r}'.format(line))

    concept = concept_match.group(1).replace('_', ' ')
    concept_type = type_match.group(1)

    return (concept, concept_type)

parse_concept('<http://dbpedia.org/resource/Glossary_of_artificial_intelligence> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Machine_learning> .\n')

('Glossary of artificial intelligence', 'subject')

In [48]:
def get_concepts(topic):
    """Collect the parsed concepts of every dump line that mentions a category.

    Scans ``../article_categories_en.ttl.bz2`` via ``search_file`` for lines
    containing ``'Category:<topic>>'`` and runs ``parse_concept`` on each hit.

    Args:
        topic: Category name in URI form (e.g. ``'Machine_learning'``).

    Returns:
        list: One ``parse_concept`` result per matching line, in file order.
    """
    needle = 'Category:{}>'.format(topic)
    matching_lines = search_file('../article_categories_en.ttl.bz2', needle)
    return [parse_concept(hit) for hit in matching_lines]

In [49]:
# Full pass over the article-categories dump. The recorded run iterated
# 23,990,516 lines in roughly three minutes (see the progress output below).
topic_concepts = get_concepts(topic)

23990516it [03:02, 131245.09it/s]


In [50]:
topic_concepts

[('Supervised learning', 'subject'),
 ('Pattern recognition', 'subject'),
 ('Overfitting', 'subject'),
 ('Inductive bias', 'subject'),
 ('Binary classification', 'subject'),
 ('Early stopping', 'subject'),
 ('Machine learning', 'subject'),
 ('Unsupervised learning', 'subject'),
 ('Formal concept analysis', 'subject'),
 ('Computational learning theory', 'subject'),
 ("Solomonoff's theory of inductive inference", 'subject'),
 ('Cross-validation (statistics)', 'subject'),
 ('Instantaneously trained neural networks', 'subject'),
 ('Linear separability', 'subject'),
 ('Dimensionality reduction', 'subject'),
 ('Curse of dimensionality', 'subject'),
 ('Confusion matrix', 'subject'),
 ('Mixture model', 'subject'),
 ('Relational data mining', 'subject'),
 ('Multi-task learning', 'subject'),
 ('Transduction (machine learning)', 'subject'),
 ('Savi Technology', 'subject'),
 ('Granular computing', 'subject'),
 ('Statistical learning theory', 'subject'),
 ('Bongard problem', 'subject'),
 ('Generati

In [55]:
# Persist the (concept, concept_type) pairs as pretty-printed JSON.
# JSON has no tuple type, so each pair is written as a two-element array.
with open('machine-learning-topics.json', 'wt') as f:
    json.dump(topic_concepts, f, indent=4)

### Things being used in/related to ML

### Concepts within all the pages within machine learning

In [58]:
def get_search_concepts(concepts):
    """Map a URI-style search key to each concept.

    The key is the concept name (element 0) with spaces replaced by
    underscores and a trailing ``'>'`` appended, e.g. ``'Ball tree'`` becomes
    ``'Ball_tree>'``. If two concepts produce the same key, the later one wins.

    NOTE(review): the key is only anchored on the right by ``'>'``. Used as a
    substring needle it also matches any URI that merely *ends* with the
    concept name — confirm whether that is intended.

    Args:
        concepts: Iterable of sequences whose first element is the name.

    Returns:
        dict: ``{search_key: concept}`` in first-seen key order.
    """
    return {
        entry[0].replace(' ', '_') + '>': entry
        for entry in concepts
    }

In [59]:
# Build the search-key -> concept lookup from the topic's concepts.
search_concepts = get_search_concepts(topic_concepts)

search_concepts

{'AIXI>': ('AIXI', 'subject'),
 'Accuracy_paradox>': ('Accuracy paradox', 'subject'),
 'Action_model_learning>': ('Action model learning', 'subject'),
 'Active_learning_(machine_learning)>': ('Active learning (machine learning)',
  'subject'),
 'Adversarial_machine_learning>': ('Adversarial machine learning', 'subject'),
 'Algorithm_Selection>': ('Algorithm Selection', 'subject'),
 'Algorithmic_inference>': ('Algorithmic inference', 'subject'),
 'AlphaGo>': ('AlphaGo', 'subject'),
 'Apprenticeship_learning>': ('Apprenticeship learning', 'subject'),
 'Bag-of-words_model>': ('Bag-of-words model', 'subject'),
 'Ball_tree>': ('Ball tree', 'subject'),
 'Base_rate>': ('Base rate', 'subject'),
 'Bayesian_interpretation_of_kernel_regularization>': ('Bayesian interpretation of kernel regularization',
  'subject'),
 'Bayesian_optimization>': ('Bayesian optimization', 'subject'),
 'Bayesian_structural_time_series>': ('Bayesian structural time series',
  'subject'),
 'Bias–variance_tradeoff>': ('B

In [61]:
def get_lines_with_concepts(search_concepts, filename='../page_links_en.ttl.bz2'):
    """Return every line of a bz2 dump that contains at least one search key.

    Each line is decoded as UTF-8 and tested with a plain substring check
    against every key; a line is collected once, however many keys it contains.

    Args:
        search_concepts: Mapping whose keys are the substrings to look for
            (as built by ``get_search_concepts``). Any iterable of strings
            works as well, since only iteration over it is used.
        filename: Path of the .bz2 file to scan. Defaults to the page-links
            dump this notebook was written against.

    Returns:
        list[str]: Matching lines in file order, exactly as decoded.
    """
    # Snapshot the needles once; they do not change while the file is scanned.
    needles = tuple(search_concepts)

    lines = []
    with bzopen(filename, 'r') as file:
        for line in tqdm(file):
            line = line.decode('utf-8')
            # Explicit loop with break: stop at the first needle that matches.
            for needle in needles:
                if needle in line:
                    lines.append(line)
                    break

    return lines

In [62]:
# Full pass over the page-links dump. The recorded run iterated 183,605,697
# lines and took about 1h54m (see the progress output below).
concept_lines = get_lines_with_concepts(search_concepts)

183605697it [1:53:52, 26871.83it/s]


In [63]:
len(concept_lines)

9333

In [65]:
concept_lines[0]

'<http://dbpedia.org/resource/Algorithm> <http://dbpedia.org/ontology/wikiPageWikiLink> <http://dbpedia.org/resource/Machine_learning> .\n'

In [67]:
# Scratch check: print the regex-escaped form of a resource URI; the same
# escaping style appears in the CONCEPT_LINE_* regex constants below.
print(re.escape('<http://dbpedia.org/resource/Algorithm>'))

\<http\:\/\/dbpedia\.org\/resource\/Algorithm\>


In [64]:
# Save the matched raw lines to disk as a JSON array of strings. The scan that
# produced them took about 1h54m; no cell visible here reads this file back.
with open('concept_lines.json', 'wt') as f:
    json.dump(concept_lines, f)

In [68]:
# Resource URI at the very start of the line (the linking page).
CONCEPT_LINE_FROM_REGEX = r'^\<http\:\/\/dbpedia\.org\/resource\/([^>]+)\>'
# Resource URI directly followed by " ." (the linked-to page).
CONCEPT_LINE_TO_REGEX = r'\<http\:\/\/dbpedia\.org\/resource\/([^>]+)\> \.'

def parse_concept_line(concept_line):
    """Extract ``(from_page, to_page)`` from one link line.

    ``from_page`` is the resource name of the URI the line starts with;
    ``to_page`` is the resource name of the first resource URI that is
    immediately followed by ``" ."``. Underscores become spaces in both.

    Args:
        concept_line: One line such as
            ``'<http://dbpedia.org/resource/A> <...wikiPageWikiLink> <http://dbpedia.org/resource/B> .'``.

    Returns:
        tuple[str, str]: ``(from_page, to_page)``.

    Raises:
        IndexError: If either URI cannot be found. The type is the one the
            previous ``re.findall(...)[0]`` implementation raised; the message
            now quotes the offending line.
    """
    # re.search stops at the first hit instead of collecting every match.
    from_match = re.search(CONCEPT_LINE_FROM_REGEX, concept_line)
    to_match = re.search(CONCEPT_LINE_TO_REGEX, concept_line)
    if from_match is None or to_match is None:
        raise IndexError(
            'cannot parse page link from line: {!r}'.format(concept_line))

    from_page = from_match.group(1).replace('_', ' ')
    to_page = to_match.group(1).replace('_', ' ')

    return (from_page, to_page)
    
parse_concept_line(concept_lines[0])

('Algorithm', 'Machine learning')

In [69]:
# Parse every matched line into a (from_page, to_page) pair, keeping file order.
concept_links = [parse_concept_line(raw_line) for raw_line in concept_lines]

In [70]:
concept_links

[('Algorithm', 'Machine learning'),
 ('Artificial intelligence', 'Machine learning'),
 ('Artificial intelligence', 'AlphaGo'),
 ('Artificial intelligence', 'Unsupervised learning'),
 ('Artificial intelligence', 'Supervised learning'),
 ('Artificial intelligence', 'Statistical classification'),
 ('Artificial intelligence', 'Computational learning theory'),
 ('Artificial intelligence', 'Developmental robotics'),
 ('Artificial intelligence', 'Pattern recognition'),
 ('Artificial intelligence', 'Vanishing gradient problem'),
 ('Artificial intelligence', 'Deeplearning4j'),
 ('Artificial intelligence', 'Google DeepMind'),
 ('Artificial intelligence', 'Glossary of artificial intelligence'),
 ('Artificial intelligence', 'AIXI'),
 ('Artificial intelligence', 'Machine Learning (journal)'),
 ('Kolmogorov complexity', "Solomonoff's theory of inductive inference"),
 ('Kolmogorov complexity', 'Grammar induction'),
 ('Aesthetics', 'Machine learning'),
 ('List of artificial intelligence projects', 'AI

In [79]:
def get_related_concepts(concept):
    """List the pages `concept` links to that are not themselves topic concepts.

    Reads the module-level ``concept_links`` (``(from_page, to_page)`` pairs)
    and ``topic_concepts`` (sequences whose first element is a concept name).

    Args:
        concept: Page name to look up as the ``from_page`` of a link.

    Returns:
        list: ``to_page`` of every link whose ``from_page`` equals `concept`
        and whose ``to_page`` is not a topic-concept name, in link order
        (duplicates are kept).
    """
    # A set gives O(1) membership tests; the previous list was scanned
    # linearly once per link.
    existing_concepts = set(map(operator.itemgetter(0), topic_concepts))

    return [
        link[1]
        for link in concept_links
        if link[0] == concept and link[1] not in existing_concepts
    ]

get_related_concepts('Document classification')

['Library science',
 'Information science',
 'Computer science',
 'Document',
 'Class (philosophy)',
 'Categorization',
 'Algorithmically',
 'Subject (documents)',
 'Subject indexing',
 'Frederick Wilfrid Lancaster',
 'Thesaurus',
 'Controlled vocabulary',
 'Document clustering',
 'Expectation maximization',
 'Naive Bayes classifier',
 'Tf–idf',
 'Latent semantic indexing',
 'Support vector machines',
 'Artificial neural network',
 'K-nearest neighbor algorithm',
 'Decision tree learning',
 'ID3 algorithm',
 'C4.5 algorithm',
 'Concept Mining',
 'Rough set',
 'Soft set',
 'Natural language processing',
 'Spam filter',
 'E-mail spam',
 'Routing',
 'Language identification',
 'Readability',
 'Text simplification',
 'Sentiment analysis',
 'Classification (disambiguation)',
 'Compound term processing',
 'Concept-based image indexing',
 'Content-based image retrieval',
 'Document retrieval',
 'Information retrieval',
 'Knowledge organization',
 'Knowledge Organization System',
 'Library cla

In [82]:
get_related_concepts('Text mining')

[]

## Final structure

1. Ignore "Machine learning" keyword everywhere
1. Ignore keywords starting with "Category:"
1. Topic concepts are the higher level concepts
    1. For each higher level concept, there are linked concepts
    1. A linked concept can be present in the higher level concepts, or be absent
    1. Give preference to absent ones for more visibility
    1. The count of subtopics is useful. Should be stored for each topic concept
1. Count of links should be stored for each subtopic and topic. Since count of links is from topic to subtopic, it's not the "real" count

### Get concept counts

For each concept mentioned, get the counts of other concepts linking to it

In [84]:
# For every topic concept, look up its related (non-topic) pages and tally how
# many times each related page shows up across all topic concepts.
f_concept_counts = Counter(
    related_page
    for topic_entry in topic_concepts
    for related_page in get_related_concepts(topic_entry[0])
)

In [85]:
f_concept_counts.most_common()

[('Category:Machine learning', 184),
 ('Statistics', 23),
 ('Regularization (mathematics)', 22),
 ('Artificial intelligence', 22),
 ('Support vector machine', 18),
 ('Feature selection', 18),
 ('Reinforcement learning', 18),
 ('Artificial neural network', 17),
 ('Algorithm', 17),
 ('Cluster analysis', 16),
 ('Data mining', 15),
 ('Regression analysis', 15),
 ('Natural language processing', 15),
 ('Computer vision', 14),
 ('Neural network', 14),
 ('Deep learning', 14),
 ('Logistic regression', 13),
 ('Linear regression', 12),
 ('Information retrieval', 12),
 ('Probability distribution', 12),
 ('Principal component analysis', 12),
 ('Computer science', 11),
 ('Category:Artificial intelligence', 11),
 ('Google', 11),
 ('K-nearest neighbor algorithm', 10),
 ('Boosting (meta-algorithm)', 10),
 ('Inductive logic programming', 10),
 ('Classification (machine learning)', 10),
 ('Support vector machines', 10),
 ('Training set', 9),
 ('Loss function', 9),
 ('Speech recognition', 9),
 ('Probabili

### Filter concepts

Filter out concepts that start with "Category:", as well as the "Machine learning" concept itself

In [88]:
def filter_concept(concept):
    """Tell whether `concept` should be kept in the final structure.

    A concept is dropped when its name starts with ``'Category:'`` or when it
    is exactly ``'Machine learning'``; everything else is kept.

    Args:
        concept: Concept name (str).

    Returns:
        bool: ``True`` to keep the concept, ``False`` to drop it.
    """
    return not (concept.startswith('Category:') or concept == 'Machine learning')

print(filter_concept('Hello'))
print(filter_concept('Category:Hello'))

True
False


### Construct final structure

In [94]:
# Score multiplier applied to a related concept that is NOT one of the
# top-level concepts, so that such concepts sort ahead of the others.
ABSENT_CONCEPT_BOOST = 10

# f_concepts is a list of {'label': <name>, 'concepts': [<related>, ...]} dicts;
# f_concepts_cache holds the labels of those dicts in the same order.
f_concepts = []
f_concepts_cache = []

# Copy over everything from top-level concepts, filtering them
for data in topic_concepts:
    concept = data[0]
    if not filter_concept(concept):
        continue

    f_concepts.append({'label': concept, 'concepts': []})
    f_concepts_cache.append(concept)

# Set copy of the labels: the membership test below runs once per related
# concept, and a list would be scanned linearly every time.
top_level_labels = set(f_concepts_cache)

# For each higher level concept, get more related concepts
# and counts
for concept in f_concepts:
    label = concept['label']

    # Get related concepts and drop the ones filter_concept rejects
    related_concepts = get_related_concepts(label)
    related_concepts = list(filter(filter_concept, related_concepts))

    # For each related concept, append it with its (possibly boosted) count.
    # NOTE(review): get_related_concepts, as written in this notebook, already
    # excludes every topic concept, so the un-boosted case (multiplier 1)
    # is never reached and every count ends up multiplied by
    # ABSENT_CONCEPT_BOOST. Confirm whether that is intended.
    for related_concept in related_concepts:
        multiplier = 1
        if related_concept not in top_level_labels:
            multiplier = ABSENT_CONCEPT_BOOST

        concept['concepts'].append({'label': related_concept,
                                    'linked_to_count': f_concept_counts[related_concept] * multiplier})

    # Sort the related concepts by linked_to_count in descending order
    # (the sort is stable, so ties keep their link order)
    concept['concepts'].sort(key=operator.itemgetter('linked_to_count'), reverse=True)

In [95]:
f_concepts

[{'concepts': [{'label': 'Regularization (mathematics)',
    'linked_to_count': 220},
   {'label': 'Support vector machine', 'linked_to_count': 180},
   {'label': 'Feature selection', 'linked_to_count': 180},
   {'label': 'Artificial neural network', 'linked_to_count': 170},
   {'label': 'Computer vision', 'linked_to_count': 140},
   {'label': 'Logistic regression', 'linked_to_count': 130},
   {'label': 'Linear regression', 'linked_to_count': 120},
   {'label': 'Information retrieval', 'linked_to_count': 120},
   {'label': 'K-nearest neighbor algorithm', 'linked_to_count': 100},
   {'label': 'Boosting (meta-algorithm)', 'linked_to_count': 100},
   {'label': 'Inductive logic programming', 'linked_to_count': 100},
   {'label': 'Training set', 'linked_to_count': 90},
   {'label': 'Loss function', 'linked_to_count': 90},
   {'label': 'Speech recognition', 'linked_to_count': 90},
   {'label': 'Linear discriminant analysis', 'linked_to_count': 80},
   {'label': 'Decision tree learning', 'lin

In [96]:
# Write the final structure (a list of {'label', 'concepts'} dicts) to disk as
# compact, single-line JSON.
with open('machine-learning-full.json', 'wt') as f:
    json.dump(f_concepts, f)