
# TASK: Create a set containing all tags
 

In [115]:
"""
A generator that yields successive chunks of the dataset file, split on the specified delimiter
"""

def myreadlines(f, delimiter):
  """Yield successive pieces of the text read from `f`, split on `delimiter`.

  The input is consumed in 4096-character reads and buffered, so a delimiter
  that straddles two reads is still recognised.

  Args:
    f: Object with a `read(size)` method that returns `str` and an empty
      string once the input is exhausted (e.g. a file opened in text mode).
    delimiter: Non-empty string separating the pieces. It is not included
      in the yielded values.

  Yields:
    Each piece that precedes a delimiter, followed by whatever remains after
    the last delimiter (possibly the empty string) once the input ends.

  Raises:
    ValueError: If `delimiter` is empty. Raised on first iteration, since
      this is a generator.
  """
  if not delimiter:
    # An empty delimiter matches everywhere and would never make progress.
    raise ValueError("delimiter must be a non-empty string")

  buf = ""
  while True:
    # Every element but the last is a complete piece; the last one may be
    # cut off by the read boundary, so it stays in the buffer.
    *complete, buf = buf.split(delimiter)
    yield from complete

    chunk = f.read(4096)
    if not chunk:
      yield buf
      break
    buf += chunk

In [116]:
# Marker between the fields of a single post (each post is split on it).
category_delimiter = "\n;;;;;\n"
# Marker between consecutive posts in the dataset file.
question_delimiter = "\n|||||\n"

In [117]:
"""
Create a dictionary of tags with key = tag name and value = number of occurrences
"""

curr_tags = ""
tags_occur = {}
# Number of posts that carried a tag field. Initialised here so the cell
# runs on a fresh kernel instead of relying on leftover state.
count = 0

with open('dataset.txt', 'r') as f:
    for post in myreadlines(f, question_delimiter):
        categories = post.split(category_delimiter)
        if len(categories) > 1:
            count += 1
            # The second field holds the space-separated tags of the post.
            curr_tags = categories[1].lower()
            for tag in curr_tags.split(" "):
                tags_occur[tag] = tags_occur.get(tag, 0) + 1

# Trim rare tags: list the distinct occurrence counts in descending order,
# take the count found 90% of the way down that list, and keep only the tags
# that occur strictly more often than that.
tags_count = sorted(set(tags_occur.values()), reverse=True)
if tags_count:  # nothing to trim when no tags were collected
    req_idx = int(0.9 * len(tags_count))
    min_count = tags_count[req_idx]
    tags_occur = {k: v for k, v in tags_occur.items() if v > min_count}

In [118]:
from bs4 import BeautifulSoup
import re

question_body = ""

# Running sums over all evaluated posts; averaged in a later cell.
total_prec = 0.0
total_recall = 0.0
curr_prec = 0.0
curr_recall = 0.0
total_posts = 0

with open('dataset.txt') as f:
    for post in myreadlines(f, question_delimiter):
        categories = post.split(category_delimiter)
        # Both the tag field (index 1) and the body field (index 2) are read
        # below, so a post with fewer than three fields cannot be evaluated.
        if len(categories) < 3:
            continue

        given_tags = set(categories[1].lower().split(" "))  # Ground truth
        question_body = categories[2]

        soup = BeautifulSoup(question_body, 'html.parser')

        # Remove all tags with a class or id containing the word snippet
        # Later use these snippets to predict the programming language ^_^
        for snippet_tag in soup.find_all(attrs={'class': re.compile('snippet')}):
            snippet_tag.decompose()
        for snippet_tag in soup.find_all(attrs={'id': re.compile('snippet')}):
            snippet_tag.decompose()

        # Remove all the <pre> ... </pre> tags
        for extra in soup('pre'):
            extra.extract()

        tokens = soup.get_text().lower().split()

        # Candidate tags: tokens of the body that are known tags, mapped to
        # their occurrence count in the trimmed tags dictionary.
        tmp_tags = {}
        for token in tokens:
            # Remove ['.', '?', ',', '!', ':'] at the end of a token
            if token[-1] in ['.', '?', ',', '!', ':']:
                token = token[:-1]
            if token in tags_occur:
                tmp_tags[token] = tags_occur[token]

        # Keep the (at most) five candidates with the highest counts; slicing
        # already handles the case of five or fewer candidates.
        predicted_tags = set(sorted(tmp_tags, key=tmp_tags.get, reverse=True)[:5])

        common = given_tags.intersection(predicted_tags)
        curr_prec = len(common) / len(predicted_tags) if predicted_tags else 0.0
        curr_recall = len(common) / len(given_tags) if given_tags else 0.0

        total_prec += curr_prec
        total_recall += curr_recall
        total_posts += 1
        
    


# Predicted tags = up to five tokens from the question body that appear in the trimmed tags dictionary, ranked by occurrence count

In [119]:
print('Total precision: ', total_prec)
print('Total recall: ', total_recall)
print('Total posts: ', total_posts)
# Averages are per evaluated post; report 0.0 instead of dividing by zero
# when no post was evaluated.
avg_prec = (total_prec / total_posts) if total_posts else 0.0
avg_recall = (total_recall / total_posts) if total_posts else 0.0
print('Average precision: ', avg_prec)
print('Average recall: ', avg_recall)

Total precision:  2721.9833333333368
Total recall:  2292.0166666666655
Total posts:  7664
Average precision:  0.3551648399443289
Average recall:  0.29906271746694485
