# Create a function to categorise posts based on tags

* Required to categorise the posts into less granular topics based on the tags related to each post
* Assign a category to posts without any tags


# Import stopwords

In [1]:
import nltk
from nltk.corpus import stopwords
# download the NLTK 'stopwords' corpus so that stopwords.words() can load it
nltk.download('stopwords')
# English stopwords held in a set; used further down to filter topic terms
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dbutler/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# 1 - Parse core java topics
* site: http://java.meritcampus.com/core-java-topics
* extracted the terms from the above website into a file
* the code below builds a dictionary that maps each core topic to the unique set of key terms related to that topic

In [2]:
# Parse ./data/java_topics.txt into java_topic_dict.
# The file is read as blocks separated by blank lines: the first line of a
# block is the topic heading, every following line contributes key terms.

# True while the next non-blank line should be read as a topic heading
new_topic = True
# maps each topic heading to the set of key terms listed beneath it
java_topic_dict = {}
# heading of the topic block currently being read
java_topic = ''
# key terms collected so far for the current topic
tmp_list = []

with open('./data/java_topics.txt') as file:
    for line in file:
        # a blank (or whitespace-only) line closes the current topic block
        if not line.strip():
            # only store a topic whose heading has actually been read, so that
            # leading or repeated blank lines can neither create an empty-named
            # topic nor overwrite the previous topic's terms with an empty set
            if not new_topic:
                java_topic_dict[java_topic] = set(tmp_list)
                tmp_list = []
                new_topic = True
            continue

        # the first token of every line is dropped
        # (presumably a numbering/bullet prefix - confirm against the data file)
        words = line.split()[1:]

        if new_topic:
            #assign the current line to be the java topic
            java_topic = ' '.join(words)
            new_topic = False
            continue

        #split each line into individual words and append
        for el in words:
            # normalise first so that capitalised stopwords ("The", "And")
            # are filtered out as well
            term = el.lower().replace('-', '')
            #stopword and single-character removal
            if len(term) > 1 and term not in stopWords:
                tmp_list.append(term)

# store the final topic when the file does not end with a blank line
if not new_topic:
    java_topic_dict[java_topic] = set(tmp_list)
        

In [3]:
# show the topic headings that were parsed
java_topic_dict.keys()

dict_keys(['Datatypes', 'Variables', 'Operators', 'Control Statements', 'Methods', 'Arrays', 'Classes', 'Inheritance', 'Methods Overiding, Overloading', 'Abstract Class And Methods', 'Interfaces, Packages and Access Control', 'final, static and others', 'Exceptions', 'Multithreaded Programming', 'Generics', 'Strings', 'java.lang', 'Collections Framework', 'Utility Classes', 'Input/Output', 'The Applet Class', 'Swing', 'Servlets'])

## Create a list of the topic keys 

In [4]:
# topic headings as a list, in the dictionary's iteration order
java_topic_keys = list(java_topic_dict)

# 2 - eliminate frequently occurring sub terms across various topics
* for example, the word "class" appears in a wide variety of topics

The code below removes from each topic every term that also occurs in another topic, retaining only the terms unique to that topic (the lower-cased topic name is then added as a term)

In [5]:
# refined topic terms: topic name -> list of terms that occur in no other topic
new_java_topic_dict = {}

for topic in java_topic_keys:
    # pool the terms of every *other* topic; this is the stop list for `topic`
    other_terms = set()
    for other in java_topic_keys:
        if other != topic:
            other_terms.update(java_topic_dict[other])

    # keep only the terms that none of the other topics mention
    unique_terms = [term for term in java_topic_dict[topic] if term not in other_terms]

    # the lower-cased topic name itself is always included as a term
    unique_terms.append(topic.lower())
    new_java_topic_dict[topic] = unique_terms

# 3 - Attempt to categorise posts in the dataset based on their tag values

In [6]:
import pandas as pd
# load the posts CSV into a DataFrame
df = pd.read_csv('./data/filtered_cleaned_posts_no_frameworks_no_alt_lang.csv')

In [7]:
# (rows, columns) of the loaded dataset
df.shape

(4212, 5)

In [8]:
# word2vec turns a word into a character-frequency vector
def word2vec(word):
    """Describe *word* by how often each character occurs in it.

    Returns a 3-tuple:
        * a Counter mapping each character to its number of occurrences
        * the set of distinct characters in the word
        * the Euclidean length of the count vector
          (square root of the sum of the squared counts)
    """
    from collections import Counter
    from math import sqrt

    char_counts = Counter(word)
    distinct_chars = set(char_counts)

    # accumulate the squared counts, then take the root to get the length
    squared_total = 0
    for occurrences in char_counts.values():
        squared_total += occurrences * occurrences
    vector_length = sqrt(squared_total)

    return char_counts, distinct_chars, vector_length

# calculates the cosine similarity between 2 vectors i.e. words
def cosdis(v1, v2):
    """Return the cosine similarity of two character-count vectors.

    Args:
        v1, v2: tuples shaped like the return value of word2vec, i.e.
            (character counts, set of distinct characters, vector length).

    Returns:
        The dot product of the two count vectors divided by both vector
        lengths. 0.0 is returned when either vector has zero length (for
        example one built from an empty string), because the similarity is
        undefined there and dividing would raise ZeroDivisionError.
    """
    # an empty word has length 0: report "no similarity" instead of dividing by zero
    if not v1[2] or not v2[2]:
        return 0.0
    # only characters present in both words contribute to the dot product
    common = v1[1].intersection(v2[1])
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]

# 4 - function to categorise a post 

In [9]:
def assign_java_topic(tag_list):
    '''
    Description:
        takes an argument of a list of tags and checks two things:

        1)  checks to see if any of the tags are related directly to a core topic e.g. generics, classes, exceptions
            if the cosine similarity between a tag and a (lower-cased) topic name is greater than 95%
            then that topic is returned straight away together with that similarity.

        2)  If there is no match over 95% for the keys, the tags are then compared to each of the terms in all
            of the topics. For every topic, each tag is scored against all of the topic's terms and the highest
            similarity is kept for that tag. The per-tag maxima are summed into a topic score and the topic with
            the highest score is returned, with the score divided by the number of tags.

    args: tag_list
        list of tags related to a given post

    returns: (topic_name, score)
        topic_name is '' and score is 0.0 when tag_list is empty or nothing scores above zero
    '''
    # nothing to score; also avoids dividing by len(tag_list) == 0 at the end
    if not tag_list:
        return '', 0.0

    # vectorise every tag once (hyphens removed) instead of once per topic.
    # a tag that is empty after hyphen removal has a zero-length vector and
    # cannot be compared, so it is left out here; it still counts in the
    # final len(tag_list) average, i.e. it contributes a score of 0
    tag_vectors = []
    for tag in tag_list:
        va = word2vec(tag.replace('-', ''))
        if va[2]:
            tag_vectors.append(va)

    #first check if it contains a core topic tag
    key_vectors = [(jkey, word2vec(jkey.lower())) for jkey in java_topic_keys]
    for va in tag_vectors:
        for jkey, vb in key_vectors:
            similarity = cosdis(va, vb)
            #if the cosine similarity is greater than 95% return that key as the category
            if similarity > 0.95:
                return jkey, similarity

    #stores the most likely topic
    max_topic_name = ''
    #holds the max topic score for the set of tags
    max_topic_score = 0

    for key in java_topic_keys:
        # vectors of every term of the current topic, built once per topic
        term_vectors = [word2vec(el) for el in new_java_topic_dict[key]]
        #stores the score of the current topic
        topic_score = 0
        for va in tag_vectors:
            # best similarity of this tag against any term of the topic
            max_tag_score = 0
            for vb in term_vectors:
                score = cosdis(va, vb)
                if score > max_tag_score:
                    max_tag_score = score
            #increment the overall topic score using the max tag score
            topic_score += max_tag_score

        #check if this is the highest scoring topic score (first topic wins ties)
        if topic_score > max_topic_score:
            max_topic_score = topic_score
            max_topic_name = key

    #returns the highest scoring topic along with the score averaged over the tags
    return max_topic_name, max_topic_score/len(tag_list)

# 5 - Test the function against some posts

In [10]:
# first 10 posts, used as a small sample for trying out the categoriser
test = df.head(10)

In [11]:
# categorise each sample post from its tags and print tags, topic and score
for tags_text in test['Tags']:
    tags = tags_text.split()
    topic_name, topic_score = assign_java_topic(tags)
    print(tags)
    print(topic_name)
    print(topic_score)
    print('\n')

['exception', 'mocking', 'try-catch']
Exceptions
0.9574271077563381


['generics', 'syntax']
Generics
0.9999999999999999


['-ee', 'jvm', 'out-of-memory', 'heap-memory']
Classes
0.7247445514768732


['string', 'random', 'alphanumeric']
Strings
0.9525793444156805


['class', 'clone']
Classes
0.8590265111456049


['generics']
Generics
0.9999999999999999


['interface', 'static']
Operators
0.8423850737154739


['string']
Strings
0.9525793444156805


['multithreading', 'memory-leaks']
Multithreaded Programming
0.7841429308197335


['arrays', 'data-structures', 'data-manipulation']
Arrays
0.9999999999999999




# 6 - Assign topic to posts with no tags using the title
* Using the title - use textblob and POS tagging to extract the words tagged as singular nouns (NN), adjectives (JJ) or base-form verbs (VB) from a title
* append these to a list and pass them to the assign_java_topic function

In [12]:
def extract_title_keywords(title):
    """Return the words of *title* whose TextBlob POS tag is 'NN', 'JJ' or 'VB'.

    The title is tagged with TextBlob and the words carrying one of those
    three tags are returned as a list, in the order they appear in the title.
    """
    from textblob import TextBlob

    wanted_tags = ('NN', 'JJ', 'VB')
    return [word for word, pos in TextBlob(title).tags if pos in wanted_tags]

In [14]:
# sample titles used to try out the title-based categorisation
s_test = [
    "How can I make this java generic cast?",
    "How to convert nanoseconds to seconds using the TimeUnit enum?",
    "How can I sort the keys of a Map in Java?",
    "How does Java convert int into byte?",
    "how to convert byte array to string and vice versa ",
    "How do I make a Class extend Observable when it has extended another class too?",
    "How to check if an IP address is the local host on a multi-homed system?",
]
for title in s_test:
    res = extract_title_keywords(title)
    topic_name, topic_score = assign_java_topic(res)
    # one value per line, followed by the same blank-line separator as print('\n')
    print(res, topic_name, topic_score, '\n', sep='\n')

['make', 'java', 'generic', 'cast']
Collections Framework
0.7846088201832161


['convert', 'enum']
Collections Framework
0.8795767165529425


['sort', 'keys']
java.lang
0.7618016810571369


['convert', 'int', 'byte']
java.lang
0.8273206885909709


['convert', 'byte', 'array', 'string', 'vice', 'versa']
Strings
0.9525793444156805


['make', 'extend', 'class']
Control Statements
0.7672497797561304


['check', 'address', 'local', 'host', 'multi-homed', 'system']
java.lang
0.7322116450659335




# Assign topics to each post in the dataframe 
* if the post has tags available use the tags
* otherwise, if no tags are available, use the title

In [79]:
# numeric code for every topic: 0, 1, 2, ... in the order of java_topic_keys
java_topic_codes = {key: code for code, key in enumerate(java_topic_keys)}
# the default 'other' category takes the next free code
count = len(java_topic_keys)
java_topic_codes['other'] = count

In [80]:
# show the topic -> code mapping
java_topic_codes

{'Abstract Class And Methods': 9,
 'Arrays': 5,
 'Classes': 6,
 'Collections Framework': 17,
 'Control Statements': 3,
 'Datatypes': 0,
 'Exceptions': 12,
 'Generics': 14,
 'Inheritance': 7,
 'Input/Output': 19,
 'Interfaces, Packages and Access Control': 10,
 'Methods': 4,
 'Methods Overiding, Overloading': 8,
 'Multithreaded Programming': 13,
 'Operators': 2,
 'Servlets': 22,
 'Strings': 15,
 'Swing': 21,
 'The Applet Class': 20,
 'Utility Classes': 18,
 'Variables': 1,
 'final, static and others': 11,
 'java.lang': 16,
 'other': 23}

In [102]:
# check if tags column empty
# NOTE(review): this counts the rows whose Tags value is exactly "  -7"
# (two spaces followed by -7), not empty values - confirm which value was meant
df.Tags[df.Tags == "  -7"].count()

0

In [83]:
# Tags values (after stripping whitespace) that carry no usable tag.
# NOTE(review): "-7"/"-8" look like remnants of java-7/java-8 tags - confirm
_NO_TAG_VALUES = {"", "-", "-7", "-8"}


def _topic_code_for_post(tags, title):
    """Return the java_topic_codes value for a single post.

    The tags are used when they contain something usable; otherwise the
    keywords extracted from the title are used. When there is nothing to
    categorise with, or no topic matches, the code of 'other' is returned,
    so both paths behave the same and never look up an unknown topic name.
    """
    # explicit equality against the known "no tag" values (instead of a
    # substring test), tolerant of surrounding whitespace and non-string cells
    if isinstance(tags, str) and tags.strip() not in _NO_TAG_VALUES:
        keywords = tags.split()
    elif isinstance(title, str):
        #if tags are not present use the keywords from the title of the post
        keywords = extract_title_keywords(title)
    else:
        keywords = []

    topic_name = assign_java_topic(keywords)[0] if keywords else ''
    # '' means no topic was found; fall back to the default category
    return java_topic_codes[topic_name or 'other']


#initialise the topic column
df["Topic"] = None
for index, row in df.iterrows():
    df.loc[index, "Topic"] = _topic_code_for_post(row["Tags"], row["Title"])

In [107]:
# inspect the posts assigned topic code 4 ('Methods' in the java_topic_codes output above)
df[df.Topic == 4]

Unnamed: 0,Id,Title,Body,Tags,body,Topic
17,189787,How to format methods with large parameter lists,"<p>I have never seen a way to do this nicely, ...",formatting methods,<p>A large set of parameters like this is ofte...,4
58,21817,Why can't I declare static methods in an inter...,<p>The topic says the most of it - what is the...,interface methods static,<p>There are a few issues at play here. The f...,4
260,576918,How do I intercept a method invocation with st...,<p>I want to intercept all method invocations ...,reflection methods,"<p>As you note, you cannot use JDK dynamic pro...",4
264,610458,Why isn't calling a static method by way of an...,<p>I'm sure you all know the behaviour I mean ...,static methods,<p>Basically I believe the Java designers made...,4
542,2211002,Why not abstract fields?,<p>Why can't Java classes have abstract fields...,methods field abstract,<p>You can do what you described by having a f...,4
554,1411612,How to test for equality of complex object gra...,<p>Say I have a unit test that wants to compar...,unit-testing,<p>What you could do is render each object to ...,4
621,1924253,How to determine by reflection if a Method ret...,<p>I have a <code>java.lang.reflect.Method</co...,reflection methods,<pre><code>if( method.getReturnType().equals(V...,4
671,2315445,How to quickly determine if a method is overri...,<p>There is a possible optimization I could ap...,oop optimization methods override,<p>I wouldn't do this. It violates encapsulati...,4
751,2563791,Why does Java's invokevirtual need to resolve ...,<p>Consider this simple Java class:</p>\n\n<pr...,jvm methods virtual-method,<p>It is all about performance. When by figuri...,4
767,2636660,why no replace() method defined on the Set int...,<p>Currently I have to write the following to ...,,<p>A Set is a data structure made to avoid dup...,4


In [108]:
# save the dataset including the new Topic column; the row index is not written
df.to_csv("./data/java_questions_including_topics.csv", index=False)

In [15]:
# list every topic heading, one per line
for heading in java_topic_keys:
    print(heading)

Datatypes
Variables
Operators
Control Statements
Methods
Arrays
Classes
Inheritance
Methods Overiding, Overloading
Abstract Class And Methods
Interfaces, Packages and Access Control
final, static and others
Exceptions
Multithreaded Programming
Generics
Strings
java.lang
Collections Framework
Utility Classes
Input/Output
The Applet Class
Swing
Servlets
