This is my submission for the task Tagging System of Questions using Transfer Learning. We use a kind of brute force method, using TF-IDF to predict the best possible tags for a question.


First, load the required libraries. 

In [37]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk # natural language processing
import re # regular expression
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from string import punctuation

stop = set(stopwords.words('english'))

In [38]:
# read in test data sets
test = pd.read_csv("test.csv")
raw = test

First, we perform data cleansing on our dataset

The content column contains some HTML tags. We clean our data to remove these tags, convert all to small case letters and remove stopwords. We also tokenize the data here.

In [41]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]p') 
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]') # take all words that contain characters other than 0-9,a-z,#,+
stopwords = set(stopwords.words('english'))

def parse_content(text):
    """
        text: a string
        
        return: modified initial string
    """
    #text = # lowercase text
    text =text.lower()
    #text = # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub(REPLACE_BY_SPACE_RE, ' ', text)
    #text = # delete symbols which are in BAD_SYMBOLS_RE from text
    text =  re.sub(BAD_SYMBOLS_RE, '', text)
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = re.sub("\d+", "", text)
    word_tokens = word_tokenize(text) 
  
    filtered_sentence = [w for w in word_tokens if not w in stopwords]
    text = filtered_sentence
    return text

In [42]:
# Apply parse_content onto dataframe.
raw['content'] = raw['content'].apply(parse_content)
raw['title'] = raw['title'].apply(parse_content)

def importance_list_row(sparse_row,n_importance):
    importance_list = [0]*n_importance
    for i in range(0,n_importance): 
        ind =  sparse_row.indices[sparse_row.data.argmax(axis=0)] if sparse_row.nnz else 0
        importance_list[i] = pos_to_word[ind]
        sparse_row[0,ind] = 0
    return importance_list


def importance_list(sparse_matrix,n_importance):
    n_row = sparse_matrix.shape[0]
    importance_lists = [0]*n_row
    for row in range(0,n_row):
        importance_lists[row] = importance_list_row(sparse_matrix[row],n_importance)
    return importance_lists   

We do further processing on our text by selecting nouns which match a certain patters, and converting the entire document into tokens of words.
After that, we perform lemmetization of the document.

Now, we combine the title and content fields of input data, as we will be generating TF-IDF matrix using both.

In [45]:
#tokenize and tag texts. 
lemmatizer = nltk.stem.WordNetLemmatizer()
raw['text_token'] = ' '
raw['text_token'] = raw['title'] + raw['content']

In [46]:
raw['text_token'] = raw['text_token'].apply(lambda x:[lemmatizer.lemmatize(t) for t in x])
raw['text_pos'] = raw['text_token'].apply(nltk.pos_tag)
raw['text_nouns'] = raw['text_pos'].apply(lambda x: [pair[0] for pair in x if pair[1] in ("NN","NNS","JJ")])

In [47]:
raw['text_bigram'] = raw['text_pos'].apply(nltk.bigrams)
raw['text_bigram'] = raw['text_bigram'].apply(list)

In [48]:
def findPair(l):
    result = []
    for pair in l:
        if pair[1][1] in ('NN','NNS') and pair[0][1] in ('NN','NNS','JJ'):
            result.append(pair[0][0]+" "+pair[1][0])
    return result
raw['word_pair'] = raw['text_bigram'].apply(findPair)

Now, lets make the actual predictions.

In [49]:
mydoclist = raw['text_nouns'].apply(" ".join).tolist()
count_vectorizer = CountVectorizer(stop_words='english',lowercase=True,analyzer='word',ngram_range=(1,1))
term_freq_matrix = count_vectorizer.fit_transform(mydoclist)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(term_freq_matrix)
tf_idf_matrix = tfidf.transform(term_freq_matrix)
pos_to_word = dict([[v,k] for k,v in count_vectorizer.vocabulary_.items()])

In [50]:
n_importance = 3 #no of tags to be generated
predict = importance_list(tf_idf_matrix,n_importance)
predict_vs_actual = pd.DataFrame({'tags':predict,'id':raw['id']})
predict_vs_actual['tags'] = predict_vs_actual['tags'].apply(" ".join)

In [51]:
predict_vs_actual.to_csv("predicted.csv",index=False)

print(predict_vs_actual[0:100]) # print results for first 100 questions


                                                 tags   id
0                             spin subatomic particle    1
1                                plausible theory non    2
2                            group lie representation    3
3                        determinism overcome laplace    7
4                      hamilton phamiltons stationary    9
5                                     sound clue life   13
6                       theory thinkliliand ollithose   15
7                                sky sunriseset night   17
8                               energy collision hows   19
9                                  monte carlo pwhere   21
10                                    bike wheel turn   24
11                   projectile youmichael vanderpool   26
12                      particle measurement collapse   27
13                                   mph average trip   29
14  hrefhttpenwikipediaorgwikispecial_relativity r...   31
15                       whirlvortex sinkbathtub myth   

This marks the end of this submission. Unfortunately, I was not able to implement transfer learning but, I was able to generate tags for all the questions.