<center><h1>Stack Overflow Post Tag Prediction</h1></center>

For simplicity, we'll turn this into a binary classification problem by keeping only the rows tagged 'iphone' or 'android'.

In [1]:
import pandas as pd
import numpy as np

# Load the posts and keep only the two tags used for the binary task.
data = pd.read_csv("stack-overflow-data.csv")
# drop=True discards the old row labels directly instead of materialising
# them as an 'index' column that then has to be dropped again.
data = data[data['tags'].isin(['iphone', 'android'])].reset_index(drop=True)

print(data.shape)

(4000, 2)


In [2]:
# Preview the first rows of the filtered frame.
data.head()

Unnamed: 0,post,tags
0,identifying server timeout quickly in iphone ...,iphone
1,distance between 2 or more drop pins i was do...,iphone
2,what are all the restrictions by apple for iph...,iphone
3,not able to clicked on item i have facing ver...,iphone
4,all phone numbers of one contact for startacti...,android


In [4]:
# Count rows per tag to check how balanced the two classes are.
data['tags'].value_counts()

android    2000
iphone     2000
Name: tags, dtype: int64

## Basic Preprocessing (Noise Removal)

In [6]:
import re, string, unicodedata
from bs4 import BeautifulSoup

def strip_html(text):
    """Parse *text* with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from *text*.

    A span starts at a ``[`` and ends at the first ``]`` that follows it;
    text without such a span is returned unchanged.
    """
    # Raw string: '\[' and '\]' are not valid str escape sequences, so a
    # plain literal triggers an invalid-escape warning on modern Python.
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Strip HTML from *text*, then remove any ``[...]`` spans from the result."""
    return remove_between_square_brackets(strip_html(text))

# Overwrite the 'post' column with its denoised text (HTML stripped, then
# [bracketed] spans removed).
data['post'] = data['post'].apply(denoise_text)

## Word Tokenization

In [7]:
from nltk import word_tokenize, sent_tokenize

# Tokenize each post with NLTK; `tokens` is a Series holding one token list per post.
tokens = data['post'].apply(word_tokenize)
tokens.head()

0    [identifying, server, timeout, quickly, in, ip...
1    [distance, between, 2, or, more, drop, pins, i...
2    [what, are, all, the, restrictions, by, apple,...
3    [not, able, to, clicked, on, item, i, have, fa...
4    [all, phone, numbers, of, one, contact, for, s...
Name: post, dtype: object

## Advanced Preprocessing

In [8]:
# !pip install contractions
# !pip install inflect
# !pip install autocorrect
# import contractions
import inflect
from autocorrect import spell
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [9]:
def remove_non_ascii(words):
    """Return the tokens with every non-ASCII character dropped.

    Each token is NFKD-normalised first, so an accented letter keeps its
    ASCII base letter. A token with no ASCII content becomes ''.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token lower-cased."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens left empty.

    Characters matching ``\\w`` (including '_') and whitespace are kept.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

# def replace_contractions(words):
#     """Replace contractions in string of text"""
#     return contractions.fix(words)

def replace_numbers(words):
    """Replace each all-digit token with the text inflect returns for it.

    Tokens for which ``str.isdigit()`` is false are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Return the tokens that are not in NLTK's English stop-word list.

    Matching is exact, so tokens are expected to be lower-cased already.
    """
    # Load the list once per call and keep it as a set. The original
    # re-read it for every token and searched it linearly each time.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def spelling_correction(words):
    """Return the tokens after passing each one through autocorrect's ``spell``."""
    return list(map(spell, words))

def stem_words(words):
    """Return the Lancaster stem of every token, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return every token lemmatized as a verb (``pos='v'``) with WordNet."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

In [10]:
# Normalisation pipeline: every stage maps a token list to a token list and
# the stages run in the order listed.
# NOTE(review): Lancaster stems are often not dictionary words -- confirm
# they still match the vocabulary of the pretrained embeddings used later.
tokens = (
    tokens
    .apply(remove_non_ascii)
    .apply(to_lowercase)
    .apply(remove_punctuation)
    .apply(replace_numbers)
    .apply(remove_stopwords)
    # .apply(spelling_correction)  # Takes time
    .apply(stem_words)
    .apply(lemmatize_verbs)
)

## Data Partitioning

In [None]:
def tokens_to_sentence(words):
    """Join the tokens into a single space-separated string."""
    separator = ' '
    return separator.join(words)

In [12]:
X = tokens
# Encode the label as an integer: iphone -> 1, android -> 0.
# map() builds the integer Series directly. replace() on a string column
# depends on pandas implicitly downcasting object to int, which is deprecated.
# `data` was filtered to these two tags on load, so every row is mapped.
y = data['tags'].map({'iphone': 1, 'android': 0})

## Feature extraction using word2vec Pretrained Model

In [14]:
from gensim.models import KeyedVectors

# Load pretrained vectors stored in the binary word2vec format; the path is
# relative to the notebook's working directory.
wv_model = KeyedVectors.load_word2vec_format('pretrained models/GoogleNews-vectors-negative300.bin', binary=True)

### Strategy: Mean of Word2Vec vectors

In [15]:
def mean_embedding_vectorizer(wv_model, docs):
    """Embed each document as the mean of its tokens' word vectors.

    Parameters
    ----------
    wv_model : keyed-vector model
        Must support ``word in wv_model`` and ``wv_model[word]``, and
        expose ``vector_size``.
    docs : iterable of token lists
        One list of string tokens per document.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``wv_model.vector_size`` per document, in
        input order.
    """
    mean_vecs = []
    for words in docs:
        # `word in wv_model` works on both old and new gensim releases;
        # the `.vocab` attribute is no longer available in gensim 4.
        # Out-of-vocabulary tokens contribute a zero vector, so they still
        # count in the denominator of the mean.
        vecs = [
            wv_model[word] if word in wv_model else np.zeros(wv_model.vector_size)
            for word in words
        ]
        if vecs:
            mean_vecs.append(np.mean(vecs, axis=0))
        else:
            # np.mean([]) is a scalar NaN, which would break the rectangular
            # feature matrix; an empty document gets an all-zero vector.
            mean_vecs.append(np.zeros(wv_model.vector_size))

    return mean_vecs

In [16]:
# Mean-of-word-vectors representation, one entry per post.
mev = mean_embedding_vectorizer(wv_model, tokens)

### Strategy: Mean of Word2Vec vectors with TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

def tfidf_embedding_vectorizer(wv_model, docs):
    """Embed each document as the mean of its IDF-weighted word vectors.

    A ``TfidfVectorizer`` is fitted on the space-joined documents and only
    its per-term IDF values are used as weights. Tokens missing from its
    vocabulary receive the largest IDF seen.

    Parameters
    ----------
    wv_model : keyed-vector model
        Must support ``word in wv_model`` and ``wv_model[word]``, and
        expose ``vector_size``.
    docs : re-iterable collection of token lists
        A pandas Series or a plain list; it is iterated twice.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``wv_model.vector_size`` per document, in
        input order.
    """
    # A plain comprehension accepts any re-iterable, not only a Series.
    tfidf = TfidfVectorizer().fit([' '.join(words) for words in docs])
    # Computed once here rather than on every default lookup.
    max_idf = max(tfidf.idf_)
    word2weight = defaultdict(lambda: max_idf,
                              [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

    tfidf_vecs = []

    for words in docs:
        # `word in wv_model` works on both old and new gensim releases;
        # the `.vocab` attribute is no longer available in gensim 4.
        # Out-of-vocabulary tokens contribute a zero vector.
        vecs = [
            wv_model[word] * word2weight[word] if word in wv_model
            else np.zeros(wv_model.vector_size)
            for word in words
        ]
        if vecs:
            tfidf_vecs.append(np.mean(vecs, axis=0))
        else:
            # np.mean([]) is a scalar NaN; an empty document gets an
            # all-zero vector so every row has the same length.
            tfidf_vecs.append(np.zeros(wv_model.vector_size))

    return tfidf_vecs

In [18]:
# IDF-weighted variant of the document embeddings.
# NOTE(review): the feature frame built below uses `mev`, not `tfidfv` -- confirm this is intended.
tfidfv = tfidf_embedding_vectorizer(wv_model, tokens)

## Feature DataFrame

In [31]:
# Turn the mean embeddings into a frame with columns feat_0, feat_1, ...
# This rebinds X, which previously referred to the token Series.
X = pd.DataFrame(mev).add_prefix('feat_')

In [32]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,-0.054047,-0.006847,0.003345,0.092404,-0.068242,0.055398,-0.007483,-0.065513,-0.023434,-0.040749,...,0.058644,0.006197,-0.115312,0.026849,0.019679,-0.105075,0.007112,0.00179,-0.080108,-0.044161
1,-0.01373,-0.02112,0.025297,0.098415,-0.046692,-0.002546,-0.036435,-0.089058,0.03937,0.037308,...,0.019272,0.089198,-0.0602,0.023799,-0.017523,-0.080523,-0.075071,-0.028831,-0.032314,0.080765
2,-0.125996,0.04747,-0.037542,0.110095,-0.086817,-0.004518,0.0305,0.004625,-0.025803,-0.023903,...,0.055047,0.065123,-0.056684,0.031982,-0.085679,-0.055781,0.018345,-0.061113,-0.04907,0.052026
3,-0.022023,0.044843,-0.044099,0.100669,-0.100849,0.053145,0.070942,-0.030137,0.02996,-0.021439,...,0.074973,0.117831,-0.01144,0.046054,0.051869,-0.052432,-0.003928,-0.084062,-0.052671,0.045854
4,0.046797,-0.029014,-0.044262,0.047656,-0.110386,0.05622,0.062416,-0.082308,0.077633,0.054741,...,0.028023,0.072235,-0.063314,0.008243,-0.023574,-0.004674,-0.012499,-0.085481,-0.019483,-0.014777


## Train-Test Split

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234, test_size=0.20)

## Model Training and Evaluation

In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
model = LogisticRegression(solver='lbfgs', random_state=1234)
model.fit(X_train, y_train.values.ravel())

# Mean accuracy on the held-out split.
model.score(X_test, y_test.values.ravel())

0.84875