Use LSA to identify related questions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from multiprocessing import Pool
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
from itertools import chain
from collections import Counter
import pickle
import scipy.io as scio
from sklearn.decomposition import TruncatedSVD
import scipy.spatial.distance as distance
import scipy.cluster.hierarchy as hierarchy
from scipy.stats import pearsonr

In [2]:
dat = pd.read_csv("../raw_data/local-dev/Questions.csv", encoding='latin1')
dat['Title'].fillna("None", inplace=True)
dat['Score'].fillna(0, inplace=True)

In [3]:
dat.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,17532666,2556416.0,2013-07-08T17:33:15Z,0,Migrating to Twitter API version 1.1 (?) [dupl...,<p>I am quite new to Twitter API. I have updat...
1,10540999,161085.0,2012-05-10T19:37:09Z,4,How to configure a Clojure library at runtime?,"<p>As a Clojure learning exercise, I am portin..."
2,34126234,3769629.0,2015-12-07T04:02:48Z,0,Handling JSON file with python,<p>I'm developing a plugin for sublime text an...
3,23721800,2474015.0,2014-05-18T12:19:40Z,0,Getting a MemoryError because list/array is to...,<h2>Problem</h2>\n\n<p>I have to download <cod...
4,10126239,891306.0,2012-04-12T14:59:27Z,3,Reading output with telnetlib in realtime,<p>I'm using Python's telnetlib to telnet to s...


In [4]:
# select a sample - results will improve without sampling in tf-idf caluculations, but due to
# Kaggle kernel memory limit we have to make a compromise here.
selected_ids = np.random.choice(range(dat.shape[0]), 10000, replace=False)
sample = dat.loc[selected_ids, :]
sample.shape

(10000, 6)

In [6]:
def purify_string(html):
    return re.sub('(\r\n)+|\r+|\n+', " ", re.sub('<[^<]+?>', '', html))

In [7]:
corpus = sample.ix[:, 'Body'].apply(purify_string)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [8]:
def combine_title_body(tnb):
    return tnb[0] + " " + tnb[1]

In [9]:
p = Pool(8)
combined_corpus = p.map(combine_title_body, zip(dat['Title'], corpus))
p.close()

In [10]:
combined_corpus[:2]

['Migrating to Twitter API version 1.1 (?) [duplicated] From the django default admin panel, I need to store all pin-codes that uploaded from an excel file for that I developed two models Pincodes and UploadPinFile. I am trying to save all pincodes to Pincode Model from UploadPinFile\'s save method but it returns me an error given bellow coercing to Unicode: need string or buffer, FieldFile found The two model defanitions are given below class UploadPincode(models.Model):     added_on = models.DateField(auto_now_add=True)     pincode_file = models.FileField(         upload_to="Pincodes/", verbose_name="Pincode Excel file (.xls,xlsx)", blank=False, null=False)     def save(self):         book = open_workbook(self.pincode_file)         for j in range(0, book.nsheets - 1):             sheet = book.sheet_by_index(j)             for i in range(sheet.nrows):                 if i != 0:                     a = sheet.row_values(i)                     int_pin = int(a[0])                     Pinc

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adantonison/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adantonison/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [15]:
lem = WordNetLemmatizer()
def cond_tokenize(t):
    if t is None:
        return []
    else:
        return [lem.lemmatize(w.lower()) for w in word_tokenize(t)]

p = Pool(8)
tokens = list(p.imap(cond_tokenize, combined_corpus))
p.close()

In [16]:
# stops = stopwords.words('english')
pure_tokens = [" ".join(sent) for sent in tokens]

In [17]:
vectorizer = TfidfVectorizer(min_df=1, max_features=2000, stop_words='english', ngram_range=[1, 1], sublinear_tf=True)
tfidf = vectorizer.fit_transform(pure_tokens)

In [18]:
idfs = pd.DataFrame([[v, k] for k, v in vectorizer.vocabulary_.items()], columns=['id', 'word']).sort_values('id')
idfs['idf'] = vectorizer.idf_
idfs.sort_values('idf').head(10)

Unnamed: 0,id,word,idf
71,1407,python,1.382679
51,1874,using,1.938125
64,406,code,1.943248
11,738,file,2.037147
78,1032,like,2.056228
103,910,import,2.140159
28,540,def,2.240391
154,1918,want,2.247328
115,1355,print,2.255366
17,675,error,2.263469


### Compress using SVD

In [19]:
tsvd = TruncatedSVD(n_components=500)
transformed = tsvd.fit_transform(tfidf)

In [20]:
np.sum(tsvd.explained_variance_ratio_)

0.607752559791084

In [21]:
transformed.shape

(10000, 500)

In [22]:
# calculate pairwise cosine distance
D = distance.pdist(transformed, 'cosine')

In [23]:
# hierarchical clustering - tree calculation
L = hierarchy.linkage(D)

In [24]:
# mean distance between clusters
np.mean(D)

0.9309327680889984

In [25]:
# split clusters by criterion. Here 0.71 is used as the inconsistency criterion. Adjust the
# number to change cluster sizes
cls = hierarchy.fcluster(L, 0.71, criterion='inconsistent')

In [26]:
df_cls = pd.DataFrame({'Pos': selected_ids, 'Cluster': cls})
cnts = df_cls.groupby('Cluster').size().sort_values(ascending=False)
cnts.sort_values(ascending=False).head()

Cluster
919     25
704     24
1245    21
1183    19
1396    17
dtype: int64

In [27]:
# add clusters to question data
bc = pd.concat([sample, df_cls.set_index('Pos')], axis=1)
bc.head()

Unnamed: 0_level_0,Id,OwnerUserId,CreationDate,Score,Title,Body,Cluster
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4851,27034153,3916946.0,2014-11-20T07:34:15Z,0,How to Process Uploaded file in Django Model s...,"<p>From the django default admin panel, I need...",2014
264,26100005,72911.0,2014-09-29T12:31:57Z,1,unittest.mock.patch: Context manager vs setUp/...,"<p>There seems to be 2 ways to use <a href=""ht...",2665
2837,14233996,429423.0,2013-01-09T11:14:46Z,1,Add new item to Dictionary dynamically using a...,<p>I'm currently reading values from file and ...,357
1657,8608341,814354.0,2011-12-22T18:38:05Z,4,"Documenting files with ""from x import *""",<p>Can sphinx's .. automodule:: and other auto...,156
468,37648213,945039.0,2016-06-06T00:48:10Z,0,How do I associate python with py files in bash,"<p>A common question is ""how do I make <em>my<...",6655


In [65]:
# calculate cluster stats
stats = bc.groupby('Cluster')['Score'].describe().unstack()

In [56]:
stats.sort_values('count', ascending=False).head(10)

ValueError: No axis named count for object type <class 'pandas.core.series.Series'>

In [None]:
plt.figure(figsize=(12, 8))
plt.hlines([0], xmin=0, xmax=np.max(stats['count']) + 5, alpha=0.5)
plt.vlines([1], ymin=0, ymax=np.max(stats['mean']) + 50, alpha=0.5)
plt.scatter(stats['count'], stats['mean'], alpha=0.3)
plt.title("cluster mean score vs cluster size")
plt.xlabel("cluster size")
plt.ylabel("mean score")
plt.show()

### Check if clusters make sense

In [None]:
bc.loc[bc['Cluster'] == cnts.index[0]][['Score', 'Title', 'Body']]

In [None]:
bc.loc[bc['Cluster'] == cnts.index[1]][['Score', 'Title', 'Body']]

In [None]:
bc.loc[bc['Cluster'] == cnts.index[2]][['Score', 'Title', 'Body']]

We can improve our clusters by increasing sample size, using entire dataset to calculate tf-idf, adjusting cluster splitting criterion, using non-exclusive clustering techniques etc.

Next steps:

 1. Use clusters and most significant words in questions to generate question tags automatically
 2. Use an autoencoder to perform semantical hashing for better estimates of question relatedness