# Clustering queries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction
import matplotlib.pyplot as plt
import nltk

# CSV of per-query features; it is loaded with pd.read_csv further down.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
file_query_features = '/data/khodadaa/stack_results/stack_feat/18_ml_in.csv'

In [2]:
# cite 
# https://stackoverflow.com/questions/17390326/getting-rid-of-stop-words-and-document-tokenization-using-nltk

import string

# English stop words plus every single punctuation character; tokenize()
# discards any token found here.
stopwords = nltk.corpus.stopwords.words('english') + list(string.punctuation)
# Hashed snapshot of `stopwords` for the per-token membership test. `stopwords`
# itself stays a list so other code can keep using it as one; rebuild this
# set if `stopwords` is modified afterwards.
_stopword_set = frozenset(stopwords)

def tokenize(text: str) -> list:
    """Lower-case `text`, split it with nltk.word_tokenize and drop stop tokens.

    Args:
        text: the raw string to tokenize.

    Returns:
        The tokens, in their original order, that are not in the stop-word /
        punctuation collection built above.
    """
    tokens = nltk.word_tokenize(text.lower())
    # Set lookup instead of scanning the stop-word list once per token.
    return [t for t in tokens if t not in _stopword_set]

In [3]:
# Load the query-feature table from the CSV named by file_query_features.
df_qf = pd.read_csv(file_query_features)
# Bare last expression: displays the column index so the available features are visible.
df_qf.columns

Index(['Query', 'covered_t_18', 'mean_df_t_18', 'min_df_t_18',
       'mean_mean_pop_t_18', 'mean_min_pop_t_18', 'min_mean_pop_t_18',
       'min_min_pop_t_18', 'ql_t_18', 'qll_t_18', 'covered_t_bi_18',
       'mean_df_t_bi_18', 'min_df_t_bi_18', 'mean_mean_pop_t_bi_18',
       'mean_min_pop_t_bi_18', 'min_mean_pop_t_bi_18', 'min_min_pop_t_bi_18',
       'ql_t_bi_18', 'qll_t_bi_18', 'scs_t_18', 'maxSCQ_t_18', 'covered_t_c18',
       'mean_df_t_c18', 'min_df_t_c18', 'mean_mean_pop_t_c18',
       'mean_min_pop_t_c18', 'min_mean_pop_t_c18', 'min_min_pop_t_c18',
       'ql_t_c18', 'qll_t_c18', 'covered_t_bi_c18', 'mean_df_t_bi_c18',
       'min_df_t_bi_c18', 'mean_mean_pop_t_bi_c18', 'mean_min_pop_t_bi_c18',
       'min_mean_pop_t_bi_c18', 'min_min_pop_t_bi_c18', 'ql_t_bi_c18',
       'qll_t_bi_c18', 'scs_t_c18', 'maxSCQ_t_c18', 'TestViewCount', '18',
       '100', 'ql_t', 'ql_t.1', 'Y', 'Label'],
      dtype='object')

In [4]:
# Target is the 'Label' column; the feature frame keeps every column except
# 'Label' and 'Y'.
y = df_qf['Label']
X = df_qf[df_qf.columns.difference(['Label', 'Y'])]
# Stratified split with 33% held out. random_state is pinned (same value the
# KMeans call in this notebook uses) so that the split — and everything later
# selected through train_X.index — is the same on every run.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, stratify=y, test_size=0.33, random_state=5
)

In [5]:
%time df_qf['tokens'] = df_qf.Query.apply(tokenize)

CPU times: user 3min 22s, sys: 1.11 s, total: 3min 23s
Wall time: 3min 29s


In [12]:
# cite:
# http://brandonrose.org/clustering
from sklearn.feature_extraction.text import TfidfVectorizer
%time tfidf_vectorizer = TfidfVectorizer(max_df=0.9, \
                                         max_features= 5000, \
                                         stop_words=nltk.corpus.stopwords.words('english') + list(string.punctuation), \
                                         lowercase=True, \
                                         use_idf=True, \
                                         tokenizer=nltk.word_tokenize, ngram_range = (1,2))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(df_qf.loc[train_X.index, 'Query'])


CPU times: user 466 µs, sys: 981 µs, total: 1.45 ms
Wall time: 3.47 ms
CPU times: user 2min 25s, sys: 3.4 s, total: 2min 28s
Wall time: 2min 28s


In [34]:
from sklearn.cluster import KMeans
from sklearn.externals import joblib

num_clusters = 20
km = KMeans(n_clusters = num_clusters, random_state=5, max_iter=100)
joblib.dump(km, '/data/khodadaa/stack_results/cluster_train.pkl')

%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 52min 21s, sys: 1min 47s, total: 54min 8s
Wall time: 10min 43s


In [35]:
# Work on an independent copy so that adding columns later leaves train_X untouched.
n_X = train_X.copy(deep=True)

In [36]:
# Attach the k-means cluster id of each training row, then separate the rows
# by label value (train_y == 0 vs. train_y == 1).
n_X['cluster'] = clusters
c0 = n_X.loc[train_y == 0]
c1 = n_X.loc[train_y == 1]

In [37]:
# Fraction of the label-0 training rows that falls into each cluster.
c0['cluster'].value_counts().div(len(c0))

0     0.580052
14    0.052362
1     0.044727
16    0.036780
12    0.033565
4     0.029369
2     0.027771
19    0.025402
17    0.019771
9     0.018167
13    0.018038
18    0.016671
6     0.015040
10    0.014241
8     0.014180
3     0.013023
5     0.012833
15    0.012576
7     0.008474
11    0.006958
Name: cluster, dtype: float64

In [38]:
# Fraction of the label-1 training rows that falls into each cluster.
c1['cluster'].value_counts().div(len(c1))

0     0.598871
14    0.053302
1     0.042088
16    0.034665
12    0.030726
4     0.028037
19    0.023864
13    0.021571
17    0.019241
2     0.018431
9     0.015961
10    0.015295
18    0.014776
15    0.014713
3     0.014653
8     0.014026
6     0.011922
5     0.011177
7     0.008643
11    0.008036
Name: cluster, dtype: float64