In [6]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
import gensim
import pandas as pd
import re
import nltk
import string
from tqdm import tqdm

In [7]:
# Raw string: keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences (`\m`, `\G`), which newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path; adjust it per environment.
model_path = r'D:\models\GoogleNews-vectors-negative300.bin'
# `limit=500000` caps how many word vectors are read from the binary file.
wv_embeddings = gensim.models.KeyedVectors.load_word2vec_format(fname=model_path, limit=500000, binary=True)

In [38]:
def read_corpus(filename):
    """Read a UTF-8, tab-separated text file into a list of rows.

    Each line is stripped of surrounding whitespace and split on tab
    characters.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    list[list[str]]
        One list of fields per line, in file order.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(filename, encoding='utf-8') as corpus_file:
        return [line.strip().split('\t') for line in corpus_file]

In [39]:
data = read_corpus('so_questions.tsv')

## Вопрос 1

**Входит ли слово cat в топ-5 близких слов к слову dog?**

In [8]:
# NOTE(review): the question above asks about the neighbours of 'dog', but this
# call lists the neighbours of 'cat'. Nearest-neighbour rank is not symmetric,
# so consider checking most_similar(positive=['dog'], topn=5) instead.
wv_embeddings.most_similar(positive=['cat'])

[('cats', 0.8099378943443298),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464984655380249),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150583267211914),
 ('puppy', 0.7075453996658325),
 ('pup', 0.6934290528297424),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931377410889),
 ('chihuahua', 0.6709762215614319)]

**Ответ: да.**

#### Векторные представления текста

In [8]:
tokenizer = RegexpTokenizer(r'\w+')

In [9]:
def question_to_vec(question, embeddings, dim=300):
    """Embed a text as the average of its known word vectors.

    The text is split with the notebook-level `tokenizer`; tokens missing
    from `embeddings` are ignored.

    Parameters
    ----------
    question : str
        Text to embed.
    embeddings : mapping supporting `in` and `[]`
        Word -> vector lookup.
    dim : int
        Length of the accumulator (and of the returned vector).

    Returns
    -------
    numpy.ndarray
        Float vector of length `dim`: the mean of the matched word vectors,
        or all zeros when no token is found in `embeddings`.
    """
    total = np.zeros(dim, dtype=float)
    hits = 0
    for token in tokenizer.tokenize(question):
        if token not in embeddings:
            continue
        total += embeddings[token]
        hits += 1

    # No known word: return the zero vector rather than dividing by zero.
    if hits == 0:
        return total
    return total / hits

In [23]:
q = 'I like dark themes. However, the default theme of Jupyter notebooks is light, and I cant find the option to change the theme/background-color. How is this done?'

In [27]:
question_to_vec(q, wv_embeddings)

array([ 0.09819946,  0.08552979,  0.02617798,  0.10978516, -0.05410885,
        0.01544922,  0.08193115, -0.07053467,  0.11806641,  0.0185083 ,
       -0.04598206, -0.07106689, -0.00578369, -0.01892578, -0.09141602,
        0.11324707,  0.0459082 ,  0.08457031,  0.0205896 , -0.17009521,
       -0.04350586,  0.12529785,  0.06759521, -0.02728973, -0.0307766 ,
       -0.00048126, -0.07328613,  0.04516602,  0.03177185,  0.0094281 ,
       -0.07897217,  0.03314283,  0.03263672,  0.03050682,  0.02820801,
        0.05157562, -0.02617188, -0.02082764,  0.03863037,  0.0336084 ,
        0.11003906,  0.01725708,  0.04221436, -0.0097171 ,  0.01095947,
        0.00229919, -0.0186731 , -0.04037598, -0.00190674,  0.0013504 ,
       -0.04914185,  0.07050293,  0.00696533, -0.05645508,  0.00242432,
        0.03453125, -0.02273071, -0.05413208,  0.02813232, -0.06486145,
        0.00998169,  0.06753906, -0.12299561, -0.06290344, -0.0172168 ,
       -0.03997559, -0.08246887,  0.13952148, -0.03980103,  0.06

#### Оценка близости текстов

In [90]:
def hits_count(dup_ranks, k):
    """Hits@k: fraction of ranks in `dup_ranks` that are less than or equal to `k`.

    Parameters
    ----------
    dup_ranks : sequence of int
        Rank values to evaluate.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Number of ranks <= k divided by len(dup_ranks).
    """
    ranks = np.asarray(dup_ranks)
    within_top_k = ranks <= k
    return np.sum(within_top_k) / len(dup_ranks)

def dcg_score(dup_ranks, k):
    """DCG@k: sum of 1 / log2(1 + rank) over ranks <= k, divided by len(dup_ranks).

    Parameters
    ----------
    dup_ranks : sequence of int
        Rank values to evaluate.
    k : int
        Cut-off rank; ranks above it contribute nothing.

    Returns
    -------
    float
        The averaged discounted gain.
    """
    ranks = np.asarray(dup_ranks)
    top_k = ranks[ranks <= k]
    gains = 1 / np.log2(1 + top_k)
    return np.sum(gains) / len(dup_ranks)

In [91]:
hits_count([2],4)

1.0

In [92]:
dcg_score([2,3,5],2)

0.2103099178571525

#### Протестируем функции

In [93]:
# Smoke test for hits_count / dcg_score on one hand-made example.
copy_answers = ["How does the catch keyword determine the type of exception that was thrown"]

candidates_ranking = [["How Can I Make These Links Rotate in PHP",
                       "How does the catch keyword determine the type of exception that was thrown",
                       "NSLog array description not memory address",
                       "PECL_HTTP not recognised php ubuntu"]]

# Pair every answer with its own ranking (previously ranking 0 was used for
# all answers) and convert the 0-based list position into a 1-based rank.
dup_ranks = [ranking.index(answer) + 1
             for answer, ranking in zip(copy_answers, candidates_ranking)]

print('Ваш ответ HIT:', [hits_count(dup_ranks,k) for k in range(1, 5)])
print('Ваш ответ DCG:', [round(dcg_score(dup_ranks,k), 5) for k in range(1, 5)])

Ваш ответ HIT: [0.0, 1.0, 1.0, 1.0]
Ваш ответ DCG: [0.0, 0.63093, 0.63093, 0.63093]


#### Ранжирование вопросов StackOverflow

In [95]:
len(data)

3760

Реализуйте функцию ранжирования кандидатов на основе косинусного расстояния. Функция должна по списку кандидатов вернуть отсортированный список пар (позиция в исходном списке кандидатов, кандидат). При этом позиция кандидата в полученном списке является его рейтингом (первый - лучший). Например, если исходный список кандидатов был [a, b, c], и самый похожий на исходный вопрос среди них - c, затем a, и в конце b, то функция должна вернуть список **[(2, c), (0, a), (1, b)]**.

In [96]:
from sklearn.metrics.pairwise import cosine_similarity
from copy import deepcopy

In [100]:
def rank_candidates(question, candidates, embeddings, dim=300):
    """Order `candidates` by cosine similarity to `question`, best first.

    Every text is embedded with `question_to_vec` and compared to the
    query with scikit-learn's `cosine_similarity`.

    Parameters
    ----------
    question : str
        Query text.
    candidates : sequence of str
        Texts to rank.
    embeddings : mapping supporting `in` and `[]`
        Word -> vector lookup passed to `question_to_vec`.
    dim : int
        Embedding dimensionality, forwarded to `question_to_vec`.

    Returns
    -------
    numpy.ndarray
        2-D array with one `[original position, candidate]` row per
        candidate, most similar first. Both columns share a single string
        dtype, so positions come back as strings ('0', '1', ...).
    """
    # Forward `dim` so a non-default dimensionality is actually honoured.
    vec_question = question_to_vec(question, embeddings, dim)
    vec_candidates = np.array([question_to_vec(candidate, embeddings, dim)
                               for candidate in candidates])
    rank = np.array([(position, candidate)
                     for position, candidate in enumerate(candidates)])
    similarities = cosine_similarity(vec_candidates, vec_question[np.newaxis, :])[:, 0]
    # Indexing with an integer array already produces a fresh array,
    # so no extra deepcopy is required.
    return rank[similarities.argsort()[::-1]]

In [101]:
questions = ['converting string to list', 'Sending array via Ajax fails'] 

candidates = [['Convert Google results object (pure js) to Python object', # первый эксперимент
               'C# create cookie from string and send it',
               'How to use jQuery AJAX for an outside domain?'],
              
              ['Getting all list items of an unordered list in PHP',      # второй эксперимент
               'WPF- How to update the changes in list item of a list',
               'select2 not displaying search results']]

**Ответ**

In [104]:
for question, q_candidates in zip(questions, candidates):
    ranks = rank_candidates(question, q_candidates, wv_embeddings, 300)
    print(ranks)

[['1' 'C# create cookie from string and send it']
 ['0' 'Convert Google results object (pure js) to Python object']
 ['2' 'How to use jQuery AJAX for an outside domain?']]
[['0' 'Getting all list items of an unordered list in PHP']
 ['2' 'select2 not displaying search results']
 ['1' 'WPF- How to update the changes in list item of a list']]


Теперь мы можем оценить качество нашего метода. Запустите следующие два блока кода для получения результата. Обратите внимание, что вычисление расстояния между векторами занимает некоторое время (примерно 10 минут).

In [105]:
# For every corpus row, rank its candidates and record where candidate 0 ends up.
wv_ranking = []
for line in data:
    # The first field of the row is the query; the remaining fields are candidates.
    q, *ex = line
    ranks = rank_candidates(q, ex, wv_embeddings)
    # rank_candidates returns a NumPy string array, so original position 0 shows
    # up as the string '0'; +1 turns the list index into a 1-based rank.
    # Presumably candidate 0 is the known duplicate of the query - verify against the dataset.
    wv_ranking.append([r[0] for r in ranks].index('0') + 1)

In [107]:
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), k, hits_count(wv_ranking, k)))

DCG@   1: 0.259 | Hits@   1: 0.259
DCG@   5: 0.314 | Hits@   5: 0.364
DCG@  10: 0.332 | Hits@  10: 0.420
DCG@ 100: 0.374 | Hits@ 100: 0.627
DCG@ 500: 0.401 | Hits@ 500: 0.840
DCG@1000: 0.418 | Hits@1000: 1.000


Если вы проделали все шаги правильно, то вы должны немного разочароваться полученными результатами. Давайте попробуем понять, почему качество модели такое низкое. Когда вы работаете с какими-либо данными, очень полезно первым делом посмотреть на них глазами. Выведем несколько вопросов из наших данных:

In [108]:
for line in data[:3]:
    q, *examples = line
    print(q, *examples[:3])
    print()

How to print a binary heap tree without recursion? How do you best convert a recursive function to an iterative one? How can i use ng-model with directive in angular js flash: drawing and erasing

How to start PhoneStateListener programmatically? PhoneStateListener and service Java cast object[] to model WCF and What does this mean?

jQuery: Show a div2 when mousenter over div1 is over when hover on div1 depenting on if it is on div2 or not it should act differently How to run selenium in google app engine/cloud? Python Comparing two lists of strings for similarities



#### Предобработка текста

In [110]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sergei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
stopWords = set(stopwords.words('english'))
len(stopWords)

179

In [135]:
tokenizer = RegexpTokenizer(r'\w+')
stopWords = set(stopwords.words('english'))

def text_prepare(text):
    """Lower-case `text`, tokenize it and drop stop words.

    Uses the notebook-level `tokenizer` and `stopWords`.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    kept = [token for token in tokenizer.tokenize(lowered) if token not in stopWords]
    return ' '.join(kept)

In [132]:
# Build the cleaned corpus directly: every field of every row goes through
# text_prepare. This avoids deep-copying `data` only to overwrite each element,
# and leaves `data` itself untouched.
new_data = [[text_prepare(field) for field in line] for line in tqdm(data)]

100%|███████████████████████████████████████████████████████████████████████████████████████████| 3760/3760 [18:03<00:00,  3.47it/s]


In [133]:
# Same evaluation as on the raw corpus, now on the preprocessed rows in `new_data`.
# NOTE(review): this rebinds `wv_ranking`, discarding the ranks computed on `data`.
wv_ranking = []

for line in tqdm(new_data):
    # The first field of the row is the query; the remaining fields are candidates.
    q, *ex = line
    ranks = rank_candidates(q, ex, wv_embeddings)
    # Positions come back as strings from rank_candidates, hence the lookup of '0';
    # +1 turns the list index into a 1-based rank.
    wv_ranking.append([r[0] for r in ranks].index('0') + 1)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 3760/3760 [05:01<00:00, 12.46it/s]


In [134]:
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), k, hits_count(wv_ranking, k)))

DCG@   1: 0.335 | Hits@   1: 0.335
DCG@   5: 0.406 | Hits@   5: 0.470
DCG@  10: 0.421 | Hits@  10: 0.517
DCG@ 100: 0.457 | Hits@ 100: 0.696
DCG@ 500: 0.477 | Hits@ 500: 0.850
DCG@1000: 0.493 | Hits@1000: 1.000


#### Визуализация (вопросы со Stack Overflow)

Сделаем плоский список и возьмем первые 10 000 вопросов, так как t-SNE будет долго считать.

In [4]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code - only load dumps
# that were produced by this notebook / a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open('new_data_flat.dmp', 'rb') as dump_file:
    new_data_flat = pickle.load(dump_file)

In [145]:
new_data_flat = [item for sublist in new_data for item in sublist]

In [5]:
len(new_data_flat)

3763048

In [10]:
# Embed only the first 10 000 flattened questions; so_embeddings[i] is the
# vector of new_data_flat[i].
so_embeddings = []
for q in tqdm(new_data_flat[:10000]):
    so_embeddings.append(question_to_vec(q, wv_embeddings))

100%|██████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 13097.07it/s]


In [None]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=2)
#so_embeddings_pca = pca.fit_transform(so_embeddings)

In [11]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue', width=600, height=400, show=True, **kwargs):
    """Draw a bokeh scatter plot of (x, y) points with hover tooltips.

    Parameters
    ----------
    x, y : sequences
        Point coordinates.
    radius : number
        Marker size passed to `scatter`.
    alpha : float
        Marker opacity.
    color : str or sequence
        A single colour name (repeated for every point) or one colour per point.
    width, height : int
        Figure dimensions.
    show : bool
        When true, the figure is displayed via `pl.show`.
    **kwargs
        Extra per-point columns; each one is added to the data source and
        shown in the hover tooltip under its keyword name.

    Returns
    -------
    The created bokeh figure.
    """
    if isinstance(color, str):
        color = [color] * len(x)

    columns = {'x': x, 'y': y, 'color': color}
    columns.update(kwargs)
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    hover_fields = [(key, "@" + key) for key in kwargs]
    fig.add_tools(bm.HoverTool(tooltips=hover_fields))

    if show:
        pl.show(fig)
    return fig

In [2]:

#pickle.dump(so_embeddings, open('so_embeddings.dmp','wb'))

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# t-SNE is stochastic; a fixed random_state makes the 2-D layout reproducible
# between runs.
tsne = TSNE(n_components=2, verbose=50, random_state=42)
so_embeddings_tsne = tsne.fit_transform(so_embeddings)
# Standardise both output axes (zero mean, unit variance) before plotting.
so_embeddings_tsne_scaled = StandardScaler().fit_transform(so_embeddings_tsne)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.203s...
[t-SNE] Computed neighbors for 10000 samples in 54.081s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.000000
[t-SNE] Computed conditional probabilities in 0.458s
[t-SNE] Iteration 50: error = 93.9200592, gradient norm = 0.0386176 (50 iterations in 2358.446s)
[t-SNE

In [None]:
output_notebook()
# Hover labels are the same questions that were embedded (the first 10 000
# entries of new_data_flat); the previous `words` name was never defined at
# notebook level.
draw_vectors(so_embeddings_tsne_scaled[:, 0], so_embeddings_tsne_scaled[:, 1], token=new_data_flat[:10000])

In [None]:
def find_closest_questions(question, k=5):
    """Return the positions of the `k` embedded questions most similar to `question`.

    The query is embedded with `question_to_vec` over the notebook-level
    `wv_embeddings` and compared by cosine similarity against the
    notebook-level `so_embeddings`.

    Parameters
    ----------
    question : str
        Query text (pass it through `text_prepare` first to match how the
        stored questions were preprocessed).
    k : int
        Maximum number of positions to return.

    Returns
    -------
    numpy.ndarray
        Integer indices into `so_embeddings` (and therefore into the slice of
        `new_data_flat` it was built from), most similar first.
    """
    vec_question = question_to_vec(question, wv_embeddings)
    similarities = cosine_similarity(np.array(so_embeddings), vec_question[np.newaxis, :])[:, 0]
    # argsort is ascending, so reverse it and keep the k best matches.
    index = similarities.argsort()[::-1][:k]
    return index

In [None]:
find_closest_questions(text_prepare("Why am I so stupid?"))