# Домашнее задание 2. Извлечение коллокаций + NER

Ссылка на данные: https://drive.google.com/file/d/1GTR4FrgAWRyA742s0-7uRpWml8dou925/view?usp=sharing

## Создаем корпус

In [96]:
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
import pandas as pd

In [97]:
# Load the reviews into a DataFrame: the file is parsed as line-delimited JSON
# (lines=True) and the compression is inferred from the file name.
data = pd.read_json('Cell_Phones_and_Accessories_5.json.gz', compression='infer', lines=True)

In [110]:
# Keep only the rows whose index labels lie between 0 and 9999 (both ends inclusive).
sample = data.truncate(0, 9999)

In [128]:
# Sanity check on the number of rows kept in the sample.
len(sample)

10000

In [112]:
# Shared morphological analyzer; normalize() takes the normal form of a token's first parse.
# NOTE(review): pymorphy2 is built for Russian morphology, while the stop-word
# list used next to it is English. It presumably leaves English tokens
# unchanged (the printed results keep "phone" and "phones" apart) — confirm
# whether an English lemmatizer was intended here.
m = MorphAnalyzer()

In [113]:
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK stop-word corpus if needed and keep the English list as a set,
# since it is only used for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# A token is a maximal run of word characters, so punctuation never becomes a token.
tokenizer = RegexpTokenizer(r'\w+')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anastasia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [114]:
def normalize(text):
    """Turn raw text into a list of normalized tokens without stop words.

    The text is split with the module-level ``tokenizer``, every token is
    lower-cased, tokens present in ``stop_words`` are dropped, and each
    remaining token is replaced by the ``normal_form`` of its first parse
    from the module-level analyzer ``m``.

    Returns:
        A list of strings in the order the tokens occur in ``text``.
    """
    lowered = (token.lower() for token in tokenizer.tokenize(text))
    return [
        m.parse(token)[0].normal_form
        for token in lowered
        if token not in stop_words
    ]

In [115]:
# One list of normalized tokens per review, in the sample's row order.
corpus = [normalize(review) for review in sample['reviewText']]

## Создаём список сущностей

In [116]:
# Seed terms whose collocations are of interest.
list_of_NEs = [
    'phone',
    'smartphone',
    'charger',
    'headset',
    'usb',
    'android',
    'model',
]

In [117]:
import gensim.downloader as api

# Metadata returned by the downloader; `info` is not used anywhere in the visible code.
info = api.info()

# Pretrained word vectors used to look up neighbours of the seed terms.
model = api.load("glove-wiki-gigaword-50")
# A larger model could probably be used here, but it refused to load for me.

In [118]:
# Expand the seed terms with their nearest neighbours in the embedding model.
# The result keeps first-seen order and contains every term at most once.
final_NEs = []

for word in list_of_NEs:
    # normalize() returns an empty list when the seed is a stop word (or has
    # no word characters), so guard the [0] lookup instead of letting it
    # raise IndexError. A seed already added as an earlier seed's neighbour
    # is not appended a second time.
    normalized = normalize(word)
    if normalized and normalized[0] not in final_NEs:
        final_NEs.append(normalized[0])

    try:
        sim_words = model.most_similar(positive=[word], topn=3)
    except KeyError:
        # gensim signals an out-of-vocabulary word with KeyError; report which
        # seed was skipped and carry on. Any other exception is a real bug
        # and is allowed to propagate.
        print('error', word)
        continue

    # most_similar yields (word, similarity) pairs; only the word is kept.
    for sim_word in sim_words:
        if sim_word[0] not in final_NEs:
            final_NEs.append(sim_word[0])

In [119]:
# Show the expanded term list: each seed term followed by its embedding neighbours.
print(final_NEs)

['phone', 'telephone', 'phones', 'cellphone', 'smartphone', 'iphone', 'smartphones', 'ipad', 'charger', 'volt', 'turbo', 'ev1', 'headset', 'headphones', 'headsets', 'earphones', 'usb', 'adapter', 'firewire', 'plugs', 'android', 'ios', 'model', 'models', 'design', 'concept']


## Ищем биграммы

In [120]:
# Association measures used below to score bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Build the bigram finder from the per-review token lists, then discard
# bigrams that occur fewer than 10 times.
finder = BigramCollocationFinder.from_documents(corpus)
finder.apply_freq_filter(10)

In [121]:
# Score the remaining bigrams with three association measures. Each result is a
# sequence of ((word1, word2), score) pairs; per the NLTK documentation it is
# ordered from the highest score to the lowest.
likelihood_ngrams = finder.score_ngrams(bigram_measures.likelihood_ratio)
pmi_ngrams = finder.score_ngrams(bigram_measures.pmi)
dice_ngrams = finder.score_ngrams(bigram_measures.dice)

In [122]:
def find_relevant(input_ngrams, entities=None):
    """Group scored n-grams by the entity word they contain.

    Args:
        input_ngrams: Sequence of ``(ngram, score)`` entries where ``ngram``
            is a tuple of tokens. It is traversed once per entity, so it must
            be re-iterable (a list, not a generator).
        entities: Iterable of entity words to look for. Defaults to the
            module-level ``final_NEs``, which is what the function always
            used before this parameter existed.

    Returns:
        A dict mapping every entity word to the list of entries whose n-gram
        contains that exact token. Entries keep their order from
        ``input_ngrams``; entities without any match map to an empty list.
    """
    if entities is None:
        entities = final_NEs
    # ``word in entry[0]`` is tuple membership, i.e. an exact token match:
    # 'phone' does not match the token 'phones'.
    return {
        word: [entry for entry in input_ngrams if word in entry[0]]
        for word in entities
    }

In [123]:
# For each scoring, group the bigrams by the entity term they contain.
relevant_likelihood = find_relevant(likelihood_ngrams)
relevant_pmi = find_relevant(pmi_ngrams)
relevant_dice = find_relevant(dice_ngrams)

In [124]:
def print_any_num(input_ngrams, num, entities=None):
    """Print up to ``num`` leading bigrams for every entity that has any.

    For each entity with at least one stored entry, a header of the form
    ``---`` / entity / ``---`` is printed, followed by one line per bigram
    with its two words separated by a space. Entities with no entries are
    skipped entirely.

    Args:
        input_ngrams: Mapping from entity word to a list of
            ``((word1, word2), score)`` entries, such as the dict returned by
            ``find_relevant``. Entries are printed in their stored order.
        num: Maximum number of bigrams to print per entity. Entities with
            fewer entries print all of them. For ``num <= 0`` nothing is
            printed (previously a bare header was printed for every entity).
        entities: Iterable of entity words to print, in order. Defaults to
            the module-level ``final_NEs``, which is what the function always
            used before this parameter existed.

    Raises:
        KeyError: If an entity is missing from ``input_ngrams``.
    """
    if entities is None:
        entities = final_NEs
    # Clamp so a negative ``num`` cannot turn into a slice from the end.
    limit = max(num, 0)
    for word in entities:
        # One slice covers both "at least num entries" and "fewer than num".
        top = input_ngrams[word][:limit]
        if not top:
            continue
        print('\n---\n', word, '\n---')
        for entry in top:
            print(entry[0][0], entry[0][1])

In [125]:
# Print the first five bigrams per entity from the likelihood-ratio results.
print_any_num(relevant_likelihood, 5)


---
 phone 
---
cell phone
smart phone
phone calls
new phone
speaker phone

---
 phones 
---
cell phones
smart phones
different phones
android phones
two phones

---
 iphone 
---
iphone 4s
iphone 3g
iphone 5
iphone 4
iphone 3gs

---
 ipad 
---
charge ipad
ipad mini
iphone ipad
ipad 2
ipad iphone

---
 charger 
---
car charger
wall charger
usb charger
ac charger
battery charger

---
 headset 
---
bluetooth headset
bt headset
wired headset
headset vibrating
headset ever

---
 headphones 
---
stereo headphones
bluetooth headphones
wireless headphones
normal headphones
use headphones

---
 headsets 
---
bluetooth headsets
bt headsets
headsets used
different headsets
wired headsets

---
 usb 
---
micro usb
mini usb
usb port
usb cable
usb ports

---
 adapter 
---
ac adapter
power adapter
white adapter
wall adapter
headphone adapter

---
 plugs 
---
ear plugs

---
 android 
---
android phones
android phone

---
 model 
---
another model

---
 models 
---
newer models
different models

---
 d