In [6]:
import json

import numpy as np
import pandas as pd

import spacy
import en_core_web_sm
import pyinflect

import gensim.downloader as api


In [15]:
# small spaCy model
nlp = en_core_web_sm.load()

# GloVe wiki model
# note: the download takes a very long time if the model is not installed yet
model = api.load("glove-wiki-gigaword-100")

In [5]:
# Example dataset – how to pack the exercises (one row per exercise).
#
# The rows are collected in a plain list and the frame is built in a single
# call, instead of enlarging an empty frame row by row via df.loc[len(df)].
# Re-running the cell therefore always yields the same four rows with a
# regular 0..3 RangeIndex.

RAW_SENTENCE = 'All the necessary ingredients for a pizza arrived in the next delivery.'
GAPPED_SENTENCE = 'All the necessary _____ for a pizza arrived in the next delivery.'

exercises = [
    # pick the missing word from a list of options
    {'raw': RAW_SENTENCE,
     'transformed': GAPPED_SENTENCE,
     'type': 'select_word',
     'options': ['fabrics', 'dogs', 'ingredients'],
     'answer': 'ingredients'},

    # type the missing word; no options are offered
    {'raw': RAW_SENTENCE,
     'transformed': GAPPED_SENTENCE,
     'type': 'write_word',
     'options': None,
     'answer': 'ingredients'},

    # NOTE(review): options here are single words, same as for 'select_word' –
    # confirm whether 'select_sentence' should carry full sentences instead
    {'raw': RAW_SENTENCE,
     'transformed': GAPPED_SENTENCE,
     'type': 'select_sentence',
     'options': ['fabrics', 'dogs', 'ingredients'],
     'answer': 'ingredients'},

    # the sentence is shown unchanged; the options are dependency labels
    {'raw': RAW_SENTENCE,
     'transformed': RAW_SENTENCE,
     'type': 'select_pos',
     'options': ['nsubj', 'pobj'],
     'answer': 'nsubj'},
]

# `columns` pins the column order regardless of key order inside the dicts
df = pd.DataFrame(exercises, columns=['raw', 'type', 'transformed', 'options', 'answer'])
df

Unnamed: 0,raw,type,transformed,options,answer
0,All the necessary ingredients for a pizza arri...,select_word,All the necessary _____ for a pizza arrived in...,"[fabrics, dogs, ingredients]",ingredients
1,All the necessary ingredients for a pizza arri...,write_word,All the necessary _____ for a pizza arrived in...,,ingredients
2,All the necessary ingredients for a pizza arri...,select_sentence,All the necessary _____ for a pizza arrived in...,"[fabrics, dogs, ingredients]",ingredients
3,All the necessary ingredients for a pizza arri...,select_pos,All the necessary ingredients for a pizza arri...,"[nsubj, pobj]",nsubj


In [6]:
# Summary of the example exercise table: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      4 non-null      object
 1   exercise  4 non-null      object
 2   type      4 non-null      object
 3   options   3 non-null      object
 4   answer    4 non-null      object
dtypes: object(5)
memory usage: 192.0+ bytes


# Similar words

In [7]:
# Words closest to 'cosmic' in the embedding model, returned as (word, score) pairs
model.similar_by_word('cosmic')

[('planetary', 0.6683834195137024),
 ('gravity', 0.6506437659263611),
 ('galactic', 0.6414125561714172),
 ('primordial', 0.6334367990493774),
 ('gravitational', 0.6305052042007446),
 ('phenomena', 0.6299728751182556),
 ('earth', 0.6076391339302063),
 ('sonic', 0.6034857630729675),
 ('quantum', 0.5904744267463684),
 ('particle', 0.5867664217948914)]

In [19]:
# Analogy query: 'fast' and 'bad' are passed as positive terms, 'good' as a negative one
model.most_similar(positive=['fast','bad'], negative=['good'])

[('slow', 0.7502553462982178),
 ('slower', 0.6295009851455688),
 ('faster', 0.6158817410469055),
 ('too', 0.5972148180007935),
 ('turning', 0.5882929563522339),
 ('off', 0.5874745845794678),
 ('dangerous', 0.5860161185264587),
 ('worse', 0.5812638998031616),
 ('trouble', 0.5808587074279785),
 ('heavy', 0.5680885910987854)]

In [14]:
# Same analogy query, with 'negative' / 'positive' in place of 'bad' / 'good'
model.most_similar(positive=['fast','negative'], negative=['positive'])

[('slow', 0.7045032978057861),
 ('too', 0.6350111961364746),
 ('faster', 0.6330539584159851),
 ('turning', 0.6328635811805725),
 ('turn', 0.6146118640899658),
 ('rather', 0.6059424877166748),
 ('moving', 0.6030642986297607),
 ('low', 0.602108895778656),
 ('slower', 0.599425196647644),
 ('big', 0.5981428027153015)]

# Sentence transformation

In [9]:
# Replace every content word (noun, verb, adverb, adjective) of the sentence
# with one of its TOP_N nearest neighbours in the embedding model, picked at
# random. All other tokens are copied unchanged.
sentence = "All the necessary ingredients for a pizza arrived in the next delivery."
TOP_N = 5  # number of nearest neighbours to sample a replacement from
CONTENT_POS = ('NOUN', 'VERB', 'ADV', 'ADJ')

doc = nlp(sentence)
new_sent = []
for token in doc:
    replacement = token.text
    if token.pos_ in CONTENT_POS:
        try:
            neighbours = model.most_similar(token.text, topn=TOP_N)
        except KeyError:
            # The word is not in the embedding vocabulary: keep the original.
            # Only this expected failure is swallowed; anything else surfaces.
            neighbours = []
        if neighbours:
            replacement = neighbours[np.random.randint(0, len(neighbours))][0]
    new_sent.append(replacement)

print(sentence)
# Re-attach each token's own trailing whitespace, so punctuation stays glued
# to the preceding word instead of being separated by a space.
''.join(word + tok.whitespace_ for word, tok in zip(new_sent, doc))

All the necessary ingredients for a pizza arrived in the next delivery.


'All the needed spices for a burger arrive in the week supply .'

# Inflecting

In [13]:
# Print the 'JJS' (superlative) inflection of each adjective in the sentence
doc = nlp("I think it's a good idea and easy to use")
adjectives = [tok for tok in doc if tok.pos_ == 'ADJ']
for adj in adjectives:
    print(adj._.inflect('JJS'))

best
easiest


# Morphology

In [14]:
# One line per token: the token text followed by its morphological features
doc = nlp("I think it's a good idea and easy to use")
for tok in doc:
    print(tok.text, tok.morph, sep=' \t–\t ')


I 	–	 Case=Nom|Number=Sing|Person=1|PronType=Prs
think 	–	 Tense=Pres|VerbForm=Fin
it 	–	 Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
's 	–	 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
a 	–	 Definite=Ind|PronType=Art
good 	–	 Degree=Pos
idea 	–	 Number=Sing
and 	–	 ConjType=Cmp
easy 	–	 Degree=Pos
to 	–	 
use 	–	 VerbForm=Inf


# Dependency

In [15]:
# For every noun chunk print: chunk text, its root word, the root's
# dependency label, and the head word the root attaches to
doc = nlp("All the necessary ingredients for a pizza arrived in the next delivery")
for chunk in doc.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text, sep=' : ')

All the necessary ingredients : ingredients : nsubj : arrived
a pizza : pizza : pobj : for
the next delivery : delivery : pobj : in


In [16]:
# Dependency label of every token in the sentence
doc = nlp("All the necessary ingredients for a pizza arrived in the next delivery")
for tok in doc:
    print(tok.text, tok.dep_, sep=' : ')


All : predet
the : det
necessary : amod
ingredients : nsubj
for : prep
a : det
pizza : pobj
arrived : ROOT
in : prep
the : det
next : amod
delivery : pobj


# Gensim models and vectors

In [10]:
# Catalogue returned by gensim's downloader API; its 'corpora' and 'models'
# entries are listed in the cells below
info = api.info()
# Uncomment to dump the whole catalogue as pretty-printed JSON:
# print(json.dumps(info, indent=4))


In [11]:
# List the available corpora alphabetically: name, record count
# (-1 when the catalogue gives none) and the first 40 characters of the description
corpora = info['corpora']
for name in sorted(corpora):
    meta = corpora[name]
    n_records = meta.get('num_records', -1)
    summary = meta['description'][:40] + '...'
    print('%s (%d records): %s' % (name, n_records, summary))


20-newsgroups (18846 records): The notorious collection of approximatel...
__testing_matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Synopsis of t...
__testing_multipart-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Synopsis of t...
fake-news (12999 records): News dataset, contains text and metadata...
patent-2017 (353197 records): Patent Grant Full Text. Contains the ful...
quora-duplicate-questions (404290 records): Over 400,000 lines of potential question...
semeval-2016-2017-task3-subtaskA-unannotated (189941 records): SemEval 2016 / 2017 Task 3 Subtask A una...
semeval-2016-2017-task3-subtaskBC (-1 records): SemEval 2016 / 2017 Task 3 Subtask B and...
text8 (1701 records): First 100,000,000 bytes of plain text fr...
wiki-english-20171001 (4924894 records): Extracted Wikipedia dump from October 20...


In [12]:
# List the available pre-trained models alphabetically: name, record count
# (-1 when the catalogue gives none) and the first 40 characters of the description
models = info['models']
for name in sorted(models):
    meta = models[name]
    n_records = meta.get('num_records', -1)
    summary = meta['description'][:40] + '...'
    print('%s (%d records): %s' % (name, n_records, summary))


__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai