# Чанкинг - разбиение текстовых данных на информационные блоки

In [None]:
import nltk 
import numpy as np 
from nltk.corpus import brown

In [None]:
# Download the 'brown' NLTK package so that nltk.corpus.brown can be read
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [None]:
# Split a text into chunks of N words each
def chunker(input_data, N):
  '''Split a text into consecutive chunks of N space-separated words.

  input_data - the text to split (a string). Words are delimited by single
               space characters only, so newlines and tabs stay inside words.
  N          - number of words per chunk.

  Returns a list of strings. Every chunk holds N words except possibly the
  last one, which holds the remainder. When the number of words is an exact
  multiple of N, no empty trailing chunk is added.
  '''
  input_words = input_data.split(' ')
  output = []
  cur_chunk = []
  for word in input_words:
    cur_chunk.append(word)
    if len(cur_chunk) == N:
      output.append(' '.join(cur_chunk))
      cur_chunk = []
  # Flush the remainder only when there is one; an unconditional append
  # would add a spurious '' chunk after an exactly-filled last chunk.
  if cur_chunk:
    output.append(' '.join(cur_chunk))

  return output

In [None]:
if __name__ == '__main__':
  # Join the first 12,000 words of the Brown corpus into one string
  input_data = ' '.join(brown.words()[:12000])

  # Number of words per chunk
  chunk_size = 700

  chunks = chunker(input_data, chunk_size)
  print('\nNumber of text chunks =', len(chunks), '\n')
  # Preview the first 50 characters of every chunk, numbered from 1
  for number, chunk in enumerate(chunks, start=1):
    print('Chunk', number, '==>', chunk[:50])



Number of text chunks = 18 

Chunk 1 ==> The Fulton County Grand Jury said Friday an invest
Chunk 2 ==> '' . ( 2 ) Fulton legislators `` work with city of
Chunk 3 ==> . Construction bonds Meanwhile , it was learned th
Chunk 4 ==> , anonymous midnight phone calls and veiled threat
Chunk 5 ==> Harris , Bexar , Tarrant and El Paso would be $451
Chunk 6 ==> set it for public hearing on Feb. 22 . The proposa
Chunk 7 ==> College . He has served as a border patrolman and 
Chunk 8 ==> of his staff were doing on the address involved co
Chunk 9 ==> plan alone would boost the base to $5,000 a year a
Chunk 10 ==> nursing homes In the area of `` community health s
Chunk 11 ==> of its Angola policy prove harsh , there has been 
Chunk 12 ==> system which will prevent Laos from being used as 
Chunk 13 ==> reform in recipient nations . In Laos , the admini
Chunk 14 ==> . He is not interested in being named a full-time 
Chunk 15 ==> said , `` to obtain the views of the general publi
Chunk 16 ==> '' . M

In [None]:
text = '''
Long Short Term Memory cells are like mini neural networks designed to allow for memory in a larger neural network. This is achieved through the use of a recurrent node inside the LSTM cell. This node has an edge looping back on itself with a weight of one, meaning at every feedfoward iteration the cell can hold onto information from the previous step, as well as all previous steps. Since the looping connection’s weight is one, old memories wont fade over time like they would in traditional RNNs.
LTSMs and recurrent neural networks are as a result good at working with time series data thanks to their ability to remember the past. By storing some of the old state in these recurrent nodes, RNNs and LSTMs can reason about current information as well as information the network had seen one, ten or a thousand steps ago. Even better, I don’t have to write my own implementation of an LSTM cell; they’re a default layer in Tensorflow’s Keras.
So I had my plan; to use LSTMs and Keras to predict the stock market, and perhaps even make some money. The good thing about stock price history is that it’s basically a well labelled pre formed dataset. After some googling I found a service called AlphaVantage. They offered the daily price history of NASDAQ stocks for the past 20 years. This included the open, high, low, close and volume of trades for each day, from today all the way back up to 1999. Even better, a python wrapper exists for the service. I got my free API key from the website and downloaded Microsofts daily stock history.
'''

In [None]:
len(text)

1545

In [None]:
if __name__ == '__main__':
  # Use the sample paragraph stored in `text` as the input
  input_data = text

  # Number of words per chunk
  chunk_size = 10

  chunks = chunker(input_data, chunk_size)
  print('\nNumber of text chunks =', len(chunks), '\n')
  # Preview the first 10 characters of every chunk, numbered from 1
  for number, chunk in enumerate(chunks, start=1):
    print('Chunk', number, '==>', chunk[:10])


Number of text chunks = 28 

Chunk 1 ==> 
Long Shor
Chunk 2 ==> designed t
Chunk 3 ==> This is ac
Chunk 4 ==> inside the
Chunk 5 ==> back on it
Chunk 6 ==> every feed
Chunk 7 ==> the previo
Chunk 8 ==> the loopin
Chunk 9 ==> over time 
Chunk 10 ==> neural net
Chunk 11 ==> time serie
Chunk 12 ==> past. By s
Chunk 13 ==> recurrent 
Chunk 14 ==> as well as
Chunk 15 ==> or a thous
Chunk 16 ==> to write m
Chunk 17 ==> a default 
Chunk 18 ==> to use LST
Chunk 19 ==> and perhap
Chunk 20 ==> stock pric
Chunk 21 ==> pre formed
Chunk 22 ==> called Alp
Chunk 23 ==> stocks for
Chunk 24 ==> high, low,
Chunk 25 ==> from today
Chunk 26 ==> better, a 
Chunk 27 ==> my free AP
Chunk 28 ==> daily stoc


# Извлечение частотности слов с помощью модели Bag of words

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Number of words in each chunk
chunk_size = 800

In [None]:
# Join the first 5,400 words of the Brown corpus into one string
input_data = ' '.join(brown.words()[:5400])

In [None]:
# Split the text into chunks of chunk_size words
text_chunks = chunker(input_data, chunk_size)

In [None]:
# Wrap every chunk in a dict that records its position and its text
chunks = [
    {'index': position, 'text': chunk_text}
    for position, chunk_text in enumerate(text_chunks)
]

In [None]:
# Build the document-term matrix from the chunk texts.
# NOTE(review): integer min_df / max_df are presumably absolute document
# counts (see the scikit-learn docs); if so, max_df=20 cannot exclude anything
# when there are fewer than 20 chunks - confirm this is intended.
count_vectorizer = CountVectorizer(min_df=7, max_df=20)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])

In [None]:
document_term_matrix

<7x21 sparse matrix of type '<class 'numpy.int64'>'
	with 147 stored elements in Compressed Sparse Row format>

In [None]:
# Extract and display the vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()
# dtype=str keeps a plain unicode array regardless of which API was used
vocabulary = np.array(feature_names, dtype=str)
print("\nVocabulary:\n", vocabulary)


Vocabulary:
 ['and' 'are' 'be' 'by' 'county' 'for' 'in' 'is' 'it' 'of' 'on' 'one'
 'said' 'state' 'that' 'the' 'to' 'two' 'was' 'which' 'with']


In [None]:
# Build a 1-based display label for every chunk
chunk_names = ['Chunk-' + str(number)
               for number in range(1, len(text_chunks) + 1)]

In [None]:
chunk_names

['Chunk-1', 'Chunk-2', 'Chunk-3', 'Chunk-4', 'Chunk-5', 'Chunk-6', 'Chunk-7']

In [None]:
# Print the document-term matrix, one row per vocabulary word
print("\nDocument term matrix:")
formatted_text = '{:>12}' * (len(chunk_names) + 1)
print('\n', formatted_text.format('Word', *chunk_names), '\n')
for word, item in zip(vocabulary, document_term_matrix.T):
    # 'item' is a sparse row holding this word's count in every chunk.
    # Its .data attribute contains only the stored (non-zero) entries, so a
    # word absent from some chunk would give too few values for the format
    # string and shift the columns; densify the row to keep one value per chunk.
    counts = item.toarray().ravel()
    output = [word] + [str(freq) for freq in counts]
    print(formatted_text.format(*output))


Document term matrix:

         Word     Chunk-1     Chunk-2     Chunk-3     Chunk-4     Chunk-5     Chunk-6     Chunk-7 

         and          23           9           9          11           9          17          10
         are           2           2           1           1           2           2           1
          be           6           8           7           7           6           2           1
          by           3           4           4           5          14           3           6
      county           6           2           7           3           1           2           2
         for           7          13           4          10           7           6           4
          in          15          11          15          11          13          14          17
          is           2           7           3           4           5           5           2
          it           8           6           8           9           3           1           2
   

# Создание  прогнозатора категорий

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Map each newsgroup category name to a short display label
category_map = {
    'talk.politics.misc': 'Politics',
    'rec.autos': 'Autos',
    'rec.sport.hockey': 'Hockey',
    'sci.electronics': 'Electronics',
    'sci.med': 'Medicine',
}

In [None]:
# Fetch the training subset restricted to the mapped categories
training_data = fetch_20newsgroups(
    subset='train',
    categories=list(category_map),
    shuffle=True,
    random_state=5,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Create the count vectorizer and extract term counts from the training documents
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", train_tc.shape)


Dimensions of training data: (2844, 40321)


In [None]:
# Create the tf-idf transformer and fit it on the training term counts
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

In [None]:
# Define the test sentences to classify.
# Keep them plain ASCII: Cyrillic look-alike letters (e.g. in "be" or "A")
# become tokens that cannot match the English training vocabulary.
input_data = [
    'You need to be careful with cars when you are driving slippery roads',
    'A lot of devices can be operated wirelessly',
    'Players need to be careful when they are close to goal posts',
    'Political debates help us understand the perspectives of both sides',
]


In [None]:
# Train a multinomial Naive Bayes classifier on the tf-idf features
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

In [None]:
# Transform the input sentences with the count vectorizer
# that was fitted on the training data
input_tc = count_vectorizer.transform(input_data)

In [None]:
# Transform the vectorized input with the fitted tf-idf transformer
input_tfidf = tfidf.transform(input_tc)

In [None]:
# Predict the output category of every input sentence
predictions = classifier.predict(input_tfidf)

In [None]:
# Report the predicted category label for every input sentence
for sentence, target_index in zip(input_data, predictions):
  newsgroup = training_data.target_names[target_index]
  print('\ninput: ', sentence, '\nPredicted category: ',
        category_map[newsgroup])


input:  You need to be careful with cars when you are driving slippery roads 
Predicted category:  Autos

input:  А lot of devices can Ье operated wirelessly 
Predicted category:  Electronics

input:  Players need to Ье careful when they are close to goal posts 
Predicted category:  Hockey

input:  Political debates help us understand the perspectives of both sides 
Predicted category:  Politics


# Создание анализатора грамматических родов

In [None]:
import random
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names

In [None]:
# Use the lower-cased last N letters of a word as its single "feature"
def extract_features(word, N=2):
  '''Return a one-entry feature dict built from the ending of the word.

  word - input string
  N    - number of trailing characters to keep (default 2)
  '''
  suffix = word[-N:].lower()
  return {'feature': suffix}

In [None]:
# Download the 'names' NLTK package so that nltk.corpus.names can be read
nltk.download('names')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [None]:
if __name__ == '__main__':
    # Build labelled (name, gender) pairs from the name lists
    # shipped with NLTK
    male_list = [(male_name, 'male')
                 for male_name in names.words('male.txt')]
    female_list = [(female_name, 'female')
                   for female_name in names.words('female.txt')]
    data = [*male_list, *female_list]

In [None]:
# Seed the random number generator so the shuffle is reproducible
random.seed(5)
# Shuffle the labelled names in place
random.shuffle(data)

In [None]:
# Sample names to classify
input_names = ['Alexander', 'Danielle', 'David', 'Cheryl']

In [None]:
# Number of samples used for training: 80% of the data
# (the rest is used for testing)
num_train = int(0.8 * len(data))

In [None]:
# Compare classifier accuracy for different suffix lengths (1 to 5 letters)
for i in range(1, 6):
  print('\nNumber of end letters: ', i)
  features = [(extract_features(n, i), gender) for (n, gender) in data]
  # First num_train samples train the model, the remainder evaluates it
  train_data, test_data = features[:num_train], features[num_train:]
  classifier = NaiveBayesClassifier.train(train_data)
  # Accuracy on the test part, as a percentage rounded to 2 decimals
  accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
  print('Accuracy = ' + str(accuracy) + ' %')
  # Predict the label of every sample name
  # with the classifier that was just trained
  for name in input_names:
    print(name, '==>', classifier.classify(extract_features(name, i)))


NшnЬer of end letters:  1
Accuracy = 74.7 %
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> male

NшnЬer of end letters:  2
Accuracy = 78.79 %
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female

NшnЬer of end letters:  3
Accuracy = 77.22 %
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female

NшnЬer of end letters:  4
Accuracy = 69.98 %
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female

NшnЬer of end letters:  5
Accuracy = 64.63 %
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female


# Создание сентимент-анализатора

In [None]:
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

In [None]:
# Turn a list of words into a feature dict
def extract_features(words):
  '''Return a dict mapping every distinct word in `words` to True.'''
  return {word: True for word in words}

In [None]:
# Download the 'movie_reviews' NLTK package so that nltk.corpus.movie_reviews can be read
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
if __name__ =='__main__':
  # Load the file ids of the 'pos' and 'neg' review categories
  fileids_pos = movie_reviews.fileids('pos') 
  fileids_neg = movie_reviews.fileids('neg')

In [None]:
# Build one (feature dict, label) pair per review file
features_pos = [
    (extract_features(movie_reviews.words(fileids=[file_id])), 'Positive')
    for file_id in fileids_pos
]
features_neg = [
    (extract_features(movie_reviews.words(fileids=[file_id])), 'Negative')
    for file_id in fileids_neg
]

In [None]:
# Fraction of each class used for training (80%);
# the remaining 20% is used for testing
threshold = 0.8
num_pos = int(threshold * len(features_pos)) 
num_neg = int(threshold * len(features_neg))

In [None]:
# Build the training and test sets: the first num_pos / num_neg items
# of each class go to training, the rest to testing
features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]


In [None]:
# Print the number of data points in each set
print('\nNumber of training datapoints: ', len(features_train))
print('Number of test datapoints: ', len(features_test))


Number of training datapoints:  1600
Number of test datapoints:  400


In [None]:
# Train a Naive Bayes classifier and report its accuracy on the test set
classifier = NaiveBayesClassifier.train(features_train)
print('\nAccuracy of the classifier: ', nltk_accuracy(classifier, features_test))


Accuracy of the classifier:  0.735


In [None]:
# List the N most informative features (words) of the trained classifier
N = 15
print('\nTop ' + str(N) + ' most informative words: ')
# most_informative_features(n) limits the result to n entries itself,
# so no manual break is needed
for i, item in enumerate(classifier.most_informative_features(N)):
  # item[0] is the feature name, i.e. the word
  print(str(i+1) + '. ' + item[0])


Top 15most inforrnative words: 
1. outstanding
2. insulting
3. vulnerable
4. ludicrous
5. uninvolving
6. astounding
7. avoids
8. fascination
9. seagal
10. darker
11. anna
12. symbol
13. affecting
14. animators
15. idiotic


In [None]:
# Sample movie reviews to classify.
# Spelling matters: each word is used as a feature key, so a misspelled or
# non-ASCII look-alike word cannot match any feature seen during training.
input_reviews = [
    'The costumes in this movie were great',
    'I think the story was terrible and the characters were very weak',
    'People say that the director of the movie is amazing',
    'This is such an idiotic movie. I will not recommend it to anyone.',
]

In [None]:
print("\nMovie review predictions:")
for review in input_reviews:
  print("\nReview:", review)
  # Split the review on whitespace and get the label probabilities
  review_features = extract_features(review.split())
  probabilities = classifier.prob_classify(review_features)
  # Pick the most probable label and its probability
  predicted_sentiment = probabilities.max()
  confidence = round(probabilities.prob(predicted_sentiment), 2)
  # Print the result
  print("Predicted sentiment", predicted_sentiment)
  print("Probability", confidence)


Movie review predictions:

Review: The costumes in this rnovie were great
Predicted sentiment Positive
Probability 0.61

Review: I think the story was terriЫe and the characters were very weak
Predicted sentiment Negative
Probability 0.58

Review: People say that the director of the movie is amazing
Predicted sentiment Positive
Probability 0.6

Review: This is such an idiotic movie. I will not recornrnend it to anyone.
Predicted sentiment Negative
Probability 0.89


# Тематическое моделирование с использованием Латентного размещения Дирихле

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from gensim import models, corpora

In [None]:
# Load the input data
def load_data(input_file):
  '''Read a text file and return its lines without the line terminator.

  input_file - path of the file to read.

  Returns a list of strings, one per line of the file.
  '''
  with open(input_file, 'r') as f:
    # Strip only the newline: blindly cutting the last character would
    # truncate a final line that does not end with a newline.
    return [line.rstrip('\n') for line in f]

In [None]:
# Processing function: tokenizes the text, removes stop words
# and stems the remaining tokens
def process(input_text):
  '''Return the list of stemmed, stop-word-free tokens of input_text.

  input_text - the string to process; it is lower-cased before tokenizing.
  '''
  # Tokenizer that extracts runs of word characters
  tokenizer = RegexpTokenizer(r'\w+')
  # Snowball stemmer for English
  stemmer = SnowballStemmer('english')
  # English stop words, kept in a set for fast membership tests
  stop_words = set(stopwords.words('english'))
  # Tokenize the lower-cased input string
  tokens = tokenizer.tokenize(input_text.lower())
  # Remove the stop words
  tokens = [x for x in tokens if x not in stop_words]
  # Stem the remaining tokens
  tokens_stemmed = [stemmer.stem(x) for x in tokens]
  return tokens_stemmed

In [None]:
# Download the 'stopwords' NLTK package so that nltk.corpus.stopwords can be read
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
if __name__ == '__main__':
  # Load the input data (one document per line)
  data = load_data('data.txt')
  # Build the token list of every document
  tokens = [process(x) for x in data]
  # Build a dictionary from the tokenized documents
  dict_tokens = corpora.Dictionary(tokens)
  # Build the document-term matrix (bag-of-words per document)
  doc_term_mat = [dict_tokens.doc2bow(token) for token in tokens]
  # Number of topics for the LDA model
  num_topics = 2
  # Fit the LDA model
  ldamodel = models.ldamodel.LdaModel(doc_term_mat, num_topics=num_topics, id2word=dict_tokens, passes=25)
  num_words = 5
  print('\nTop ' + str(num_words) + ' contributing words to each topic:')
  for item in ldamodel.print_topics(num_topics=num_topics, num_words=num_words):
    print('\nTopic', item[0])
    # Print the representative words with their relative contributions.
    # This must stay inside the topic loop: otherwise only the words of the
    # last topic are printed. item[1] is 'weight*word' terms joined by ' + '.
    list_of_strings = item[1].split(' + ')
    # 'term' rather than 'text', so the sample paragraph stored in the
    # module-level name 'text' is not overwritten
    for term in list_of_strings:
      parts = term.split('*')
      weight, word = parts[0], parts[1]
      print(word, '==>', str(round(float(weight) * 100, 2)) + '%')


Top 5 contributing words to each topic:

Topic 0

Topic 1
"empir" ==> 3.8%
"time" ==> 2.7%
"peopl" ==> 2.7%
"histor" ==> 2.7%
"expand" ==> 2.7%
