In [246]:
import markovify

In [247]:
training_dir = "./sources/training/"
test_dir = "./sources/test/"

In [248]:
# Get raw text as string.
with open(training_dir+"A_A_Milne_-_Chatka_Puchatka.txt") as f:
    text = f.read()

In [249]:
# Build the model.
text_model = markovify.Text(text)

In [250]:
model = text_model.to_dict()

print('state_size:', model['state_size'])
# print('chain:', model['chain'])

state_size: 2


In [251]:
# Print a randomly-generated sentence
for i in range(1):
    print(text_model.make_sentence())

Nie słyszałem ich dobrze i w chwilę potem okazało się, że on już przedtem wiedział.


In [252]:
import os

In [253]:
# Extract source files and authors

files = sorted(os.listdir(training_dir))
files = files[:50]
authors = {}

for s in files:
    with open(s) as f:
    text = f.read()
    
    no_extension = s[0:s.find('.')]
    try:
        index = s.index('-')
    except ValueError:
        index = len(s)
    author = no_extension[:index]
    assigned_files = authors.get(author, [])
    assigned_files.append(s)
    authors[author] = assigned_files
    
for k,v in authors.items():
    print(k)
    print(v)

A_A_Milne_
['A_A_Milne_-_Chatka_Puchatka.txt', 'A_A_Milne_-_Kubus_Puchatek.txt']
A_E_Van_Vogt_
['A_E_Van_Vogt_-_Sklepy_z_bronia_na_Isher.txt']
A_Imie_Jej_Ciemnosc
['A_Imie_Jej_Ciemnosc.txt']
A_S_LaVey_
['A_S_LaVey_-_Biblia_Szatana.txt']
A_i_B_Strugaccy_
['A_i_B_Strugaccy_-_Miliard_lat_przed_koncem_swiata.txt', 'A_i_B_Strugaccy_-_Pora_deszczow.txt']
Abe_Kobo_
['Abe_Kobo_-_Czwarta_epoka.txt', 'Abe_Kobo_-_Kobieta_z_Wydm.txt']
Adam_Bilikiewicz_
['Adam_Bilikiewicz_-_Psychiatria.txt']
Adam_Mickiewicz_
['Adam_Mickiewicz_-_Pan_Tadeuszosloskop_net.txt']
Adam_Wi?niewski_
['Adam_Wi?niewski_-_Snerg_-_Wed?ug_?otra.txt']
Adolf_Nowaczy?ski_
['Adolf_Nowaczy?ski_-_Wielki_Fryderyk.txt']
Agata_Christie_
['Agata_Christie_-_Spotkanie_w_Bagdadzie.txt', 'Agata_Christie_-_Tajemnica_Wawrzynow.txt']
Agent_Fundacji
['Agent_Fundacji.txt']
Ahern_Jerry_
['Ahern_Jerry_-_Krucjata_3_-_Poszukiwanie.txt', 'Ahern_Jerry_-_Krucjata_5_-_Pajecza_siec.txt']
Akta_odessy_Friderick_Forsyth
['Akta_odessy_Friderick_Forsyth.txt']
A

In [254]:
generated = {}
all_generated = []

for author in authors:
    files = authors[author]
    
    # Get raw text as string.
    for file in files:
        text = ''
        with open(training_dir+file) as f:
            text += f.read()
            
    # Build the model.
    text_model = markovify.Text(text)
    # print('\n',author)
    generated_sentences = ''
        
    # Print a randomly-generated sentence
    for i in range(50):
        sentence = str(text_model.make_sentence())
        # print(sentence)
        generated_sentences += sentence
    # print(generated_sentences)
        
    generated[author] = generated_sentences
    all_generated.append(generated_sentences)

In [255]:
from gensim import corpora, models, utils, similarities
from gensim.models import LsiModel
from nltk.tokenize import word_tokenize
import string

In [256]:
texts = [[w.lower() for w in word_tokenize(text) if w not in string.punctuation] for text in all_generated]
# print(texts)

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
# print(dictionary)

In [257]:
# Converting each document into the bag-of-words format
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

corpus = corpora.MmCorpus('/tmp/deerwester.mm')
id2word = corpora.Dictionary.load('/tmp/deerwester.dict')

lsi = models.LsiModel(corpus, id2word=id2word, num_topics=100, chunksize=1)

In [258]:
test_author = 'A_A_Milne'
test_file = test_dir + test_author + '.txt'

with open(test_file) as f:
    test_text = f.read()

In [264]:
vec_bow = dictionary.doc2bow(test_text.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)
    
# index = similarities.MatrixSimilarity(lsi[corpus])
# index.save('/tmp/deerwester.index')
# index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
# sims = index[vec_lsi] # perform a similarity query against the corpus
# result = list(enumerate(sims))
# print(result)
# top_per_doc = sorted(result, key=lambda x: x[1], reverse=True)
# top_per_doc = list(item[0] for item in top_per_doc)

top_per_doc = sorted(vec_lsi, key=lambda x: x[1], reverse=True)
top_per_doc = list(item[0] for item in top_per_doc)

print(top_per_doc)

best_match = list(generated.keys())[top_per_doc[0]]
print("\nMatched author:",best_match)

[(0, 22.92668060663625), (1, 2.6846355218427504), (2, -0.4921776099770541), (3, -3.4317637339707683), (4, 3.0032906594894553), (5, -0.5287377570973137), (6, -1.8406910233823326), (7, -3.1079086364510924), (8, 2.0705832998308047), (9, 3.747855918858471), (10, -0.5305512495477465), (11, -1.2739782466734555), (12, 0.04687543252580676), (13, -2.4689424092133407), (14, 1.5851102075308237), (15, -0.33006878918891003), (16, -1.0962394647311304), (17, -1.8388767719838248), (18, 0.22834254168586762), (19, -1.0497305065719387), (20, 0.2501080870627827), (21, -0.4883630118119687), (22, -0.36109437974817), (23, -0.6656961092351791), (24, -0.1365760484494392), (25, 1.10620706996828), (26, -0.6254683765917782), (27, 0.8239370876075615), (28, 0.5673094339495134), (29, -0.5098182776241748), (30, -1.4456922092706788), (31, -0.4749582534806645), (32, -0.6535086566183084)]
[0, 9, 4, 1, 8, 14, 25, 27, 28, 20, 18, 12, 24, 15, 22, 31, 21, 2, 29, 5, 10, 26, 32, 23, 19, 16, 11, 30, 17, 6, 13, 7, 3]

Matched a

In [268]:
def experiment(test_author):
    test_file = test_dir + test_author + '.txt'

    with open(test_file) as f:
        test_text = f.read()
        
    vec_bow = dictionary.doc2bow(test_text.lower().split())
    vec_lsi = lsi[vec_bow] # convert the query to LSI space
    print(vec_lsi)
    
    top_per_doc = sorted(vec_lsi, key=lambda x: x[1], reverse=True)
    top_per_doc = list(item[0] for item in top_per_doc)

    print(top_per_doc)

    best_match = list(generated.keys())[top_per_doc[0]]
    print("\nMatched author:",best_match)
    return best_match

In [269]:
experiment('A_A_Milne')

[(0, 22.92668060663625), (1, 2.6846355218427504), (2, -0.4921776099770541), (3, -3.4317637339707683), (4, 3.0032906594894553), (5, -0.5287377570973137), (6, -1.8406910233823326), (7, -3.1079086364510924), (8, 2.0705832998308047), (9, 3.747855918858471), (10, -0.5305512495477465), (11, -1.2739782466734555), (12, 0.04687543252580676), (13, -2.4689424092133407), (14, 1.5851102075308237), (15, -0.33006878918891003), (16, -1.0962394647311304), (17, -1.8388767719838248), (18, 0.22834254168586762), (19, -1.0497305065719387), (20, 0.2501080870627827), (21, -0.4883630118119687), (22, -0.36109437974817), (23, -0.6656961092351791), (24, -0.1365760484494392), (25, 1.10620706996828), (26, -0.6254683765917782), (27, 0.8239370876075615), (28, 0.5673094339495134), (29, -0.5098182776241748), (30, -1.4456922092706788), (31, -0.4749582534806645), (32, -0.6535086566183084)]
[0, 9, 4, 1, 8, 14, 25, 27, 28, 20, 18, 12, 24, 15, 22, 31, 21, 2, 29, 5, 10, 26, 32, 23, 19, 16, 11, 30, 17, 6, 13, 7, 3]

Matched a

'A_A_Milne_'