In [1]:
import gensim
import en_core_web_md

nlp = en_core_web_md.load()
# nlp.max_length=20**6

# def preprocess(in_filename, out_filename):
#     with open(in_filename, "r") as in_f:
#         with open(out_filename, "w") as out_f:
#             paragraph = []
#             for line in in_f.readlines():
#                 if line == "\n":
#                     print (len(paragraph))
#                     print (paragraph)
#                     text = nlp("".join(paragraph))
#                     for sent in text.sents:
#                         out_f.write("{}\n".format(sent.text))
#                     paragraph = []
#                 else:    
#                     paragraph.append(line)
    #converts file to 1 line = 1 sentence
# preprocess("capitalcritiqueo00marx_djvu.txt", "capital_processed.txt")
class MyText(object):
    def __init__(self, filename):
        self.filename = filename
    
    def __iter__(self):
        with open(self.filename, "r") as f:
            paragraph = []
            for line in f.readlines():
                if line == "\n" and len(paragraph) > 2:
                    text = nlp("".join(paragraph))
                    tokens = []
                    for token in text:
                        if token.pos_ in ("PROPN", "NOUN", "VERB") and token.is_stop == False and len(token.text) > 3:
                            tokens.append("{}_{}".format(token.lemma_, token.pos_))
                    yield tokens
                    paragraph = []
                else:    
                    paragraph.append(line)
            

data = MyText("capitalcritiqueo00marx_djvu.txt")

In [2]:
model = gensim.models.Word2Vec(data, size=70, min_count=7, iter=15, workers=10)

In [3]:
print("The vocabulary contains {} tokens.".format(len(model.wv.vocab)))

The vocabulary contains 1469 tokens.


In [4]:
model.wv.vocab.keys()

dict_keys(['prefaces_PROPN', 'share_NOUN', 'work_NOUN', 'bear_VERB', 'whole_NOUN', 'edition_NOUN', 'be_VERB', 'make_VERB', 'basis_NOUN', 'prepare_VERB', 'note_NOUN', 'leave_VERB', 'author_NOUN', 'indicate_VERB', 'passage_NOUN', 'replace_VERB', 'designate_VERB', 'text_NOUN', 'publish_VERB', 'alteration_NOUN', 'effect_VERB', 'coincide_VERB', 'change_NOUN', 'prescribe_VERB', 'marx_PROPN', 'english_PROPN', 'translation_NOUN', 'year_NOUN', 'america_PROPN', 'want_NOUN', 'place_VERB', 'disposal_NOUN', 'friend_NOUN', 'consider_VERB', 'liberty_NOUN', 'case_NOUN', 'help_VERB', 'difficulty_NOUN', 'refer_VERB', 'what_NOUN', 'something_NOUN', 'import_NOUN', 'sacrifice_VERB', 'could_VERB', 'term_NOUN', 'sense_NOUN', 'have_VERB', 'life_NOUN', 'political_PROPN', 'economy_PROPN', 'aspect_NOUN', 'science_NOUN', 'involve_VERB', 'revolution_NOUN', 'show_VERB', 'change_VERB', 'will_VERB', 'find_VERB', 'go_VERB', 'series_NOUN', 'name_NOUN', 'take_VERB', 'operate_VERB', 'fail_VERB', 'do_VERB', 'confine_VERB'

In [5]:
print(model.wv.most_similar(positive=["labour_NOUN"], topn=5))
print("=======")
print(model.wv.most_similar(positive=["exploitation_NOUN"], topn=5))
print("=======")
print(model.wv.most_similar(positive=["wealth_PROPN"], topn=5))
print("=======")
print(model.wv.most_similar(positive=["scotland_PROPN"], topn=5))
print("=======")
print(model.wv.most_similar(positive=["economy_PROPN"], topn=5))

[('bour_PROPN', 0.9236874580383301), ('create_VERB', 0.923599362373352), ('productiveness_NOUN', 0.8998245000839233), ('consume_VERB', 0.8980896472930908), ('expend_VERB', 0.8903379440307617)]
[('degree_NOUN', 0.9745671153068542), ('live_VERB', 0.9568015336990356), ('shorten_VERB', 0.9548390507698059), ('extension_NOUN', 0.94846111536026), ('supply_VERB', 0.9375481605529785)]
[('essay_PROPN', 0.9977242946624756), ('torrens_PROPN', 0.992271900177002), ('section_NOUN', 0.9650440216064453), ('science_NOUN', 0.9487770199775696), ('paris_PROPN', 0.9472794532775879)]
[('journeyman_NOUN', 0.9935593008995056), ('committee_PROPN', 0.9933854341506958), ('inspector_PROPN', 0.9923052191734314), ('petition_NOUN', 0.99211186170578), ('parliament_PROPN', 0.9920125007629395)]
[('political_PROPN', 0.9769187569618225), ('trade_PROPN', 0.9336150884628296), ('history_NOUN', 0.9311421513557434), ('employment_PROPN', 0.9309288263320923), ('commission_PROPN', 0.9265061020851135)]


In [6]:
model.train(data, total_words=len(model.wv.vocab.keys()), epochs=10)

(502256, 746670)

In [7]:
print(model.wv.most_similar(positive=["labour_NOUN"], topn=5))
print(model.wv.most_similar(positive=["manufacture_VERB"], topn=5))
print(model.wv.most_similar(positive=["wealth_PROPN"], topn=5))
print(model.wv.most_similar(positive=["scotland_PROPN"], topn=5))
print(model.wv.most_similar(positive=["economy_PROPN"], topn=5))

[('create_VERB', 0.9687044620513916), ('expenditure_NOUN', 0.9418447613716125), ('bour_PROPN', 0.9412345886230469), ('count_VERB', 0.9363717436790466), ('abstract_NOUN', 0.9308106899261475)]
[('specialise_VERB', 0.9899737238883972), ('assign_VERB', 0.986685574054718), ('revolutionise_VERB', 0.9860734939575195), ('play_NOUN', 0.9854501485824585), ('guild_NOUN', 0.983850359916687)]
[('essay_PROPN', 0.9978162050247192), ('torrens_PROPN', 0.9849659204483032), ('theory_NOUN', 0.9560229778289795), ('evil_NOUN', 0.9475080966949463), ('mill_PROPN', 0.944904625415802)]
[('committee_PROPN', 0.9935905933380127), ('march_PROPN', 0.993569016456604), ('journeyman_NOUN', 0.9933037161827087), ('inspector_PROPN', 0.992128312587738), ('petition_NOUN', 0.9919497966766357)]
[('political_PROPN', 0.9699495434761047), ('science_NOUN', 0.9534783959388733), ('theory_NOUN', 0.9352896213531494), ('history_NOUN', 0.923443615436554), ('volume_NOUN', 0.9141945838928223)]


In [9]:
print(model.wv.most_similar(positive=["export_NOUN"], topn=5))

[('copper_NOUN', 0.9791479110717773), ('bullion_NOUN', 0.9712123870849609), ('import_NOUN', 0.9691593647003174), ('nation_NOUN', 0.9592657685279846), ('coin_VERB', 0.9543654918670654)]


In [31]:
print(model.wv.most_similar(positive=["country_NOUN"], negative=["wealth_NOUN"], topn=5))

[('factory_PROPN', 0.821958601474762), ('district_NOUN', 0.8198416233062744), ('manufacturer_NOUN', 0.8103755712509155), ('england_PROPN', 0.8100759983062744), ('factory_NOUN', 0.7820419073104858)]


In [12]:
print(model.wv.most_similar(positive=["live_VERB"], negative=["shorten_VERB"], topn=5))

[('system_NOUN', 0.7150136232376099), ('factory_NOUN', 0.6234575510025024), ('factory_PROPN', 0.5868390798568726), ('england_PROPN', 0.5609627962112427), ('century_NOUN', 0.5449818968772888)]
