In [1]:
from __future__ import print_function
import argparse
import pprint
import gensim

from glove import Glove
from glove import Corpus


def read_corpus(filename):

    delchars = [chr(c) for c in range(256)]
    delchars = [x for x in delchars if not x.isalnum()]
    delchars.remove(' ')
    delchars = ''.join(delchars)

    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.lower().translate(None, delchars).split(' ')


def read_wikipedia_corpus(filename):

    # We don't want to do a dictionary construction pass.
    corpus = gensim.corpora.WikiCorpus(filename, dictionary={})

    for text in corpus.get_texts():
        yield text

In [2]:
import urllib.request
import os
import tarfile

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded: ', result)

# 解壓縮檔案
# Open for reading with gzip compression
if not os.path.exists("data/aclImdb"):
    tfile = tarfile.open("aclImdb_v1.tar.gz", 'r:gz')
    result = tfile.extractall('data/')
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

# Regular expression operations
# Compile a regular expression pattern into a regular expression object
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
    
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    
    print('read ', filetype, ' files: ', len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
    
    return all_labels, all_texts

y_train, train_text = read_files("train")
y_test, test_text = read_files("test")
print(train_text[0])
print(y_train[0])
print(train_text[12501])
print(y_train[12501])
print(test_text[0])
print(y_test[0])
print(test_text[12501])
print(y_test[12501])


read  train  files:  25000
read  test  files:  25000
An intriguingly bold film weaves the seemingly effortless camerawork with some superb casting and an explosive soundtrack to plot the damaging effects of the crime and corruption of the Santiago underworld on 2 naive young brothers from the southern city of Temuco.Film debutant Daniella Rios is the seductive erotic dancer Gracia, working in the nightclub owned by the face of the new mini-wave in Chilean film production, Alejandro Trejo. The elder brother, played maturely by Nestor Cantillana, is easily convinced to become Trejo's lead henchman, after a night at the stripclub to celebrate younger brother Victor's (Juan Pablo Miranda) seventeenth birthday. From the establishing shot of this opening scene, the film explodes into neo-noir exploration of everything the outside world doesn't usually expect to see in this country so stereotypically conservative and catholic.Gracia's charms of seduction attract the three men like bees to hon

In [3]:
a=""
for i in range(len(train_text)):
    for j in range(len(train_text[i])):
        _tmp=str(train_text[i][j])
        if(_tmp==" "):
            a+=" "
        elif(str(train_text[i][j]).isalpha()==True):
            a+=train_text[i][j]
a=a.lower()
a=a.split( )
try:
    a=a.remove([])
except:
    pass

In [4]:


# Set up command line parameters.
parser = argparse.ArgumentParser(description='Fit a GloVe model.')

parser.add_argument('--create', '-c', action='store',
                    default=True,
                    help=('The filename of the corpus to pre-process. '
                          'The pre-processed corpus will be saved '
                          'and will be ready for training.'))
parser.add_argument('-wiki', '-w', action='store_true',
                    default=False,
                    help=('Assume the corpus input file is in the '
                          'Wikipedia dump format'))
parser.add_argument('--train', '-t', action='store',
                    default=10,
                    help=('Train the GloVe model with this number of epochs.'
                          'If not supplied, '
                          'We\'ll attempt to load a trained model'))
parser.add_argument('--parallelism', '-p', action='store',
                    default=1,
                    help=('Number of parallel threads to use for training'))
parser.add_argument('--query', '-q', action='store',
                    default='torch',
                    help='Get closes words to this word.')
args = parser.parse_args(args=[])

In [8]:
print(a[1])

intriguingly


In [38]:
if args.create:
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    if args.wiki:
        print('Using wikipedia corpus')
        get_data = read_wikipedia_corpus
    else:
        get_data = read_corpus

    corpus_model = Corpus()
    corpus_model.fit(a, window=10)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    
if args.train:
    # Train the GloVe model and save it to disk.

    if not args.create:
        # Try to load a corpus from disk.
        print('Reading corpus statistics')
        corpus_model = Corpus.load('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(args.train),
              no_threads=args.parallelism, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('glove.model')

Training the GloVe model
Performing 10 training epochs with 1 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


In [39]:
if args.query:
    # Finally, query the model for most similar words.
    if not args.train:
        print('Loading pre-trained GloVe model')
        glove = Glove.load('glove.model')

    print('Querying for %s' % args.query)
    pprint.pprint(glove.most_similar(args.query, number=10))

Querying for torch
[('shrink', 0.9528697041474521),
 ('patriotism', 0.9448376324079611),
 ('condescending', 0.9405317673280993),
 ('cab', 0.9387947252620756),
 ('punchline', 0.9356652000446415),
 ('clinic', 0.9347968630852115),
 ('slug', 0.9344321221023468),
 ('secure', 0.9341518217061977),
 ('candle', 0.9331772051092638)]


In [41]:
print('Querying for %s' % "cab")
pprint.pprint(glove.most_similar('cab', number=10))

Querying for cab
[('deer', 0.9861586661269642),
 ('clinic', 0.9809775364263449),
 ('pizza', 0.9756774890403555),
 ('traitor', 0.9749497951264646),
 ('poet', 0.973449088800856),
 ('prop', 0.9724361198329827),
 ('homicidal', 0.9708488614931129),
 ('garage', 0.9699048848157713),
 ('chemical', 0.969285674004196)]


In [42]:
print('Querying for %s' % "deer")
pprint.pprint(glove.most_similar('deer', number=10))

Querying for deer
[('cab', 0.9861586661269641),
 ('clinic', 0.9770505381751335),
 ('sour', 0.976533622697208),
 ('coke', 0.9737799186775501),
 ('basketball', 0.9734173713498455),
 ('saint', 0.9688736915361924),
 ('pizza', 0.9674406352218079),
 ('newspaper', 0.9670046977609423),
 ('meteor', 0.9658782701338349)]
