In [1]:
import pandas as pd
import time
import re
import numpy as np
import nltk
import unicodedata
import itertools
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import SnowballStemmer
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors

# Leyendo datos

In [2]:
#path_data = "../Fulldata/clear_labeled_data"
tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
stemmer = SnowballStemmer('spanish')
#df = pd.read_csv(path_data, sep='\t', index_col=0)

In [3]:
path_old_data = "../notas/Uurgencias"
path_new_data = "../notas/Ualergias"
data = pd.concat([pd.read_csv(path_old_data, sep='\t', index_col=0), pd.read_csv(path_new_data, sep='\t', index_col=0)]).reset_index(drop=True)

# Función para obtencion de sentences

In [4]:
# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, stemmer=False):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    #sentences = []
    #for raw_sentence in raw_sentences:
    #    # If a sentence is empty, skip it
    #    if len(raw_sentence) > 0:
    #        # Otherwise, call review_to_wordlist to get a list of words
    #        sentences.append(review_to_wordlist(raw_sentence))
    #
    if stemmer:
        sentences = [review_to_wordlist(raw_sentence, stemmer) for raw_sentence in raw_sentences if len(raw_sentence) > 0]
    else:
        sentences = [review_to_wordlist(raw_sentence) for raw_sentence in raw_sentences if len(raw_sentence) > 0]
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

# Función para el procesamiento de las sentences

In [5]:
def review_to_wordlist(raw_review, stemmer=False):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove accent marks
    review_text = ''.join((c for c in unicodedata.normalize('NFD',str(raw_review)) if unicodedata.category(c) != 'Mn'))
    #
    # 2. Remove non-letters
    #letters_only = re.sub("[^A-Za-z0-9]", " ", review_text) 
    letters_only = re.sub("[^\w\d]", " ", review_text) 
    #
    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 3. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("spanish"))                  
    # 
    # 4. Remove stop words and apply or not stemming
    if stemmer:
        meaningful_words = [stemmer.stem(w) for w in words if not w in stops]
    else:
        meaningful_words = [w for w in words if not w in stops]
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    #return( " ".join( meaningful_words ))
    return meaningful_words

# Limpiando todo el texto

In [6]:
start = time.time() # Start time
sentences = []
stem_sentences = []
for review in data.text:
    sentences += review_to_sentences(review, tokenizer)
    stem_sentences += review_to_sentences(review, tokenizer, stemmer)
end = time.time()
elapsed = end - start
print("Finished, Time taken: ", elapsed, "seconds.")

Finished, Time taken:  2803.5532009601593 seconds.


In [7]:
len(sentences), len(stem_sentences) #(4776079, 4776079) (4855308, 4855308)

(4855308, 4855308)

# Generando el Modelo de W2V

In [8]:
num_features = 300    # Word vector dimensionality                      
min_word_count = 5   # Minimum word count                        
num_workers = 8       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
print ("Training model...")
start = time.time() # Start time
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, sg=1)
stem_model = word2vec.Word2Vec(stem_sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, sg=1)
end = time.time()
elapsed = end - start
model.init_sims(replace=True)
stem_model.init_sims(replace=True)
print("Finished, Time taken: ", elapsed, "seconds.")

Training model...
Finished, Time taken:  789.9282767772675 seconds.


In [9]:
len(model.wv.index2word), len(stem_model.wv.index2word) # (52497, 38274)

(52497, 38274)

In [17]:
path_W2V_stem = "../W2V/hufa_stem-300-5w-10n-skip.bin"
path_W2V_nost = "../W2V/hufa_nost-300-5w-10n-skip.bin"
stem_model.wv.save_word2vec_format(path_W2V_stem, binary=True)
model.wv.save_word2vec_format(path_W2V_nost, binary=True)