In [None]:
import math
import operator

import gensim
import numpy as np
import spacy

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import TfidfModel

In [None]:
# Load the English spaCy pipeline used by clean_texts.
# The bare "en" shortcut only resolves on spaCy 2.x installs that created the
# link; spaCy 3 removed shortcuts and raises OSError for it. Try the legacy
# name first so existing setups keep the exact model they had, then fall back
# to the full package name.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

In [None]:
def clean_texts(texts):
    """Lemmatise and filter ``texts``, then return all kept tokens as one flat list.

    Every item of ``texts`` is lower-cased and run through the module-level
    spaCy pipeline ``nlp``. A token is dropped when it is a newline, a stop
    word, punctuation or number-like; the stripped lemma of a surviving token
    is dropped when it is empty, one of ``=``, ``>``, ``<``, or contained in
    ``stop_words``. A gensim ``Phrases`` model is then fitted on the per-item
    token lists and applied to each of them before flattening.

    Parameters
    ----------
    texts : iterable of str
        Strings to clean; each one is processed independently.

    Returns
    -------
    list of str
        The phrase-merged tokens of all items, concatenated in input order
        (item boundaries are not kept).

    Notes
    -----
    ``stop_words`` is read as a global and is not defined in the cells shown
    here -- presumably an extra stop-word collection supplied by the caller's
    session; verify before use.
    """
    rejected_lemmas = ('', '=', '>', '<')

    token_lists = []
    for raw in texts:
        kept = []
        for tok in nlp(raw.lower()):
            # skip newlines, stop words, punctuation and numbers
            if (tok.text == '\n' or tok.is_stop or tok.is_punct
                    or tok.like_num or tok.text == 'I'):
                continue
            # keep the lemmatised form of the token
            lemma = tok.lemma_.strip()
            if lemma in rejected_lemmas or lemma in stop_words:
                continue
            kept.append(lemma)
        token_lists.append(kept)

    # learn collocations on this batch and merge them into single tokens
    phraser = gensim.models.Phrases(token_lists)
    merged_lists = [phraser[tokens] for tokens in token_lists]

    return [term for tokens in merged_lists for term in tokens]

In [None]:
def text_summaries(texts):
    """Print summary statistics for a collection of documents and return them cleaned.

    Prints the number of documents, the mean ``len()`` of a document, the five
    highest tf-idf scores, the twenty most frequent cleaned words and a
    two-topic LDA model fitted on the cleaned documents.

    Parameters
    ----------
    texts : sequence
        Documents to summarise. Each element is handed to ``clean_texts``,
        which iterates over it and feeds every item to spaCy, so a document is
        presumably a list of strings (e.g. sentences) -- TODO confirm against
        callers.

    Returns
    -------
    list of list of str
        One flat list of cleaned tokens per input document. An empty list is
        returned (after printing the document count) when ``texts`` is empty.
    """
    total_docs = len(texts)
    if total_docs == 0:
        # Nothing to average or to fit a model on; the mean below would
        # otherwise divide by zero.
        print("Total number of documents: " + str(total_docs))
        return []

    avg_len = sum(len(text) for text in texts) / total_docs

    cleaned_texts = [clean_texts(text) for text in texts]

    # Raw frequency of every cleaned token across all documents.
    word_use = {}
    for text in cleaned_texts:
        for word in text:
            word_use[word] = word_use.get(word, 0) + 1

    # Sort ascending and then reverse (rather than reverse=True) so that tied
    # entries keep the order they have always been printed in.
    sorted_words = sorted(word_use.items(), key=lambda item: item[1])
    sorted_words.reverse()

    dictionary = Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]
    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary, passes=10, iterations=500)

    tf_idf_model = TfidfModel(corpus)
    tf_idf_texts = tf_idf_model[corpus]

    # NOTE(review): a word that occurs in several documents keeps only the
    # score from the last document it appears in -- confirm this is intended
    # rather than e.g. the maximum score per word.
    tf_idf_by_word = {
        dictionary.get(token_id): score
        for doc in tf_idf_texts
        for token_id, score in doc
    }
    sorted_d = sorted(tf_idf_by_word.items(), key=lambda item: item[1])
    sorted_d.reverse()

    print("Total number of documents: " + str(total_docs))
    print("Average length of text: " + str(avg_len))
    print(" ")
    print("The top 5 tf-idf scores:")
    print(sorted_d[0:5])
    print(" ")
    print("The top 20 most used words:")
    print(sorted_words[0:20])
    print(" ")
    print("Topic Model with 2 topics")
    print(ldamodel.print_topics())
    return cleaned_texts
    

In [None]:
def joined_texts(texts):
    """Turn each sequence of strings in ``texts`` into one space-separated string.

    Returns a list with one joined string per element of ``texts``, in the
    same order.
    """
    return [' '.join(tokens) for tokens in texts]

In [None]:
# to use with embedding model
def sentence_entropy(sentence, model):
    """Return the variance of cosine similarities between consecutive word vectors.

    Each word of ``sentence`` is looked up with ``model[word]``; words for
    which the lookup raises ``KeyError`` are skipped. The cosine similarity of
    every pair of neighbouring vectors is computed and the population variance
    of those similarities is returned.

    Parameters
    ----------
    sentence : sequence of str
        Words of the sentence, in order (presumably already tokenised --
        a plain string would be iterated character by character).
    model : mapping-like
        Anything supporting ``model[word]`` that raises ``KeyError`` for
        unknown words, e.g. a word-embedding lookup.

    Returns
    -------
    numpy.float64 or None
        The variance, or ``None`` when the sentence has fewer than two words
        or fewer than two of its words are known to ``model`` (there is then
        no neighbouring pair to compare).
    """
    if len(sentence) < 2:
        return None

    vectors = []
    for word in sentence:
        try:
            vectors.append(model[word])
        except KeyError:
            # out-of-vocabulary word: leave it out of the comparison chain
            continue

    # Without this guard an all-unknown (or single-known-word) sentence ended
    # in a division by zero below.
    if len(vectors) < 2:
        return None

    similarities = [
        np.dot(nxt, cur) / (np.linalg.norm(nxt) * np.linalg.norm(cur))
        for cur, nxt in zip(vectors, vectors[1:])
    ]
    # Population variance (ddof=0): mean squared deviation from the mean.
    return np.var(similarities)

In [None]:
# to use with BERT like language model
def get_sentence_perplexity(sentence, model, tokenizer):
    """Score ``sentence`` as exp(cross-entropy) of the model's logits against its own token ids.

    The sentence is tokenised with ``tokenizer.tokenize``, converted to ids
    with ``tokenizer.convert_tokens_to_ids`` and passed to ``model`` as a
    batch of one. The first element of the model output is compared with the
    input ids using ``torch.nn.CrossEntropyLoss`` and the exponential of that
    loss is returned.

    Parameters
    ----------
    sentence : str
        Text to score.
    model : callable
        Called as ``model(tensor_input)``; ``output[0]`` is assumed to be
        logits of shape (1, seq_len, vocab_size) -- TODO confirm for the
        model in use.
    tokenizer : object
        Must provide ``tokenize`` and ``convert_tokens_to_ids``.

    Returns
    -------
    float
        ``math.exp`` of the mean cross-entropy over the sentence's tokens.
    """
    tokens = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    # Inference only: skip autograd bookkeeping, as gpt_ppl_score does.
    with torch.no_grad():
        predictions = model(tensor_input)

    # Drop only the batch axis. A bare squeeze() would also collapse the
    # sequence axis when the sentence is a single token.
    logits = predictions[0].squeeze(0)
    targets = tensor_input.squeeze(0)

    loss = torch.nn.CrossEntropyLoss()(logits, targets)
    return math.exp(loss.item())

In [None]:
# to use with GPT like model
def gpt_ppl_score(sentence, model, tokenizer, stride=512):
    """Sliding-window perplexity of ``sentence`` under a causal language model.

    The sentence is encoded with ``tokenizer(sentence, return_tensors='pt')``
    and walked in steps of ``stride`` tokens. Each window ends at the current
    step's end and reaches back at most ``model.config.n_positions`` tokens;
    only the tokens of the current step are scored (earlier ones get the label
    ``-100``). ``outputs[0]`` of every window is multiplied by the number of
    scored tokens, the products are summed, divided by the end position of the
    last window and exponentiated.

    Parameters
    ----------
    sentence : str
        Text to score.
    model : callable
        Called as ``model(input_ids, labels=...)``; must expose
        ``config.n_positions``. ``outputs[0]`` is presumably the mean loss
        over the unmasked labels -- confirm for the model in use.
    tokenizer : callable
        Must return an object with an ``input_ids`` tensor of shape
        (1, seq_len).
    stride : int, optional
        Number of new tokens scored per window (default 512).

    Returns
    -------
    torch.Tensor
        Zero-dimensional tensor holding the perplexity.
    """
    encodings = tokenizer(sentence, return_tensors='pt')
    context_size = model.config.n_positions
    token_count = encodings.input_ids.size(1)

    window_scores = []
    for start in tqdm(range(0, token_count, stride)):
        window_begin = max(start + stride - context_size, 0)
        window_end = min(start + stride, token_count)
        # shorter than stride on the final window
        scored = window_end - start

        window_ids = encodings.input_ids[:, window_begin:window_end]
        labels = window_ids.clone()
        # context-only tokens are excluded from the loss
        labels[:, :-scored] = -100

        with torch.no_grad():
            result = model(window_ids, labels=labels)
            weighted = result[0] * scored

        window_scores.append(weighted)

    total = torch.stack(window_scores).sum()
    return torch.exp(total / window_end)