In [None]:
import pandas as pd
import numpy as np
import io
import os
import shutil

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import pdftotext

import re
from sentence_splitter import SentenceSplitter
splitter = SentenceSplitter(language="en")

import string

from typing import Dict
import fitz
import sys

In [None]:
pd.options.mode.chained_assignment = None

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=sys.maxsize)

## 1. Get bookmarks

In [None]:
def get_page_count(filepath):
    """Return the number of pages of the PDF stored at ``filepath``.

    The document is opened with PyMuPDF (``fitz``) and closed again as soon
    as the page count has been read.
    """
    with fitz.open(filepath) as pdf:
        return pdf.page_count

In [None]:
def get_bookmarks(filepath: str) -> Dict[int, list]:
    """Map 0-based page indexes to the bookmark that points at that page.

    Parameters
    ----------
    filepath : str
        Path of the PDF to read.

    Returns
    -------
    dict
        ``{page_index: [level, title]}``, where ``page_index`` is the page
        number reported by the table of contents minus one.

    Notes
    -----
    WARNING! One page can have multiple bookmarks. Only the last bookmark
    listed for a page is kept, because later entries overwrite earlier ones.
    """
    bookmarks = {}
    with fitz.open(filepath) as doc:
        # Each TOC row is [lvl, title, page, ...]; any trailing fields are
        # ignored so that rows with extra fields do not break the unpacking.
        for level, title, page, *_ in doc.get_toc():
            bookmarks[page - 1] = [level, title]
    return bookmarks

In [None]:
def get_all_sections(filepath):
    """Build, for every page, the list of sections the page belongs to.

    Parameters
    ----------
    filepath : str
        Path of the PDF to read.

    Returns
    -------
    dict
        ``{page_index: [[1, title], [2, title], [3, title]]}``. The level-1
        entry is always present (with an empty title before the first
        level-1 bookmark); the level-2 and level-3 entries are present only
        while the page lies inside such a section.

    Notes
    -----
    * A bookmark of level N closes every section deeper than N, so titles of
      a previous chapter's subsections are not carried into the next chapter.
    * Pages bookmarked at level 3 keep their level-1 and level-2 entries.
    * Bookmarks outside levels 1-3 are ignored; the page is treated as if it
      had no bookmark.
    """
    num_pages = get_page_count(filepath)
    dict_sections = get_bookmarks(filepath)

    complete_sections_dict = {}
    # Titles of the currently open section at levels 1, 2 and 3.
    # None means "not inside a section of that level".
    current = ['', None, None]

    for abs_num in range(num_pages):
        if abs_num in dict_sections:
            level, title = dict_sections[abs_num][0], dict_sections[abs_num][1]
            if level in (1, 2, 3):
                current[level - 1] = title
                # Opening a section closes everything nested deeper.
                for deeper in range(level, 3):
                    current[deeper] = None
                # A level-3 bookmark without an enclosing level-2 section
                # still needs a level-2 slot so entries stay ordered by level.
                if level == 3 and current[1] is None:
                    current[1] = ''

        page_sections = [[1, current[0]]]
        if current[1] is not None:
            page_sections.append([2, current[1]])
        if current[2] is not None:
            page_sections.append([3, current[2]])
        complete_sections_dict[abs_num] = page_sections

    return complete_sections_dict

In [None]:
def get_depth(filepath):
    """Return the deepest section level (1, 2 or 3) found in the document.

    The depth is read from the per-page section lists produced by
    ``get_all_sections``: a page with three entries means depth 3, a page
    with two entries means depth 2, anything else means depth 1.
    """
    entry_counts = {len(entries) for entries in get_all_sections(filepath).values()}
    for depth in (3, 2):
        if depth in entry_counts:
            return depth
    return 1

In [None]:
def add_sections_level_1(x):
    """Return the title stored in the first section entry of page ``x``.

    ``x`` is a 0-based page index. The PDF is taken from the notebook-level
    variable ``filepath``, which must be defined before this is called.
    """
    first_entry = get_all_sections(filepath)[x][0]
    return first_entry[1]

In [None]:
def add_sections_level_2(x):
    """Return the level-2 section title of page ``x``, or None if it has none.

    ``x`` is a 0-based page index. The PDF is taken from the notebook-level
    variable ``filepath``, which must be defined before this is called.

    The entry is looked up by its level tag rather than by position, so the
    result is correct for documents of depth 2 and of depth 3, and for pages
    that carry no level-2 entry at all.
    """
    for level, title in get_all_sections(filepath)[x]:
        if level == 2:
            return title
    return None

In [None]:
def add_sections_level_3(x):
    """Return the level-3 section title of page ``x``, or None if it has none.

    ``x`` is a 0-based page index. The PDF is taken from the notebook-level
    variable ``filepath``, which must be defined before this is called.

    The entry is looked up by its level tag rather than by position, so pages
    without a level-3 entry yield None instead of raising an IndexError.
    """
    for level, title in get_all_sections(filepath)[x]:
        if level == 3:
            return title
    return None

### Get real pagination

In [None]:
# Value/numeral pairs in descending order, including the subtractive forms
# (CM, CD, XC, XL, IX, IV), so that a single greedy pass builds the numeral.
num_map = [(1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'), (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'), (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I')]

def num2roman(num):
    """Convert a positive number to an upper-case Roman numeral.

    Parameters
    ----------
    num : int
        Number to convert.

    Returns
    -------
    str
        The Roman numeral, or ``''`` when ``num`` is zero or negative.

    Notes
    -----
    One greedy pass over ``num_map`` is enough because the table is sorted in
    descending order. Any remainder smaller than 1 is dropped, so a
    non-integer input such as 0.5 returns ``''`` instead of looping forever.
    """
    roman = ''
    for value, numeral in num_map:
        while num >= value:
            roman += numeral
            num -= value
    return roman

In [None]:
def get_dict_pages(filepath):
    """Read the PDF page-label rules and index them by their first page.

    Parameters
    ----------
    filepath : str
        Path of the PDF to read.

    Returns
    -------
    dict
        ``{start_page_index: label}`` where ``label`` is either

        * the prefix string, when the rule defines a non-empty prefix, or
        * ``[first_page_number, 'r']`` for Roman numbering (style ``'r'`` or
          ``'R'``), or
        * ``[first_page_number, 'd']`` for every other style.

    Notes
    -----
    Styles other than Roman and decimal fall back to decimal numbering.
    Without that fallback they would leave the label undefined, raising on
    the first rule or silently reusing the previous rule's label.
    """
    with fitz.open(filepath) as doc:
        page_labels = doc.get_page_labels()

    dict_pages = {}
    for item in page_labels:
        if item['prefix'] != '':
            real_page = item['prefix']
        elif item['style'] in ('r', 'R'):
            real_page = [item['firstpagenum'], 'r']
        else:
            real_page = [item['firstpagenum'], 'd']
        dict_pages[item['startpage']] = real_page
    return dict_pages

In [None]:
def translate(filepath):
    """Map every 0-based page index to the page label printed in the book.

    The label rules from ``get_dict_pages`` are applied page by page:

    * a page that starts a rule with a string prefix is labelled with that
      prefix, and the running counter advances by one;
    * a page that starts a numeric rule takes the rule's first page number
      (as a Roman numeral for style ``'r'``) and restarts the counter there;
    * any other page takes the running counter, formatted in the numbering
      mode of the most recent numeric rule.

    Returns a dict ``{page_index: label}`` whose labels are ints or strings.
    """
    num_pages = get_page_count(filepath)
    dict_pages = get_dict_pages(filepath)

    trans_dict = {}
    next_number = 0
    roman_mode = False

    for abs_num in range(num_pages):
        # No rule starts on this page: continue the running numbering.
        if abs_num not in dict_pages:
            trans_dict[abs_num] = num2roman(next_number) if roman_mode else next_number
            next_number += 1
            continue

        rule = dict_pages[abs_num]

        # Prefix rule: the label is the prefix itself.
        if type(rule) == str:
            trans_dict[abs_num] = rule
            next_number += 1
            continue

        # Numeric rule: switch numbering mode and restart the counter.
        roman_mode = rule[1] == 'r'
        first_number = rule[0]
        trans_dict[abs_num] = num2roman(first_number) if roman_mode else first_number
        next_number = first_number + 1

    return trans_dict

In [None]:
def translate_number(x):
    """Return the printed page label of the 0-based page index ``x``.

    The PDF is taken from the notebook-level variable ``filepath``, which
    must be defined before this is called.
    """
    return translate(filepath)[x]

In [None]:
#not needed
def get_list_real_pages(filepath):
    """Return the printed page labels of the PDF, in page order, as a list."""
    return [label for label in translate(filepath).values()]

## 2. Create dataframes

In [None]:
#returns a list of lists that have as first element the content of one page and as the second element its page number (starting from 0)
def process_pages(name):
    """Extract the text of ``books/processed/<name>.pdf`` page by page.

    Parameters
    ----------
    name : str
        File name of the PDF without the ``.pdf`` extension.

    Returns
    -------
    list
        One ``[page_content, page_index]`` pair per page, where
        ``page_content`` is the page text with one sentence per line and
        ``page_index`` starts at 0.
    """
    path = "books/processed/" + name + ".pdf"

    # The patterns and the splitter do not depend on the page, so they are
    # built once instead of once per page.
    # recover hyphen-split sentences
    pattern_line_break_1 = re.compile(r"-\n")
    # recover sentences split across lines: a line break not preceded by a
    # period and followed by a lowercase character. A broader variant (upper
    # case, digits, parentheses, quotation marks) lost the first sentence of
    # each page, because it got attached to the page header:
    #pattern_line_break_2 = re.compile(r"(?<![.?¿!¡º])\s*\n(?=\s*[a-zA-Z0-9\(\)\'\"])")
    pattern_line_break_2 = re.compile(r"(?<![.?¿!¡º])\s*\n(?=\s*[a-z])")
    sentence_splitter = SentenceSplitter(language="en")

    list_pages = []
    # The context manager closes the file handle even if extraction fails.
    with open(path, 'rb') as read_file:
        content_per_page = pdftotext.PDF(read_file)

        for page_count, content in enumerate(content_per_page):
            content_processed = pattern_line_break_1.sub("", content)
            content_processed = pattern_line_break_2.sub(" ", content_processed)

            # Split into sentences and store one sentence per line
            split_file = sentence_splitter.split(text=content_processed)
            list_pages.append(["\n".join(split_file), page_count])

    return list_pages

In [None]:
list_cann_pages=process_pages("(Cambridge Textbooks in Linguistics) Ronnie Cann - Formal Semantics_ An Introduction-Cambridge University Press (1993)")

In [None]:
df_cann_pages = pd.DataFrame(list_cann_pages, columns = ['content', 'page_number'])

In [None]:
df_cann_pages.head()

### Add columns with real page numbers and sections

#### Page by page

In [None]:
filepath="books/processed/(Cambridge Textbooks in Linguistics) Ronnie Cann - Formal Semantics_ An Introduction-Cambridge University Press (1993).pdf"

In [None]:
df_cann_pages['real_page_num']=df_cann_pages['page_number'].apply(translate_number)

In [None]:
df_cann_pages['section_level_1']=df_cann_pages['page_number'].apply(add_sections_level_1)

In [None]:
df_cann_pages['section_level_2']=df_cann_pages['page_number'].apply(add_sections_level_2)

In [None]:
df_cann_pages['section_level_3']=df_cann_pages['page_number'].apply(add_sections_level_3)

In [None]:
df_cann_pages.head(50)

In [None]:
def list_lines(df_by_pages):
    """Expand a page-level dataframe into one row per sentence.

    Parameters
    ----------
    df_by_pages : pandas.DataFrame
        Dataframe whose first column holds the page text; every other column
        is page-level metadata.

    Returns
    -------
    list
        One list per sentence: the sentence followed by the metadata of the
        page it comes from. Rows whose first column is not a string (for
        example NaN) are passed through unchanged.

    Notes
    -----
    The metadata columns are copied generically, so the function works for
    any number of columns, not only for six. Sentences are split with the
    notebook-level ``splitter``.
    """
    rows = []
    for item in df_by_pages.values.tolist():
        content, metadata = item[0], item[1:]
        if isinstance(content, str):
            for line in splitter.split(text=content):
                rows.append([line, *metadata])
        else:
            rows.append([content, *metadata])
    return rows

In [None]:
list_lines(df_cann_pages)

In [None]:
df_cann_lines = pd.DataFrame(list_lines(df_cann_pages), columns = df_cann_pages.columns)

In [None]:
df_cann_lines.head()

## Data cleaning

### Drop lines/pages that include the phrase "This page was intentionally left blank"

In [None]:
#df_cann_lines[df_cann_lines.content.str.match("This page intentionally left blank", na=False)]

In [None]:
#df_cann_lines[df_cann_lines.content.str.match("This page was intentionally left blank", na=False)]

In [None]:
df_cann_lines=df_cann_lines[~df_cann_lines.content.str.match("This page intentionally left blank", na=False)]
df_cann_lines=df_cann_lines[~df_cann_lines.content.str.match("This page was intentionally left blank", na=False)]

In [None]:
df_cann_pages=df_cann_pages[~df_cann_pages.content.str.match("This page intentionally left blank", na=False)]
df_cann_pages=df_cann_pages[~df_cann_pages.content.str.match("This page was intentionally left blank", na=False)]

In [None]:
df_cann_pages.info()

In [None]:
df_cann_lines.info()

### Drop null lines/pages

In [None]:
df_cann_pages.content.isnull().sum()

In [None]:
df_cann_lines.content.isnull().sum()

In [None]:
# Drop the rows whose content is missing. Calling dropna(inplace=True) on the
# `content` column only affects that Series and removes no rows from the
# dataframe, so the frames are filtered and reassigned instead.
df_cann_lines = df_cann_lines.dropna(subset=["content"])
df_cann_pages = df_cann_pages.dropna(subset=["content"])

In [None]:
df_cann_lines.info()

In [None]:
df_cann_pages.info()

### Drop lines/pages that contain only numbers (they are page numbers, numbers of sections, etc), only punctuation or are empty (with whitespace character or not)

In [None]:
#finds lines that contain only numbers 
#PROBLEM: needed for index???
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
# na=False keeps the mask boolean when a row has missing content.
df_cann_lines[df_cann_lines.content.str.match(r"^[^a-zA-Z]*\d+[^a-zA-Z]*$", na=False)]

In [None]:
#finds lines that are empty (they don't have a whitespace character)
df_cann_lines[df_cann_lines.content.str.match("^$")]

In [None]:
#this finds the lines that are just punctuation
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
# na=False keeps the mask boolean when a row has missing content.
df_cann_lines[df_cann_lines.content.str.match(r"^[^\w]+$", na=False)]

In [None]:
# Drop rows that contain digits but no letters (page numbers, section numbers...).
# Raw string avoids the invalid "\d" escape; na=False makes rows with missing
# content evaluate to False so that the negated mask stays boolean.
only_numbers = r"^[^a-zA-Z]*\d+[^a-zA-Z]*$"
df_cann_lines = df_cann_lines[~df_cann_lines.content.str.match(only_numbers, na=False)]
df_cann_pages = df_cann_pages[~df_cann_pages.content.str.match(only_numbers, na=False)]

In [None]:
# Drop rows made only of non-word characters (punctuation, whitespace).
# Raw string avoids the invalid "\w" escape; na=False makes rows with missing
# content evaluate to False so that the negated mask stays boolean.
only_punctuation = r"^[^\w]+$"
df_cann_lines = df_cann_lines[~df_cann_lines.content.str.match(only_punctuation, na=False)]
df_cann_pages = df_cann_pages[~df_cann_pages.content.str.match(only_punctuation, na=False)]

In [None]:
# Drop empty rows. na=False makes rows with missing content evaluate to False
# so that the negated mask stays boolean.
df_cann_lines = df_cann_lines[~df_cann_lines.content.str.match(r"^$", na=False)]
df_cann_pages = df_cann_pages[~df_cann_pages.content.str.match(r"^$", na=False)]

In [None]:
df_cann_lines.reset_index(drop=True, inplace=True)
df_cann_pages.reset_index(drop=True, inplace=True)

### Split dataset into 3: ToC, Body of text, index

In [None]:
df_cann_lines.to_csv("cann_info_lines.csv", encoding = 'utf-8')

In [None]:
df_cann_pages.to_csv("cann_info_pages.csv", encoding = 'utf-8')

In [None]:
path = 'cann_info_pages.csv'
df_cann_pages = pd.read_csv(path,index_col=0)
df_cann_pages.head()

In [None]:
path = 'cann_info_lines.csv'
df_cann_lines = pd.read_csv(path,index_col=0)
df_cann_lines.head()

In [None]:
df_cann_pages.section_level_2.isnull().sum()

In [None]:
# Replace missing section titles with a placeholder. The columns are filled
# and assigned back, because an in-place fillna on a column accessed through
# the dataframe is not guaranteed to modify the dataframe itself.
section_columns = ["section_level_1", "section_level_2", "section_level_3"]
df_cann_pages[section_columns] = df_cann_pages[section_columns].fillna("No info")
df_cann_lines[section_columns] = df_cann_lines[section_columns].fillna("No info")

In [None]:
df_cann_pages.section_level_2.isnull().sum()

In [None]:
# TODO: remove references (maybe don't do it for compendiums)

#toc
# Rows whose level-1 section title starts with "CONTENTS" or "Contents"
# (str.match anchors the pattern at the beginning of the string).
df_cann_pages_toc=df_cann_pages[df_cann_pages.section_level_1.str.match("CONTENTS|Contents")]#generalise more
df_cann_lines_toc=df_cann_lines[df_cann_lines.section_level_1.str.match("CONTENTS|Contents")]

#index
# Rows whose level-1 section title starts with "INDEX" or "Index".
df_cann_pages_index=df_cann_pages[df_cann_pages.section_level_1.str.match("INDEX|Index")]#generalise more
df_cann_lines_index=df_cann_lines[df_cann_lines.section_level_1.str.match("INDEX|Index")]

In [None]:
#body: probably needs to be done manually
# Keep the pages with index 19 to 340, both bounds included.
df_cann_pages_body = df_cann_pages[df_cann_pages.page_number.between(19, 340)]
df_cann_lines_body = df_cann_lines[df_cann_lines.page_number.between(19, 340)]

In [None]:
# Renumber the rows of every subset from 0. The last line resets the index
# subset of the line-level data; previously the body subset was reset twice
# and `df_cann_lines_index` was left untouched.
df_cann_pages_toc = df_cann_pages_toc.reset_index(drop=True)
df_cann_lines_toc = df_cann_lines_toc.reset_index(drop=True)
df_cann_pages_body = df_cann_pages_body.reset_index(drop=True)
df_cann_lines_body = df_cann_lines_body.reset_index(drop=True)
df_cann_pages_index = df_cann_pages_index.reset_index(drop=True)
df_cann_lines_index = df_cann_lines_index.reset_index(drop=True)

## Text preprocessing

### STEPS: 
1. Remove stopwords
2. Remove all numbers
3. Stem
4. Look for phrases (use gensim phraser)
5. Do wordcloud

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
porter = LancasterStemmer()
lemmatizer = WordNetLemmatizer()
from collections import defaultdict

from collections import Counter

import gensim
from gensim import models
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora
from pprint import pprint

In [None]:
#keep updating
textbook_words=['thus', 'today', 'nowadays', 'actually', 'section', 'indeed', 'every', 'any', 'some', 'example', 'therefore', 'definition', 'introduction', 'conclusion', 'chapter', 'appendix', 'otherwise', 'thing', 'rather', 'instead', 'like', 'since', 'given', 'case', 'hence', 'iff', 'see', 'beyond', 'below', 'above', 'postscript', 'index', 'ensure', 'generally', 'anything', 'something', 'other']

In [None]:
def clean_text(text_data):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is tokenised and lower-cased; a token is kept only when it is
    purely alphabetic, longer than two characters, and neither an English
    stopword nor one of the notebook-level ``textbook_words``. The remaining
    tokens are lemmatised and joined with single spaces.

    Parameters
    ----------
    text_data : str
        Text to clean. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) yields an empty string.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    if not isinstance(text_data, str):
        return ""

    # One set gives constant-time membership tests for both word lists.
    excluded = set(stopwords.words("english")).union(textbook_words)

    clean_tokens = []
    for token in word_tokenize(text_data):
        word = token.lower()
        if word.isalpha() and len(word) > 2 and word not in excluded:
            clean_tokens.append(lemmatizer.lemmatize(word))

    return (" ").join(clean_tokens)

In [None]:
df_cann_lines_body["clean_content"] = df_cann_lines_body["content"].apply(lambda x: clean_text(x))
df_cann_lines_toc["clean_content"] = df_cann_lines_toc["content"].apply(lambda x: clean_text(x))
#df_cann_lines_index["clean_content"] = df_cann_lines_index["content"].apply(lambda x: clean_text(x))

In [None]:
df_cann_pages_body["clean_content"] = df_cann_pages_body["content"].apply(lambda x: clean_text(x))
df_cann_pages_toc["clean_content"] = df_cann_pages_toc["content"].apply(lambda x: clean_text(x))

### Filtering by POS to keep only nouns and adjectives as keyword candidates

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def keep_NPs(sent):
    """Keep only the nouns and adjectives of ``sent``.

    The text is tagged with the notebook-level spaCy pipeline ``nlp``. Tokens
    whose fine-grained tag is a noun tag (NN, NNS, NNP, NNPS) or an adjective
    tag (JJ, JJR, JJS) are kept and joined with single spaces.
    """
    wanted_tags = {"NN", "NNP", "NNPS", "NNS", "JJ", "JJR", "JJS"}
    return (" ").join(token.text for token in nlp(sent) if token.tag_ in wanted_tags)

In [None]:
df_cann_lines_body["clean_content"] = df_cann_lines_body["clean_content"].apply(lambda x: keep_NPs(x))
#df_cann_lines_toc["clean_content"] = df_cann_lines_toc["content"].apply(lambda x: keep_NPs(x))

In [None]:
df_cann_pages_body["clean_content"] = df_cann_pages_body["clean_content"].apply(lambda x: keep_NPs(x))
#df_cann_pages_toc["clean_content"] = df_cann_pages_toc["content"].apply(lambda x: keep_NPs(x))

In [None]:
df_cann_lines_body.head()

In [None]:
df_cann_lines_body.to_csv("cann_lines_body_clean.csv", encoding = 'utf-8')

In [None]:
df_cann_pages_body.to_csv("cann_pages_body_clean.csv", encoding = 'utf-8')

In [None]:
path = 'cann_lines_body_clean.csv'
df_cann_lines_body = pd.read_csv(path,index_col=0)
df_cann_lines_body.head()

In [None]:
path = 'cann_pages_body_clean.csv'
df_cann_pages_body = pd.read_csv(path,index_col=0)
df_cann_pages_body.head()

### Using gensim to extract n-grams and most common terms

In [None]:
df_cann_lines_body["clean_content"] = df_cann_lines_body["clean_content"].fillna('')
df_cann_pages_body["clean_content"] = df_cann_pages_body["clean_content"].fillna('')

In [None]:
#Unclear to me what to use as documents -- lines or pages:
#sent = [page.split() for page in df_cann_pages_body["clean_content"]]
sent = [sentence.split() for sentence in df_cann_lines_body["clean_content"]]

In [None]:
# Build the bigram models
bigram = gensim.models.phrases.Phrases(sent, min_count=3, threshold=10)
sentences = bigram[sent]

In [None]:
sentences_list=list(sentences)

In [None]:
mydict = corpora.Dictionary([sent for sent in sentences])

In [None]:
print(mydict)

In [None]:
#same but easier, probably use this. TODO: add it to df (maybe normalise it to 0-1)
mydict.most_common()

In [None]:
# Lookup table from token to its frequency, built from the (token, frequency)
# pairs returned by the dictionary.
map_frequency = dict(mydict.most_common())

In [None]:
def assign_frequency(x):
    """Return the frequency recorded for token ``x`` in ``map_frequency``.

    Raises ``KeyError`` when the token is not in the table.
    """
    frequency = map_frequency[x]
    return frequency

In [None]:
#Next: 
#(1) get score if they are in sections or subsections, DONE
#(2) get score for their position in the sentence, LEAVE IT OUT
#(3) get importance of context score

#Finally,
#(4) do NER and add all names of people, institutions... what else?

#If I finish, calculate the length of index.


### Create a dataframe of all the candidate keywords, with a column consisting of its context and a column consisting of each relevant value

In [None]:
#raw_sent = [sentence for sentence in df_cann_lines_body["content"]]

In [None]:
#len(raw_sent)

In [None]:
def create_candidates_list(sentences):
    """Pair every token with the sentence it occurs in.

    ``sentences`` is a sequence of token lists. The result holds one
    ``[token, context]`` pair per token occurrence, where ``context`` is the
    token's sentence joined with single spaces.
    """
    pairs = []
    for tokens in sentences:
        pairs.extend([token, (' ').join(tokens)] for token in tokens)
    return pairs

In [None]:
candidates_list=create_candidates_list(sentences_list)

In [None]:
candidates_df=pd.DataFrame(candidates_list, columns=['candidate_keyword', 'context'])

In [None]:
candidates_df.head()

#### Add frequency

In [None]:
candidates_df['frequency']=candidates_df.candidate_keyword.apply(assign_frequency)

In [None]:
candidates_df.head()

#### Add appearance in title

In [None]:
# TODO: the ToC still needs to be cleaned
def clean_toc(text_data):
    """Normalise a table-of-contents line into a string of lemmas.

    The text is tokenised and lower-cased, tokens that are not purely
    alphabetic are dropped, and the rest are lemmatised and joined with
    single spaces. No stopwords or short words are removed.
    """
    lowered = (token.lower() for token in word_tokenize(text_data))
    lemmas = [lemmatizer.lemmatize(word) for word in lowered if word.isalpha()]
    return (" ").join(lemmas)

In [None]:
df_cann_lines_toc['content'].apply(clean_toc)

In [None]:
words_toc=[]
for line in df_cann_lines_toc.content.apply(clean_toc):
    words_toc+=line.split()
#print(words_toc)

In [None]:
def is_in_toc(x):
    """Return 1 if the candidate keyword occurs in the table of contents, else 0.

    For a candidate joined with ``_`` only its first two parts are checked,
    and one match is enough. Any other candidate is checked as a whole. The
    lookup uses the notebook-level ``words_toc``.
    """
    parts = x.split("_")[:2] if "_" in x else [x]
    return 1 if any(part in words_toc for part in parts) else 0

In [None]:
candidates_df['is_in_toc']=candidates_df.candidate_keyword.apply(is_in_toc)

In [None]:
candidates_df.head()

### Adding the importance of the context for the whole document

This seems to work! We compute an embedding for each sentence and one for the whole book, and rank the sentences by their cosine similarity to the book embedding.

In [None]:
#!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
doc=(' ').join(candidates_df.context.unique())
candidates=candidates_df.context.unique()

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

top_n = 30
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
keywords

### USELESS STUFF FOR NOW

In [None]:
#do it with tf-idf?: get a tf-idf vector per sentence and compute its cosine similarity with the tf-idf vector of the whole book

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfdif_vectorizer = TfidfVectorizer(ngram_range=(1,1))

#Unclear to me what to use as documents -- sentences or pages
tfidf = tfdif_vectorizer.fit_transform(candidates_df["context"])
#tfidf = tfdif_vectorizer.fit_transform(df_cann_pages_body["clean_content"])

tfidf.shape

In [None]:
tfidf[2].toarray()
#print(tfdif_vectorizer.get_feature_names())

In [None]:
tfdif_vectorizer.vocabulary_

In [None]:
#1 embed sentences using sent2vec
#2 embed whole document using doc2vec
# rank sentence embeddings based on similarity to whole document

In [None]:
#!pip3 install sent2vec
#from sent2vec.vectorizer import Vectorizer

In [None]:
#candidates_df.context.head(5)

In [None]:
#sentences = candidates_df.context.unique()

#vectorizer = Vectorizer()
#vectorizer.bert(sentences)
#vectors = vectorizer.vectors

### Let's try it with word2vec (not pretrained)

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import multiprocessing
from time import time 

from gensim.models import Word2Vec
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores

In [None]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [None]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
w2v_model.init_sims(replace=True)

In [None]:
w2v_model.wv.most_similar(positive=["function"])

### Let's try it with pretrained model from spacy

In [None]:
import spacy

In [None]:
model = spacy.load('en_core_web_sm')#maybe try with bigger models later

In [None]:
for line in candidates_df.context.unique():
    doc = model(line)
    print(line, doc.vector, len(doc.vector))
#    for token in doc:
#        print(token.vector)
#        print(token.text, token.has_vector, token.vector_norm, token.is_oov)

In [None]:
for line in candidates_df.context.unique():
    
    doc = model(line)
    print(line, doc.vector, len(doc.vector))

In [None]:
len(doc.vector)

In [None]:
print(doc[89])
print(doc[3])
doc[1].similarity(doc[3])

In [None]:
mydict = corpora.Dictionary([sent for sent in sentences])
corpus = [mydict.doc2bow(sent) for sent in sentences]

# Show the Word Weights in Corpus
#for doc in corpus:
#    print([[mydict[id], freq] for id, freq in doc])

# Create the TF-IDF model
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# Show the TF-IDF weights
for doc in tfidf[corpus]:
    print([[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])

corpus_tfidf = tfidf[corpus]

In [None]:
#this gives you the words that get highest tfidf score in any sentence
#problem: gives you weird examples that only appear in one place
topWords = {}
for doc in corpus_tfidf:
    for iWord, tf_idf in doc:
        if iWord not in topWords:
            topWords[iWord] = 0

        if tf_idf > topWords[iWord]:
            topWords[iWord] = tf_idf

for i, item in enumerate(sorted(topWords.items(), key=lambda x: x[1], reverse=True), 1):
    print("%2s: %-13s %s" % (i, mydict[item[0]], item[1]))
    if i == 1000: break

In [None]:
#from nltk import ngrams

#sent = [line for line in df_cann_lines["clean_content"]]

#ngram_counts = Counter(ngrams(sent, 2))
#ngram_counts.most_common(10)

In [None]:
#def most_frequent_words(docs, vocabulary, top_words):
#    vocab = Counter()
#    
#    for doc in docs:
#        for word in doc.split(' '):
#            if word in vocabulary.keys():
#                vocab[word] += 1
#    return vocab.most_common(top_words)

In [None]:
#most_frequent_words(df_cann_pages_body["clean_content"], tfdif_vectorizer.vocabulary_, 30)