In [1]:
import pandas as pd
import numpy as np
import io
import os
import shutil

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import pdftotext

import re
from sentence_splitter import SentenceSplitter
# English sentence splitter. `splitter` is not referenced again in the visible part of this notebook.
splitter = SentenceSplitter(language="en")

import string

from typing import Dict
import fitz
import sys

In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
# Shared WordNet lemmatizer instance used by the text-cleaning helpers further down.
lemmatizer = WordNetLemmatizer()
from collections import defaultdict
import nltk
from nltk.util import ngrams

from collections import Counter

import spacy
# spaCy pipeline; used further down to read the `pos_` tag of each candidate token.
nlp = spacy.load('en_core_web_sm')

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Body of the book, split by line; `clean_content` holds the pre-cleaned text.
path = "../data/processed/cann_93/by_line_body.csv"
df_cann_lines_body = pd.read_csv(path, index_col=0)
df_cann_lines_body.head()

Unnamed: 0,content,page_number,real_page_num,section_level_1,section_level_2,section_level_3,clean_content
0,Introduction,19,1,1 Introduction,No info,No info,introduction
1,Semantics and semantic theory,19,1,1 Introduction,No info,No info,semantics semantic theory
2,"In its broadest sense, semantics is the study ...",19,1,1 Introduction,No info,No info,broadest sense semantics study meaning linguis...
3,"It is, however, more usual within linguistics ...",19,1,1 Introduction,No info,No info,however usual within linguistics interpret ter...
4,"In other words, semantics is the study of mean...",19,1,1 Introduction,No info,No info,word semantics study meaning abstracted away a...


In [9]:
# Body of the book, split by page; `clean_content` holds the pre-cleaned text.
path = "../data/processed/cann_93/by_page_body.csv"
df_cann_pages_body = pd.read_csv(path, index_col=0)
df_cann_pages_body.head()

Unnamed: 0,content,page_number,real_page_num,section_level_1,section_level_2,section_level_3,clean_content
0,1\n\nIntroduction\n\n1.1\n\nSemantics and sema...,19,1,1 Introduction,No info,No info,introduction semantics semantic theory broades...
1,1 Introduction semantics over the last two dec...,20,2,1 Introduction,No info,No info,introduction semantics last two decade theory ...
2,Semantics and semantic theory controversial ma...,21,3,1 Introduction,No info,No info,semantics semantic theory controversial matter...
3,1 Introduction the principle (3) minimally req...,22,4,1 Introduction,No info,No info,introduction principle minimally requires mean...
4,Semantics and semantic theory\nOne way in whic...,23,5,1 Introduction,No info,No info,semantics semantic theory one way may achieved...


In [None]:
# Table-of-contents lines (file is read from the current working directory).
path = "cann_lines_toc_clean.csv"
df_cann_lines_toc = pd.read_csv(path, index_col=0)
df_cann_lines_toc.head()

In [None]:
# Back-of-book index lines (file is read from the current working directory).
path = 'cann_lines_index.csv'
df_cann_lines_index = pd.read_csv(path, index_col=0)
df_cann_lines_index.head()

In [None]:
# Bibliography pages (file is read from the current working directory).
path = 'cann_pages_biblio.csv'
df_cann_pages_biblio = pd.read_csv(path, index_col=0)
df_cann_pages_biblio.head()

In [None]:
# Replace missing text with empty strings so the string handling below never meets NaN.
for _frame, _column in (
    (df_cann_lines_body, "clean_content"),
    (df_cann_pages_body, "clean_content"),
    (df_cann_lines_toc, "content"),
    (df_cann_lines_index, "content"),
):
    _frame[_column] = _frame[_column].fillna('')

### 1. Get candidates plus frequencies

In [None]:
# One list of words per cleaned body line.
sentences = [line.split() for line in df_cann_lines_body["clean_content"]]

In [None]:
# Concatenate every cleaned line into a single string. Each line is prefixed
# with one space, which yields exactly the text the previous `+=` loop built,
# but in a single pass instead of repeatedly growing a string.
whole_text = ''.join(' ' + sentence for sentence in df_cann_lines_body["clean_content"])

# Preview only the most frequent bigrams: the full Counter holds one entry per
# distinct bigram of the book and would flood the cell output.
Counter(ngrams(whole_text.split(), 2)).most_common(20)

In [None]:
# Show only the beginning of the text; displaying `whole_text` itself prints the entire book.
whole_text[:500]

In [None]:
# Bigram counts over the whole text, first keyed by (word, word) tuples ...
freq_bigrams_tuples = dict(Counter(ngrams(whole_text.split(), 2)))
# ... then re-keyed by the space-joined bigram. A comprehension is used so no
# loop variable leaks into the notebook namespace: the previous loop bound a
# global called `string`, which shadowed the imported `string` module.
freq_bigrams = {' '.join(pair): count for pair, count in freq_bigrams_tuples.items()}

In [None]:
# Occurrence count of every single word in the whole text.
unigram_counter = Counter(whole_text.split())
freq_unigrams = dict(unigram_counter)

In [None]:
# Single lookup table holding both unigram and bigram counts.
freq_ngrams = {**freq_unigrams, **freq_bigrams}

In [None]:
# Bigrams (as tuples) of each cleaned line, computed line by line.
list_bigrams_lines = [
    list(ngrams(line.split(), 2))
    for line in df_cann_lines_body["clean_content"]
]

In [None]:
# Same per-line bigrams, each rendered as a "word word" string. Written as a
# comprehension so that no global named `string` is created (the previous loop
# shadowed the imported `string` module).
list_bigrams_str = [
    [' '.join(pair) for pair in line_bigrams]
    for line_bigrams in list_bigrams_lines
]
# Preview the first lines only instead of printing every bigram of the book.
list_bigrams_str[:5]

In [None]:
def get_raw_sentences(raw_sent):
    """Normalise a raw sentence into a space-separated string of lemmas.

    The sentence is tokenised, lower-cased, reduced to tokens that are purely
    alphabetic or that start with a ``word-word`` pattern, and each surviving
    token is lemmatised with the notebook-level ``lemmatizer``.
    """
    lemmas = []
    for token in word_tokenize(raw_sent):
        lowered = token.lower()
        if lowered.isalpha() or re.match("[a-z]+-[a-z]+", lowered):
            lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(lemmas)

In [None]:
# Normalised version of every raw body line (stop words are kept here).
raw_sent = [get_raw_sentences(line) for line in df_cann_lines_body["content"]]

In [None]:
# One (bigram strings, clean words, normalised raw sentence) triple per body line.
bigrams_and_contexts = list(zip(list_bigrams_str, sentences, raw_sent))
# Preview a few triples only; the full list repeats the whole book three times.
bigrams_and_contexts[:3]

In [None]:
def create_candidates_list(bigrams_contexts):
    """Build the candidate rows (words and bigrams) for every line.

    Parameters
    ----------
    bigrams_contexts : iterable
        Items indexable as ``(bigram_strings, clean_words, raw_sentence)``.

    Returns
    -------
    list of list
        Rows ``[candidate, clean_words, raw_sentence, tag]`` where ``tag`` is
        ``'NOUN'`` for hyphenated words, the spaCy ``pos_`` for tokens of the
        parsed line, and ``'CHUNK'`` for bigrams.
    """
    hyphenated = re.compile("[a-z]+-[a-z]+")
    candidates = []
    for item in bigrams_contexts:
        bigrams, clean_words, raw_sentence = item[0], item[1], item[2]

        # Hyphenated words are tagged by hand rather than through spaCy.
        compounds = [word for word in clean_words if hyphenated.match(word)]
        for word in compounds:
            candidates.append([word, clean_words, raw_sentence, 'NOUN'])

        # Tag the line with spaCy and walk the parsed tokens exactly once.
        # The previous version re-walked the parsed line for every plain word
        # (reusing the outer loop variable), so each token was emitted as many
        # times as the line had plain words. As before, the parsed tokens are
        # only emitted when the line has at least one non-hyphenated word.
        if len(compounds) < len(clean_words):
            for token in nlp(' '.join(clean_words)):
                candidates.append([str(token), clean_words, raw_sentence, token.pos_])

        for bigram in bigrams:
            candidates.append([bigram, clean_words, raw_sentence, 'CHUNK'])
    return candidates

In [None]:
# Flat list of candidate rows for the whole book.
candidates_list = create_candidates_list(bigrams_and_contexts)

In [None]:
# Candidate table; the feature columns are added to it section by section below.
candidates_df = pd.DataFrame(
    candidates_list,
    columns=["candidate_keyword", "clean_context", "raw_context", "POS"],
)

### 2. Column: frequency

In [None]:
def get_book_length(df_book_pages_body):
    """Return the number of words in the cleaned body of the book.

    Parameters
    ----------
    df_book_pages_body : mapping or DataFrame
        Must expose a ``"clean_content"`` column of strings (one per page).

    Returns
    -------
    int
        Total count of whitespace-separated words over all pages.
    """
    # split() without an argument ignores empty pages and repeated or trailing
    # spaces; split(' ') counted each of those as an extra (empty) word.
    return sum(len(page.split()) for page in df_book_pages_body["clean_content"])

In [None]:
# Total word count of the book body; used as the denominator of the frequency feature.
book_length = get_book_length(df_cann_pages_body)

In [None]:
def assign_frequency(x, mf):
    """Return the relative frequency of candidate ``x``.

    Parameters
    ----------
    x : str
        Candidate keyword.
    mf : dict
        Maps n-gram strings to their raw occurrence count.

    Returns
    -------
    float
        ``mf[x]`` divided by the notebook-level ``book_length``. Candidates
        missing from ``mf`` get 0 instead of raising ``TypeError`` on
        ``None / book_length``.
    """
    return mf.get(x, 0) / book_length

In [None]:
# Relative frequency of each candidate over the whole book.
candidates_df["freq"] = candidates_df["candidate_keyword"].apply(
    lambda keyword: assign_frequency(keyword, freq_ngrams)
)

### 3. Column: is in toc

In [None]:
def clean_toc(text_data):
    """Normalise one table-of-contents line into space-separated lemmas.

    Tokens are lower-cased; only purely alphabetic tokens and tokens starting
    with a ``word-word`` pattern are kept, and each one is lemmatised with the
    notebook-level ``lemmatizer``.
    """
    def keep(token):
        return token.isalpha() or re.match("[a-z]+-[a-z]+", token)

    lowered_tokens = (token.lower() for token in word_tokenize(text_data))
    return " ".join(
        lemmatizer.lemmatize(token) for token in lowered_tokens if keep(token)
    )

In [None]:
# Quick look at the normalised table-of-contents lines.
df_cann_lines_toc.content.apply(clean_toc)

In [None]:
# Every normalised word that occurs in the table of contents (repeats included).
words_toc = [
    word
    for toc_line in df_cann_lines_toc.content.apply(clean_toc)
    for word in toc_line.split()
]

In [None]:
def is_in_toc(x):
    """Return 1 if every word of candidate ``x`` occurs in the table of contents.

    Membership is checked against the notebook-level ``words_toc``.
    Multi-word candidates in this notebook are built by joining words with a
    single space, so the candidate is split on whitespace. The previous
    version only split on ``"_"``, which meant bigrams were looked up as one
    word and always scored 0. ``"_"`` is still accepted as a separator.

    Returns
    -------
    int
        1 when all words are present, 0 otherwise (including for empty input).
    """
    words = x.replace("_", " ").split()
    return int(bool(words) and all(word in words_toc for word in words))

In [None]:
# Flag candidates whose words appear in the table of contents.
candidates_df["is_in_toc"] = candidates_df["candidate_keyword"].apply(is_in_toc)

### 4. Column: Importance

In [None]:
def form_sentence(x):
    """Join a list of words into one space-separated string.

    A value that is already a string is returned unchanged. Without this
    guard, applying the function a second time (for example by re-running the
    cell) would join the individual characters of the sentence with spaces.
    """
    if isinstance(x, str):
        return x
    return ' '.join(x)

# Turn each context from a list of words into a plain sentence string.
candidates_df["clean_context"] = candidates_df["clean_context"].apply(form_sentence)

In [None]:
# Distinct cleaned lines, and the whole book as a single string made of them.
contexts = candidates_df.clean_context.unique()
doc = " ".join(contexts)

# NOTE(review): sentence-transformers models truncate inputs beyond their
# maximum sequence length, so `doc_embedding` presumably reflects only the
# beginning of the book - confirm before relying on `importance`.
model = SentenceTransformer("distilbert-base-nli-mean-tokens")
doc_embedding = model.encode([doc])
context_embeddings = model.encode(contexts)

In [None]:
# Despite the name, these are cosine similarities between the document and each context.
distances = cosine_similarity(X=doc_embedding, Y=context_embeddings)

In [None]:
# Map every distinct context sentence to its similarity with the whole document.
similarities = {
    context: score
    for context, score in zip(candidates_df.clean_context.unique(), distances[0])
}

In [None]:
def assign_similarities(x):
    """Look up the similarity score of context ``x`` in the notebook-level ``similarities``."""
    score = similarities[x]
    return score

In [None]:
# Importance of a candidate = similarity of its context sentence to the whole document.
candidates_df["importance"] = candidates_df["clean_context"].apply(assign_similarities)

### 5. Column: position in sentence

In [None]:
def return_pos(row):
    """Relative position of a candidate inside its raw context.

    Parameters
    ----------
    row : mapping
        Must provide ``'raw_context'`` (space-separated sentence) and
        ``'candidate_keyword'`` (a word or a space-separated n-gram).

    Returns
    -------
    float or int
        Index of the candidate's first word divided by the index of the last
        word of the context (0.0 = first word, 1.0 = last word); 0 for a
        one-word context; NaN when the word does not occur in the raw context,
        where the previous version raised ``ValueError``.
    """
    context_words = row['raw_context'].split(' ')
    if len(context_words) == 1:
        return 0
    # Only the first word of an n-gram is located. For a single word,
    # split(' ')[0] is the word itself, so one code path covers both cases.
    # The old `len(word) > 1` test measured characters, not words, and both
    # of its branches were identical.
    first_word = row['candidate_keyword'].split(' ')[0]
    try:
        return context_words.index(first_word) / (len(context_words) - 1)
    except ValueError:
        return float('nan')

In [None]:
# Row-wise: where in its raw sentence does each candidate start?
candidates_df["position_in_context"] = candidates_df.apply(return_pos, axis=1)

### 6. Column: is a named entity

In [None]:
def clean_list_names(x):
    """Normalise a name string: lower-case it and keep only informative tokens.

    A token survives when it is purely alphabetic or starts with a
    ``word-word`` pattern, and is longer than two characters.
    NOTE: a second function with this same name is defined later in the
    notebook and replaces this one once its cell has run.
    """
    survivors = []
    for token in word_tokenize(x):
        lowered = token.lower()
        if not (lowered.isalpha() or re.match("[a-z]+-[a-z]+", lowered)):
            continue
        if len(lowered) > 2:
            survivors.append(lowered)
    return " ".join(survivors)

In [None]:
def find_named_entities(df_pages_body):
    """Collect capitalised words that follow another word on every page.

    The pattern matches a run of capitals followed by lower-case letters that
    is directly preceded by a letter and a space; each match is normalised
    with ``clean_list_names``. Duplicates are kept.
    """
    capitalised = re.compile('(?<=[a-zA-Z] )[A-Z]+[a-z]+')
    return [
        clean_list_names(match)
        for page in df_pages_body.content
        for match in capitalised.findall(page)
    ]

In [None]:
# Stored as a set: the values are only used for membership tests, one per
# candidate row, and scanning a list with duplicates for each row is needlessly slow.
named_entities = set(find_named_entities(df_cann_pages_body))

In [None]:
def is_named_entity(x):
    """Return 1 when ``x`` is one of the notebook-level ``named_entities``, else 0."""
    return int(x in named_entities)

In [None]:
# Flag candidates that were detected as capitalised names in the body.
candidates_df["is_named_entity"] = candidates_df["candidate_keyword"].apply(is_named_entity)

### 7. Column: length of word

In [None]:
# Character length of every distinct candidate.
len_dict = {
    candidate: len(candidate)
    for candidate in candidates_df.candidate_keyword.unique()
}

In [None]:
def assign_len(x):
    """Look up the precomputed character length of candidate ``x`` in ``len_dict``."""
    length = len_dict[x]
    return length

In [None]:
# Length (in characters) of each candidate.
candidates_df["length"] = candidates_df["candidate_keyword"].apply(assign_len)

### 8. Column: is a named author 

In [None]:
def clean_list_names(x):
    """Normalise a name string: lower-case it and keep alphabetic tokens longer than two characters.

    NOTE: this redefines a function of the same name declared earlier in the
    notebook; unlike that one, hyphenated tokens are dropped here.
    """
    lowered_tokens = (token.lower() for token in word_tokenize(x))
    return " ".join(
        token for token in lowered_tokens if token.isalpha() and len(token) > 2
    )

In [None]:
def find_authors_in_biblio(df_pages_biblio):
    """Extract normalised author names from the bibliography pages.

    Parameters
    ----------
    df_pages_biblio : object with a ``content`` attribute
        Iterable of page strings.

    Returns
    -------
    list of str
        One entry per ``"X. Surname"`` or ``"Surname, X."`` match, normalised
        with ``clean_list_names``. Names that normalise to an empty string,
        ``'and'`` or ``'&'`` are skipped. Duplicates are kept.
    """
    # Raw string: the pattern contains \. \s and \, which are invalid escape
    # sequences in a normal string literal (a warning today, an error in
    # future Python versions). The compiled pattern itself is unchanged.
    name_pattern = re.compile(r'[A-Z]\.\s[A-Za-z]+|[A-Za-z]+\,\s*[A-Z]\.')
    clean_list_authors = []
    for page in df_pages_biblio.content:
        for name in name_pattern.findall(page):
            # Normalise once per match instead of once per comparison.
            cleaned = clean_list_names(name)
            if cleaned not in ('', 'and', '&'):
                clean_list_authors.append(cleaned)
    return clean_list_authors

In [None]:
# Distinct normalised author names found in the bibliography.
unique_authors = {author for author in find_authors_in_biblio(df_cann_pages_biblio)}

In [None]:
def is_named_author(x, unique_authors):
    """Return 1 when candidate ``x`` is contained in ``unique_authors``, else 0."""
    return int(x in unique_authors)

In [None]:
# Flag candidates that match an author name from the bibliography.
candidates_df["is_named_author"] = candidates_df["candidate_keyword"].apply(
    lambda keyword: is_named_author(keyword, unique_authors)
)

### 9. Target column: is in index

In [None]:
# Words filtered out of index entries by `clean_index`, in addition to the stop words.
index_words = [
    "page",
    "see",
    "also",
    "index",
    "bold",
]

In [None]:
def find_ngrams_index(text_data):
    """Extract word groups and hyphenated words from one index line.

    Parameters
    ----------
    text_data : str
        Raw text of an index line.

    Returns
    -------
    list of str
        First the non-overlapping matches, scanned left to right, of three,
        two or one alphabetic words separated by a single whitespace
        character (longest alternative tried first), then every
        lower-case ``word-word`` match.
    """
    # The character class is [a-zA-Z]. The previous [a-zA-z] range also
    # covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ and the
    # backtick), so punctuation could end up inside an "n-gram".
    # Raw string so that \s is not an invalid string escape.
    word_groups = r'[a-zA-Z]+\s[a-zA-Z]+\s[a-zA-Z]+|[a-zA-Z]+\s[a-zA-Z]+|[a-zA-Z]+'
    list_bigrams = re.findall(word_groups, text_data)
    list_bigrams.extend(re.findall("[a-z]+-[a-z]+", text_data))
    return list_bigrams

In [None]:
# Raw word groups found on each index line.
df_cann_lines_index["ngrams"] = df_cann_lines_index["content"].apply(find_ngrams_index)

In [None]:
# English stop-word list from NLTK; `clean_index` drops these words from index entries.
stop_words = stopwords.words("english")

In [None]:
def clean_index(list_ngrams):
    """Normalise the word groups of one index line.

    Every word is lower-cased and lemmatised; words found in the
    notebook-level ``stop_words`` or ``index_words`` are removed. A group
    whose words are all removed is kept as an empty string, so the output has
    the same length as the input.
    """
    def keep(word):
        return word not in stop_words and word not in index_words

    cleaned_groups = []
    for group in list_ngrams:
        lemmas = (lemmatizer.lemmatize(word.lower()) for word in group.split())
        cleaned_groups.append(" ".join(lemma for lemma in lemmas if keep(lemma)))
    return cleaned_groups

In [None]:
# Replace the raw word groups with their normalised form.
df_cann_lines_index["ngrams"] = df_cann_lines_index["ngrams"].apply(clean_index)

In [None]:
# One list of normalised index entries per index line.
raw_list_indexes = df_cann_lines_index["ngrams"].tolist()

In [None]:
# Dump the index entries to a text file, one comma-separated line per index line.
# The lines are built before the file is opened so that a failure here cannot
# leave a truncated file behind, and `with` closes the handle even on error.
# NOTE: mode 'w' overwrites the file, so re-running this cell discards any
# manual edits previously made to cann_indexes.txt.
lines = [",".join(sublist) + "\n" for sublist in raw_list_indexes]
with open('cann_indexes.txt', 'w') as f:
    f.writelines(lines)

In [None]:
# Quick manual cleaning step (presumably of cann_indexes.txt, done by hand outside the notebook).

In [None]:
# Read the index entries back; `with` closes the handle even if reading fails.
with open('cann_indexes.txt', 'r') as f:
    cann_clean_indexes = f.readlines()

In [None]:
def get_final_indexes(indexes_txt):
    """Turn the lines of the index file into a ``{entry: 1}`` lookup.

    Each line has its trailing newline removed and is split on commas; empty
    entries are ignored and repeated entries collapse into one key.
    """
    final_indexes = {}
    for line in indexes_txt:
        for entry in line.strip('\n').split(","):
            if entry != '':
                final_indexes[entry] = 1
    return final_indexes

In [None]:
# Lookup of every entry that appears in the book's index.
dict_indexes = get_final_indexes(cann_clean_indexes)

In [None]:
def add_target_col(x):
    """Return the value stored for ``x`` in the notebook-level ``dict_indexes``, or 0 if absent."""
    return dict_indexes.get(x, 0)

In [None]:
# Target column: 1 when the candidate is an entry of the book's index.
candidates_df["is_in_index"] = candidates_df["candidate_keyword"].apply(add_target_col)

In [None]:
# Check how many index n-grams are missing from the candidates to begin with.
list_kw = list(set(candidates_df.candidate_keyword.tolist()))
# Membership is tested against a set: looking every index entry up in the
# list would scan all distinct candidates once per entry.
kw_lookup = set(list_kw)
count_yes = 0
count_no = 0
for item in get_final_indexes(cann_clean_indexes):
    if item in kw_lookup:
        print(item+' 1')
        count_yes += 1
    else:
        print(item+ ' 0')
        count_no += 1

print('YES: '+str(count_yes)+', NO:'+ str(count_no))

In [None]:
# Fixed. Conclusion: index entries that do not appear among the candidates were either not extracted, are trigrams, or were badly processed from the PDF. A visualisation of this could be added.

### 10. Aggregate lines with duplicated candidate_keyword

In [None]:
# The context columns are no longer needed once the features are computed.
# Reassigning with errors='ignore' (rather than an in-place drop) keeps this
# cell re-runnable: a second run no longer raises KeyError for columns that
# are already gone.
candidates_df = candidates_df.drop(columns=['clean_context', 'raw_context'], errors='ignore')

In [None]:
# TODO: save the aggregated frame as CSV.
# Collapse duplicated rows: one row per distinct combination of the grouping
# columns, with `freq` and `position_in_context` averaged inside each group.
# `agg` has no `inplace` option and always returns a new frame, so the result
# is kept in a variable instead of being discarded; the aggregations are named
# by string rather than passed as NumPy callables.
# NOTE(review): `importance` is derived from the context sentence, so grouping
# by it presumably keeps one row per (keyword, context) rather than one row
# per keyword - confirm this is intended.
candidates_agg = candidates_df.groupby(
    ['candidate_keyword', 'length', 'POS', 'is_named_entity', 'is_named_author', 'is_in_toc', 'importance', 'is_in_index'],
    as_index=False,
).agg({'freq': 'mean', 'position_in_context': 'mean'})
candidates_agg