In [None]:
import pandas as pd
import numpy as np
import io
import os
import shutil

import re
from sentence_splitter import SentenceSplitter
splitter = SentenceSplitter(language="en")

import string

from typing import Dict
import fitz
import sys
import mdutils
import mdpdf

import pickle

from sklearn.preprocessing import MinMaxScaler


# 1. Load the aggregated CSV for the relevant book

In [None]:
# TODO: parameterize the input path (the book directory is hard-coded).
path = '../data/processed/jenkins_08/aggregated.csv'
# The first CSV column is used as the DataFrame index.
df_aggregated = pd.read_csv(path,index_col=0)
df_aggregated.head()

# 2. Data cleaning and feature engineering

In [None]:
# Show column dtypes and non-null counts of the freshly loaded table.
df_aggregated.info()

In [None]:
# Drop, in place, every row that contains at least one missing value.
df_aggregated.dropna(inplace=True)

In [None]:
# Vocabulary used by the following cells to build the binary
# `is_a_textbook_word` feature (1 when a candidate keyword is in this list).
# presumably generic academic/discourse words that rarely make index entries — TODO confirm.
textbook_words=['whereas', 
                'conversely', 
                'result',
                'suggest',
                'contrast', 
                'comparison', 
                'view', 
                'likewise',
                'despite',
                'while',
                'subsequent',
                'subsequently',
                'overall',
                'summary',
                'sumarize',  # NOTE(review): looks like a misspelling of 'summarize' — confirm before changing, it affects the feature
                'first',
                'second', 
                'third',
                'firstly',
                'secondly', 
                'thirdly',
                'finally',
                'although',
                'thus',
                'again',
                'further',
                'then',
                'besides',
                'too',
                'similarly',
                'correspondingly',
                'regarding',
                'involved',
                'approach',
                'account',
                'theory',
                'method',
                'required',
                'process',
                'research',
                'role',
                'significant',
                'source',
                'variable',
                'issue',
                'function',
                'formula',
                'formulate',
                'phrase',
                'factor',
                'evidence',
                'derived',
                'derive',
                'establish',
                'concept',
                'available',
                'context',
                'assume',
                'assumption',
                'additionally',
                'additional',
                'analysis',
                'general',
                'certain',
                'certainly',
                'furthermore',
                'moreover',
                'nonetheless',
                'nevertheless',
                'bibliography',
                'prove',
                'part',
                'today', 
                'nowadays', 
                'actually', 
                'section', 
                'indeed', 
                'every', 
                'any', 
                'some',
                'instance',
                'example', 
                'therefore', 
                'definition',
                'define',
                'explain',
                'explanation',
                'introduction', 
                'conclusion',
                'conclude',
                'chapter', 
                'appendix', 
                'otherwise', 
                'thing',
                'concisely',
                'concise',
                'brief',
                'briefly',
                'rather', 
                'instead', 
                'like', 
                'since', 
                'given', 
                'case', 
                'hence', 
                'iff', 
                'see', 
                'beyond', 
                'below', 
                'above', 
                'postscript',
                'preface',
                'index', 
                'ensure', 
                'generally', 
                'anything', 
                'something',
                'everything',
                'other']

In [None]:
# Lookup table: candidate keyword -> 1 if it is listed in `textbook_words`, else 0.
# The loop variable stays `w`; it remains bound at module level after the loop.
dict_tbw = {}
for w in df_aggregated.candidate_keyword:
    dict_tbw[w] = int(w in textbook_words)

In [None]:
def is_a_tbw(x, words=None):
    """Return 1 if `x` is a textbook word, otherwise 0.

    Parameters
    ----------
    x : str
        Candidate keyword to look up.
    words : collection of str, optional
        Vocabulary to test against. When omitted, the notebook-level
        `textbook_words` list is used.

    Returns
    -------
    int
        1 when `x` is in the vocabulary, 0 otherwise.
    """
    # The membership test must use the argument `x`; testing the leftover
    # global loop variable `w` would return the same answer for every call.
    vocabulary = textbook_words if words is None else words
    return 1 if x in vocabulary else 0

In [None]:
# Attach the 0/1 textbook-word flag to every candidate via the lookup table.
df_aggregated['is_a_textbook_word'] = df_aggregated['candidate_keyword'].map(dict_tbw.__getitem__)

In [None]:
# Keep the keyword column aside: it is dropped from the feature table below
# and later paired with the model predictions.
df_candidates=df_aggregated['candidate_keyword']

In [None]:
# One-hot encode the 'POS' column into POS_* indicator columns.
df_aggregated_scaled = pd.get_dummies(
    data=df_aggregated,
    columns=['POS'],
    prefix=['POS'],
)

In [None]:
# The keyword text itself is not fed to the scaler/model; continue with a frame that lacks it.
df_aggregated_scaled = df_aggregated_scaled.drop(columns=['candidate_keyword'])

In [None]:
# Remember the column labels so they can be restored after scaling.
df_aggregated_scaled_columns = df_aggregated_scaled.columns

In [None]:
# Scaler applied to every remaining column in the next cell.
scaler = MinMaxScaler() 

In [None]:
# Fit the scaler on this book's table and rebuild a labelled DataFrame from the result.
# NOTE(review): the scaler is fitted here rather than loaded — confirm this matches how the models were trained.
df_aggregated_scaled = pd.DataFrame(
    scaler.fit_transform(df_aggregated_scaled),
    columns=df_aggregated_scaled_columns,
)

In [None]:
# Rename the "is_in_index" column to "target"; the prediction cells split it off under that name.
df_aggregated_scaled=df_aggregated_scaled.rename(columns={"is_in_index": "target"})

In [None]:
# Discard the listed POS indicator columns. errors='ignore' skips any label
# that is not present in this table instead of raising.
df_aggregated_scaled=df_aggregated_scaled.drop(columns=['POS_SYM', 
                                        'POS_PART', 
                                        'POS_PUNCT', 
                                        'POS_INTJ', 
                                        'POS_DET',
                                        'POS_AUX',
                                        'POS_SCONJ',
                                        'POS_CCONJ',
                                        'POS_X',
                                        'POS_PRON',
                                        'POS_ADP',
                                        'POS_NUM'], errors='ignore')

In [None]:
# Preview the first rows of the scaled feature table.
df_aggregated_scaled.head()

In [None]:
# Show dtypes and non-null counts of the scaled feature table.
df_aggregated_scaled.info()

In [None]:
# TODO: parameterize the output path.
# Persist the scaled feature table as a UTF-8 CSV.
df_aggregated_scaled.to_csv("../data/processed/jenkins_08/df_aggregated_scaled.csv", encoding = 'utf-8')

# 3. Make prediction

In [None]:
# Separate the label column from the feature matrix.
y = df_aggregated_scaled['target']
X = df_aggregated_scaled.drop(columns='target')

In [None]:
# Load the pickled model. The context manager guarantees the file handle is
# closed, which the bare open() call did not.
# NOTE: pickle can execute arbitrary code on load — only use model files from a trusted source.
with open('../models/xgb_mod.sav', 'rb') as model_file:
    xgb_mod = pickle.load(model_file)

In [None]:
# Load the second pickled model. The context manager guarantees the file
# handle is closed, which the bare open() call did not.
# NOTE: pickle can execute arbitrary code on load — only use model files from a trusted source.
with open('../models/xgb_mod_s.sav', 'rb') as model_file:
    xgb_mod_s = pickle.load(model_file)

In [None]:
# Predict a label for every row of the feature matrix with `xgb_mod`.
y_pred = xgb_mod.predict(X)

In [None]:
# Predict with `xgb_mod_s` as well.
# NOTE(review): y_pred_s is not read by any later cell visible here — confirm whether it is still needed.
y_pred_s = xgb_mod_s.predict(X)

In [None]:
# Wrap the predictions in a DataFrame; its column gets the default label 0, renamed below.
is_in_index=pd.DataFrame(y_pred)

In [None]:
# Pair each keyword with its prediction by position.
# `df_candidates` still carries the index labels of the source table (read
# with index_col=0 and filtered by dropna), while the prediction frame has a
# fresh 0..n-1 index. concat(axis=1) aligns on labels, so without resetting
# both indexes rows could be mismatched or padded with NaN.
results_df = pd.concat(
    [df_candidates.reset_index(drop=True), is_in_index.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Give the prediction column (default label 0) the name the index builders filter on.
results_df.rename(columns={0:'is_in_index'},inplace=True)

In [None]:
# Preview the keyword / prediction pairs.
results_df.head()

# 4. Generate draft index

In [None]:
def get_line_numbers_concat(line_nums):
    """Collapse page numbers into a compact, comma-separated string.

    Values are scanned in the given order. A value that is one or two greater
    than the previous one extends the current run (so a single missing page
    does not break a range). A value equal to the previous one is absorbed
    instead of being listed a second time. Anything else starts a new run.
    Runs with more than one distinct value are rendered as ``first-last``.

    Parameters
    ----------
    line_nums : iterable of numbers
        Page numbers, expected in ascending order.

    Returns
    -------
    str
        For example ``[1, 2, 3, 7, 9, 12]`` gives ``'1-3, 7-9, 12'``.
        An empty input gives ``''``.
    """
    # Each run is [first, last, is_range]; is_range is tracked explicitly so
    # that a single value is never rendered as "x-x".
    runs = []
    for val in line_nums:
        if runs:
            current = runs[-1]
            last = current[1]
            if val == last:
                # Repeated page number: already covered by the current run.
                continue
            if val == last + 1 or val == last + 2:
                current[1] = val
                current[2] = True
                continue
        runs.append([val, val, False])

    pieces = []
    for first, last, is_range in runs:
        if is_range:
            pieces.append(str(first) + '-' + str(last))
        else:
            pieces.append(str(first))
    return ', '.join(pieces)

## 4.1. Generate the draft in Markdown

In [None]:
def get_markdown_index(candidates_dataframe, pages_body_dataframe):
    """Build a Markdown draft index from the predicted keywords.

    Parameters
    ----------
    candidates_dataframe : pandas.DataFrame
        Needs the columns 'candidate_keyword' and 'is_in_index'; only rows
        with ``is_in_index == 1`` are indexed.
    pages_body_dataframe : pandas.DataFrame
        Needs the columns 'clean_content' (page text) and 'real_page_num'.

    Returns
    -------
    str
        A '## Index' heading followed by one bullet per keyword with its
        page list. A multi-word keyword that contains the most recent
        single-word keyword is indented as a sub-entry of it.
    """
    selected = candidates_dataframe[candidates_dataframe.is_in_index == 1]
    keywords = selected['candidate_keyword'].tolist()

    dict_pagination = {}
    for kw in keywords:
        # Literal substring search: regex=False stops keywords such as "c++"
        # or "u.s." from being interpreted as regular expressions, and
        # na=False treats pages with missing text as non-matching instead of
        # producing an unusable NaN mask.
        hits = pages_body_dataframe['clean_content'].str.contains(kw, regex=False, na=False)
        pages = pages_body_dataframe.loc[hits, 'real_page_num'].tolist()
        dict_pagination[kw] = get_line_numbers_concat(pages)

    lines = ['## Index\n']
    last_unigram = ''
    for word, pagination in dict_pagination.items():
        tokens = word.split(' ')
        if len(tokens) == 1:
            last_unigram = word
            indent = ''
        elif last_unigram in tokens:
            # Nest under the preceding single-word entry.
            indent = '    '
        else:
            indent = ''
        lines.append(indent + '- ' + word + ' ' + pagination + '\n')
    return ''.join(lines)

In [None]:
# TODO: parameterize the input path.
path = '../data/processed/jenkins_08/by_page_body.csv'
# Table whose 'clean_content' and 'real_page_num' columns are read by the index builders.
pages_df = pd.read_csv(path,index_col=0)
pages_df.info()

In [None]:
# Build the Markdown draft index from the predictions and the page table.
md_string=get_markdown_index(results_df, pages_df)

In [None]:
# TODO: parameterize the output path.
# Write the Markdown draft. The context manager closes the file even if the
# write fails, and the explicit UTF-8 encoding keeps non-ASCII keywords from
# depending on the platform default.
with open('../data/processed/jenkins_08/markdown_index.md', 'w', encoding='utf-8') as f:
    f.write(md_string)

## 4.2. Generate the draft in plain text for editing by a human indexer

In [None]:
def get_txt_index(candidates_dataframe, pages_body_dataframe):
    """Build a plain-text draft index from the predicted keywords.

    Parameters
    ----------
    candidates_dataframe : pandas.DataFrame
        Needs the columns 'candidate_keyword' and 'is_in_index'; only rows
        with ``is_in_index == 1`` are indexed.
    pages_body_dataframe : pandas.DataFrame
        Needs the columns 'clean_content' (page text) and 'real_page_num'.

    Returns
    -------
    str
        An 'Index' title followed by one entry per keyword with its page
        list, entries separated by blank lines. A multi-word keyword that
        contains the most recent single-word keyword is indented as a
        sub-entry of it.
    """
    selected = candidates_dataframe[candidates_dataframe.is_in_index == 1]
    keywords = selected['candidate_keyword'].tolist()

    dict_pagination = {}
    for kw in keywords:
        # Literal substring search: regex=False stops keywords such as "c++"
        # or "u.s." from being interpreted as regular expressions, and
        # na=False treats pages with missing text as non-matching instead of
        # producing an unusable NaN mask.
        hits = pages_body_dataframe['clean_content'].str.contains(kw, regex=False, na=False)
        pages = pages_body_dataframe.loc[hits, 'real_page_num'].tolist()
        dict_pagination[kw] = get_line_numbers_concat(pages)

    lines = ['Index\n\n']
    last_unigram = ''
    for word, pagination in dict_pagination.items():
        tokens = word.split(' ')
        if len(tokens) == 1:
            last_unigram = word
            indent = ''
        elif last_unigram in tokens:
            # Nest under the preceding single-word entry.
            indent = '    '
        else:
            indent = ''
        lines.append(indent + '- ' + word + ' ' + pagination + '\n\n')
    return ''.join(lines)

In [None]:
# Build the plain-text draft index from the predictions and the page table.
txt_string=get_txt_index(results_df, pages_df)

In [None]:
# TODO: parameterize the output path.
# Write the plain-text draft. The context manager closes the file even if the
# write fails, and the explicit UTF-8 encoding keeps non-ASCII keywords from
# depending on the platform default.
with open('../data/processed/jenkins_08/txt_index.txt', 'w', encoding='utf-8') as f:
    f.write(txt_string)