In [19]:
import pandas as pd
import numpy as np
import io
import os
import shutil

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import pdftotext

import re
from sentence_splitter import SentenceSplitter
splitter = SentenceSplitter(language="en")

import string

from typing import Dict
import fitz
import sys
import mdutils
import mdpdf

import pickle

from sklearn.preprocessing import MinMaxScaler


In [159]:
#!pip install mdpdf

Collecting mdpdf
  Downloading mdpdf-0.0.9-py3-none-any.whl (14 kB)
Collecting commonmark>=0.9.1
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 2.7 MB/s eta 0:00:011
Installing collected packages: commonmark, mdpdf
Successfully installed commonmark-0.9.1 mdpdf-0.0.9


In [140]:
# Load the aggregated candidate-keyword table; the first CSV column becomes the index.
path = '../data/processed/jenkins_08/aggregated.csv'
df_aggregated = pd.read_csv(filepath_or_buffer=path, index_col=0)
df_aggregated.head(n=5)

Unnamed: 0,candidate_keyword,length,is_named_entity,is_named_author,is_in_toc,freq,is_in_index,tfidf,importance,position_in_context,POS
0,a-mechanisms,12,0,0,0,0.000125,0,0.0,0.371197,0.540822,NOUN
1,a-mechanisms ability,20,0,0,0,1e-05,0,0.0,0.216426,0.2,CHUNK
2,a-mechanisms enabling,21,0,0,0,1e-05,0,0.0,0.30528,0.806452,CHUNK
3,a-mechanisms except,19,0,0,0,1e-05,0,0.0,0.323576,0.363636,CHUNK
4,a-mechanisms later,18,0,0,0,1.9e-05,0,0.0,0.332272,0.65,CHUNK


### Data cleaning and feature engineering

In [141]:
# Discard every row that has at least one missing value.
df_aggregated = df_aggregated.dropna(axis=0, how='any')

In [142]:
# Generic connective / academic vocabulary. A candidate keyword that is exactly
# one of these words is flagged through the `is_a_textbook_word` feature.
# Fix: 'sumarize' was misspelled and therefore never matched the real token
# 'summarize'.
# NOTE(review): if the models were trained with the misspelled list, apply the
# same correction in the training pipeline to keep the feature consistent.
textbook_words = [
    'whereas', 'conversely', 'result', 'suggest', 'contrast', 'comparison', 'view',
    'likewise', 'despite', 'while', 'subsequent', 'subsequently', 'overall', 'summary',
    'summarize', 'first', 'second', 'third', 'firstly', 'secondly', 'thirdly',
    'finally', 'although', 'thus', 'again', 'further', 'then', 'besides',
    'too', 'similarly', 'correspondingly', 'regarding', 'involved', 'approach', 'account',
    'theory', 'method', 'required', 'process', 'research', 'role', 'significant',
    'source', 'variable', 'issue', 'function', 'formula', 'formulate', 'phrase',
    'factor', 'evidence', 'derived', 'derive', 'establish', 'concept', 'available',
    'context', 'assume', 'assumption', 'additionally', 'additional', 'analysis', 'general',
    'certain', 'certainly', 'furthermore', 'moreover', 'nonetheless', 'nevertheless', 'bibliography',
    'prove', 'part', 'today', 'nowadays', 'actually', 'section', 'indeed',
    'every', 'any', 'some', 'instance', 'example', 'therefore', 'definition',
    'define', 'explain', 'explanation', 'introduction', 'conclusion', 'conclude', 'chapter',
    'appendix', 'otherwise', 'thing', 'concisely', 'concise', 'brief', 'briefly',
    'rather', 'instead', 'like', 'since', 'given', 'case', 'hence',
    'iff', 'see', 'beyond', 'below', 'above', 'postscript', 'preface',
    'index', 'ensure', 'generally', 'anything', 'something', 'everything', 'other',
]

In [143]:
# Lookup table: candidate keyword -> 1 if it is a textbook word, otherwise 0.
# The loop variable `w` stays bound at module level once the loop ends.
dict_tbw = dict()
for w in df_aggregated['candidate_keyword']:
    dict_tbw[w] = 1 if w in textbook_words else 0

In [144]:
def is_a_tbw(x, words=None):
    """Return 1 when ``x`` is a textbook word, otherwise 0.

    The previous body tested the module-level loop variable ``w`` instead of
    the argument, so the result did not depend on ``x`` at all.

    Parameters
    ----------
    x : str
        Candidate keyword to test.
    words : container of str, optional
        Words to test membership against. Defaults to the module-level
        ``textbook_words`` list.

    Returns
    -------
    int
        1 if ``x`` is in ``words``, else 0.
    """
    if words is None:
        words = textbook_words
    return 1 if x in words else 0

In [145]:
# Attach the textbook-word flag by looking each keyword up in dict_tbw.
df_aggregated['is_a_textbook_word'] = df_aggregated['candidate_keyword'].map(
    lambda keyword: dict_tbw[keyword]
)

In [146]:
# Keep the keyword strings aside; the column is dropped from the feature table below.
df_candidates = df_aggregated.candidate_keyword

In [147]:
# One-hot encode the POS tag into POS_<tag> indicator columns.
df_aggregated_scaled = pd.get_dummies(
    data=df_aggregated,
    prefix=['POS'],
    columns=['POS'],
)

In [148]:
# The raw keyword string is not a model feature.
df_aggregated_scaled = df_aggregated_scaled.drop('candidate_keyword', axis=1)

In [149]:
# Remember the column labels: they are passed back in when the scaled values
# are wrapped in a DataFrame again.
df_aggregated_scaled_columns = df_aggregated_scaled.columns

In [150]:
# Min-max scaler with default settings.
# NOTE(review): it is fitted on this book's data below, not reused from
# training — confirm the pickled models were trained on per-book scaling.
scaler = MinMaxScaler() 

In [151]:
# Scale every column, then restore the DataFrame wrapper with the saved labels
# (the rebuilt frame gets a fresh default index).
df_aggregated_scaled = pd.DataFrame(
    data=scaler.fit_transform(df_aggregated_scaled),
    columns=df_aggregated_scaled_columns,
)

In [152]:
# The ground-truth index membership becomes the prediction target.
df_aggregated_scaled = df_aggregated_scaled.rename({"is_in_index": "target"}, axis='columns')

In [153]:
# Preview the scaled, one-hot-encoded feature table.
df_aggregated_scaled.head()

Unnamed: 0,length,is_named_entity,is_named_author,is_in_toc,freq,target,tfidf,importance,position_in_context,is_a_textbook_word,...,POS_INTJ,POS_NOUN,POS_NUM,POS_PRON,POS_PROPN,POS_PUNCT,POS_SCONJ,POS_SYM,POS_VERB,POS_X
0,0.282051,0.0,0.0,0.0,0.008547,0.0,0.0,0.428039,0.540822,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.487179,0.0,0.0,0.0,0.0,0.0,0.0,0.258435,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.512821,0.0,0.0,0.0,0.0,0.0,0.0,0.355805,0.806452,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.461538,0.0,0.0,0.0,0.0,0.0,0.0,0.375855,0.363636,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.435897,0.0,0.0,0.0,0.000712,0.0,0.0,0.385384,0.65,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
# Remove the POS indicator columns listed here; errors='ignore' skips any tag
# that does not occur in this table.
df_aggregated_scaled = df_aggregated_scaled.drop(
    labels=[
        'POS_SYM', 'POS_PART', 'POS_PUNCT', 'POS_INTJ',
        'POS_DET', 'POS_AUX', 'POS_SCONJ', 'POS_CCONJ',
        'POS_X', 'POS_PRON', 'POS_ADP', 'POS_NUM',
    ],
    axis=1,
    errors='ignore',
)

In [155]:
# Check the remaining columns, their dtypes and non-null counts.
df_aggregated_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38436 entries, 0 to 38435
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   length               38436 non-null  float64
 1   is_named_entity      38436 non-null  float64
 2   is_named_author      38436 non-null  float64
 3   is_in_toc            38436 non-null  float64
 4   freq                 38436 non-null  float64
 5   target               38436 non-null  float64
 6   tfidf                38436 non-null  float64
 7   importance           38436 non-null  float64
 8   position_in_context  38436 non-null  float64
 9   is_a_textbook_word   38436 non-null  float64
 10  POS_ADJ              38436 non-null  float64
 11  POS_ADV              38436 non-null  float64
 12  POS_CHUNK            38436 non-null  float64
 13  POS_NOUN             38436 non-null  float64
 14  POS_PROPN            38436 non-null  float64
 15  POS_VERB             38436 non-null 

In [156]:
# Persist the scaled feature table next to the other processed files.
df_aggregated_scaled.to_csv(
    path_or_buf="../data/processed/jenkins_08/df_aggregated_scaled.csv",
    encoding='utf-8',
)

### Splitting features and target

In [157]:
# Features are every column except the target; the target is kept separately.
X = df_aggregated_scaled.drop(columns=['target'])
y = df_aggregated_scaled['target']

In [158]:
# Load the first pickled model. The context manager closes the file handle,
# which the previous one-liner left open.
# NOTE: pickle.load can execute arbitrary code — only load model files from a
# trusted source.
with open('../models/xgb_mod_s1.sav', 'rb') as model_file:
    xgb_mod_s1 = pickle.load(model_file)

In [159]:
# Load the second pickled model. The context manager closes the file handle,
# which the previous one-liner left open.
# NOTE: pickle.load can execute arbitrary code — only load model files from a
# trusted source.
with open('../models/xgb_mod_s_sm1.sav', 'rb') as model_file:
    xgb_mod_s_sm1 = pickle.load(model_file)

In [160]:
# Predictions of the first loaded model for every row of X.
# NOTE(review): presumably X must carry the same columns, in the same order,
# as the data the model was trained on — confirm.
y_pred = xgb_mod_s1.predict(X)

In [161]:
# Predictions of the second loaded model for every row of X.
# NOTE(review): y_pred_sm is not used by the later cells visible here.
y_pred_sm = xgb_mod_s_sm1.predict(X)

In [162]:
# Display the candidate keywords (pandas truncates the long Series).
df_candidates

0                 a-mechanisms
1         a-mechanisms ability
2        a-mechanisms enabling
3          a-mechanisms except
4           a-mechanisms later
                 ...          
38431        ﬂow set-theoretic
38432                     ﬂows
38433           ﬂows knowledge
38434                     ﬂuid
38435               ﬂuid whose
Name: candidate_keyword, Length: 38436, dtype: object

In [163]:
# Wrap the predictions in a frame that shares df_candidates' index.
# y_pred is positional (one value per row of X), whereas df_candidates keeps
# the original row labels that survived dropna(). pd.concat(axis=1) aligns on
# labels, so a fresh RangeIndex here would pair predictions with the wrong
# keywords as soon as dropna() had removed any row.
is_in_index = pd.DataFrame(y_pred, index=df_candidates.index)

In [164]:
# Put keywords and predictions side by side (rows are matched on index labels).
results_df = pd.concat(objs=[df_candidates, is_in_index], axis='columns')

In [165]:
# Display keywords next to their predictions (pandas truncates the long frame).
results_df

Unnamed: 0,candidate_keyword,0
0,a-mechanisms,0.0
1,a-mechanisms ability,0.0
2,a-mechanisms enabling,0.0
3,a-mechanisms except,0.0
4,a-mechanisms later,0.0
...,...,...
38431,ﬂow set-theoretic,0.0
38432,ﬂows,0.0
38433,ﬂows knowledge,0.0
38434,ﬂuid,0.0


In [166]:
# The prediction column is still labelled 0; give it a meaningful name.
results_df = results_df.rename(columns={0: 'is_in_index'})

In [167]:
# Summarise the rows predicted to belong to the index.
results_df.loc[results_df['is_in_index'] == 1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237 entries, 69 to 38125
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   candidate_keyword  237 non-null    object 
 1   is_in_index        237 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.6+ KB


In [168]:
def get_line_numbers_concat(line_nums):
    """Collapse a list of page numbers into a compact, comma-separated string.

    A value that is 1 or 2 greater than the value before it extends the
    current run; any other value starts a new run. A run of several values is
    rendered as ``first-last``, a run of one value as that value alone. An
    empty input gives an empty string.
    """
    runs = []
    for position, number in enumerate(line_nums):
        if position > 0:
            previous = runs[-1][-1]
            if previous + 1 == number or previous + 2 == number:
                runs[-1].append(number)
                continue
        # First element, or a gap larger than two: open a new run.
        runs.append([number])

    labels = [
        str(run[0]) + '-' + str(run[-1]) if len(run) > 1 else str(run[0])
        for run in runs
    ]
    return ', '.join(labels)

In [169]:
def get_markdown_index(candidates_dataframe, pages_body_dataframe):
    """Build a flat Markdown index of the keywords predicted to be in the index.

    Parameters
    ----------
    candidates_dataframe : pandas.DataFrame
        Needs the columns 'candidate_keyword' and 'is_in_index'; only rows
        with ``is_in_index == 1`` are used.
    pages_body_dataframe : pandas.DataFrame
        Needs the columns 'clean_content' (page text) and 'real_page_num'.

    Returns
    -------
    str
        '## Index' followed by one '- keyword pages' bullet per keyword.
    """
    selected = candidates_dataframe[candidates_dataframe.is_in_index == 1]
    # Missing keywords cannot be searched for; skip them instead of failing.
    keywords = selected['candidate_keyword'].dropna().tolist()

    dict_pagination = {}
    for kw in keywords:
        # regex=False: match the keyword literally (str.contains treats the
        # pattern as a regular expression by default, so characters such as
        # '(', '+' or '.' would be misread or raise). na=False: pages with no
        # text count as "no match" rather than breaking the boolean mask.
        on_page = pages_body_dataframe['clean_content'].str.contains(kw, regex=False, na=False)
        pages = pages_body_dataframe.loc[on_page, 'real_page_num'].tolist()
        dict_pagination[kw] = get_line_numbers_concat(pages)

    entries = ['- ' + word + ' ' + page_refs + '\n' for word, page_refs in dict_pagination.items()]
    return '## Index\n' + ''.join(entries)

In [170]:
def get_markdown_index1(candidates_dataframe, pages_body_dataframe):
    """Build a two-level Markdown index of the keywords predicted to be in the index.

    Single-word keywords are top-level bullets. A multi-word keyword is nested
    (four-space indent) when the most recent single-word keyword is one of its
    words; otherwise it is a top-level bullet as well.

    Parameters
    ----------
    candidates_dataframe : pandas.DataFrame
        Needs the columns 'candidate_keyword' and 'is_in_index'; only rows
        with ``is_in_index == 1`` are used.
    pages_body_dataframe : pandas.DataFrame
        Needs the columns 'clean_content' (page text) and 'real_page_num'.

    Returns
    -------
    str
        '## Index' followed by one bullet line per keyword.
    """
    selected = candidates_dataframe[candidates_dataframe.is_in_index == 1]
    # Missing keywords cannot be searched for or split; skip them.
    keywords = selected['candidate_keyword'].dropna().tolist()

    dict_pagination = {}
    for kw in keywords:
        # regex=False: match the keyword literally (str.contains treats the
        # pattern as a regular expression by default). na=False: pages with no
        # text count as "no match" rather than breaking the boolean mask.
        on_page = pages_body_dataframe['clean_content'].str.contains(kw, regex=False, na=False)
        pages = pages_body_dataframe.loc[on_page, 'real_page_num'].tolist()
        dict_pagination[kw] = get_line_numbers_concat(pages)

    entries = []
    last_unigram = ''
    for word, page_refs in dict_pagination.items():
        tokens = word.split(' ')
        if len(tokens) == 1:
            last_unigram = word
            indent = ''
        elif last_unigram in tokens:
            indent = '    '
        else:
            indent = ''
        entries.append(indent + '- ' + word + ' ' + page_refs + '\n')
    return '## Index\n' + ''.join(entries)

In [179]:
# Load the per-page body text; the first CSV column becomes the index.
path = '../data/processed/jenkins_08/by_page_body.csv'
pages_df = pd.read_csv(filepath_or_buffer=path, index_col=0)
pages_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   content          266 non-null    object
 1   page_number      266 non-null    int64 
 2   real_page_num    266 non-null    int64 
 3   section_level_1  266 non-null    object
 4   section_level_2  263 non-null    object
 5   section_level_3  239 non-null    object
 6   clean_content    266 non-null    object
dtypes: int64(2), object(5)
memory usage: 16.6+ KB


In [180]:
# Render the predicted index as nested Markdown.
md_string = get_markdown_index1(
    candidates_dataframe=results_df,
    pages_body_dataframe=pages_df,
)

In [181]:
# Write the Markdown index. The context manager closes the file even if the
# write fails. Encoding is set explicitly because the candidate keywords
# include non-ASCII characters (typographic ligatures), which a non-UTF-8
# platform default encoding may not be able to write.
with open('../data/processed/jenkins_08/markdown_index.md', 'w', encoding='utf-8') as f:
    f.write(md_string)

In [182]:
def get_txt_index(candidates_dataframe, pages_body_dataframe):
    """Build a two-level plain-text index of the keywords predicted to be in the index.

    Same layout as the nested Markdown index, but with an 'Index' title and a
    blank line after every entry. Single-word keywords are top-level entries.
    A multi-word keyword is indented by four spaces when the most recent
    single-word keyword is one of its words.

    Parameters
    ----------
    candidates_dataframe : pandas.DataFrame
        Needs the columns 'candidate_keyword' and 'is_in_index'; only rows
        with ``is_in_index == 1`` are used.
    pages_body_dataframe : pandas.DataFrame
        Needs the columns 'clean_content' (page text) and 'real_page_num'.

    Returns
    -------
    str
        'Index' followed by one '- keyword pages' entry per keyword.
    """
    selected = candidates_dataframe[candidates_dataframe.is_in_index == 1]
    # Missing keywords cannot be searched for or split; skip them.
    keywords = selected['candidate_keyword'].dropna().tolist()

    dict_pagination = {}
    for kw in keywords:
        # regex=False: match the keyword literally (str.contains treats the
        # pattern as a regular expression by default). na=False: pages with no
        # text count as "no match" rather than breaking the boolean mask.
        on_page = pages_body_dataframe['clean_content'].str.contains(kw, regex=False, na=False)
        pages = pages_body_dataframe.loc[on_page, 'real_page_num'].tolist()
        dict_pagination[kw] = get_line_numbers_concat(pages)

    entries = []
    last_unigram = ''
    for word, page_refs in dict_pagination.items():
        tokens = word.split(' ')
        if len(tokens) == 1:
            last_unigram = word
            indent = ''
        elif last_unigram in tokens:
            indent = '    '
        else:
            indent = ''
        entries.append(indent + '- ' + word + ' ' + page_refs + '\n\n')
    return 'Index\n\n' + ''.join(entries)

In [183]:
# Render the predicted index as plain text.
txt_string = get_txt_index(
    candidates_dataframe=results_df,
    pages_body_dataframe=pages_df,
)

In [184]:
# Write the plain-text index. The context manager closes the file even if the
# write fails. Encoding is set explicitly because the candidate keywords
# include non-ASCII characters (typographic ligatures), which a non-UTF-8
# platform default encoding may not be able to write.
with open('../data/processed/jenkins_08/txt_index.txt', 'w', encoding='utf-8') as f:
    f.write(txt_string)