In [80]:
# ! pip install python-docx
from docx import Document
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [6]:
document = Document('G1700519.docx')
sections = document.sections
len(sections)

1

In [9]:
paragraphs = document.paragraphs
len(paragraphs)

96

In [92]:
paragraphs[0].style.name

'Normal'

In [132]:
df = pd.DataFrame()
df["styles"] = [p.style.name for p in paragraphs]
df["texts"] = [p.text for p in paragraphs]
df.head()

Unnamed: 0,styles,texts
0,Normal,Committee on the Elimination of Racial Discrim...
1,_ H _Ch_G,\t\tConcluding observations on the combined tw...
2,_ Single Txt_G,1.\tThe Committee on the Elimination of Racial...
3,_ H_1_G,\tA.\tIntroduction
4,_ Single Txt_G,2.\tThe Committee welcomes the presentation of...


In [133]:
df.styles.value_counts()

_ Single Txt_G    67
_ H_2/3_G         22
_ H_1_G            4
Normal             2
_ H _Ch_G          1
Name: styles, dtype: int64

In [134]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df.styles.to_numpy().reshape(-1, 1))
enc.categories_

[array(['Normal', '_ H _Ch_G', '_ H_1_G', '_ H_2/3_G', '_ Single Txt_G'],
       dtype=object)]

In [135]:
tmp = pd.DataFrame(enc.transform(df.styles.to_numpy().reshape(-1, 1)).toarray().astype(int))
tmp = pd.merge(df, tmp, left_index=True, right_index=True)
# columns=enc.categories_
df = tmp.rename(columns={i: enc.categories_[0][i] for i in range(len(enc.categories_[0]))})
df.head()

Unnamed: 0,styles,texts,Normal,_ H _Ch_G,_ H_1_G,_ H_2/3_G,_ Single Txt_G
0,Normal,Committee on the Elimination of Racial Discrim...,1,0,0,0,0
1,_ H _Ch_G,\t\tConcluding observations on the combined tw...,0,1,0,0,0
2,_ Single Txt_G,1.\tThe Committee on the Elimination of Racial...,0,0,0,0,1
3,_ H_1_G,\tA.\tIntroduction,0,0,1,0,0
4,_ Single Txt_G,2.\tThe Committee welcomes the presentation of...,0,0,0,0,1


In [136]:
topheaders = []
superheaders = []
headers = []
subheaders = []
texts = []

topheader = np.nan
superheader = np.nan
header = np.nan
subheader = np.nan
text = ""

for i in range(len(df)-1):
    if df["Normal"][i]==1:
        topheader = df["texts"][i].lstrip('\t')
    elif df["_ H _Ch_G"][i]==1:
        superheader = df["texts"][i].lstrip('\t')
    elif df["_ H_1_G"][i]==1:
        header = df["texts"][i].lstrip('\t')
    elif df["_ H_2/3_G"][i]==1:
        subheader = df["texts"][i].lstrip('\t')
    elif df["_ Single Txt_G"][i]==1:
        tmp = df["texts"][i].lstrip('\t')
        if tmp.startswith("("):
            text += tmp
        else:
            topheaders.append(topheader)
            superheaders.append(superheader)
            headers.append(header)
            subheaders.append(subheader)
            texts.append(text)
            text = tmp
    else:
        print(i)

In [137]:
df = pd.DataFrame([topheaders, superheaders, headers, subheaders, texts]).T
df.head()

Unnamed: 0,0,1,2,3,4
0,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,,,
1,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,A.\tIntroduction,,1.\tThe Committee on the Elimination of Racial...
2,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,A.\tIntroduction,,2.\tThe Committee welcomes the presentation of...
3,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,B.\tPositive aspects,,3.\tThe Committee also welcomes the presentati...
4,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,B.\tPositive aspects,,4.\tThe Committee welcomes the State party’s r...


In [138]:
columns = ["topheader", "superheader", "header", "subheader", "text"]
df = df.rename(columns={i: columns[i] for i in range(len(enc.categories_[0]))})
df.head()

Unnamed: 0,topheader,superheader,header,subheader,text
0,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,,,
1,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,A.\tIntroduction,,1.\tThe Committee on the Elimination of Racial...
2,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,A.\tIntroduction,,2.\tThe Committee welcomes the presentation of...
3,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,B.\tPositive aspects,,3.\tThe Committee also welcomes the presentati...
4,Committee on the Elimination of Racial Discrim...,Concluding observations on the combined twenty...,B.\tPositive aspects,,4.\tThe Committee welcomes the State party’s r...


In [139]:
df.to_csv("parsed_docx.csv", sep="#")