In [2]:
from unstructured.staging.base import elements_from_json, convert_to_dict


elements = elements_from_json(filename="output.json")

elements = convert_to_dict(elements)
for element in elements[:5]:
    print(f"{element['type']}: {element['text']}")

UncategorizedText: ALEX TAN HONG PIN Nationality Malaysian D.O.B 22 September 1990 Marital Status Single Permanent Address Sungai Petani, Malaysia Phone +60 011-24286345 (MY)/+65 82653640 (SG) Email alextan220990@gmail.com
Title: OBJECTIVE
Title: Engineering Manager, potentially CTO in the future (5-10 years goal).
Title: EDUCATION
UncategorizedText: The Mannheim University of Applied Sciences, Germany Jan 2011 – Dec 2014 Degree in Chemical Engineering (cumulative GPA: 3.28)


In [3]:
from unstructured.cleaners.core import replace_unicode_quotes

In [29]:
for element in elements:
    if element['text'] != (clean := replace_unicode_quotes(element['text'])):
        print(clean)

In [33]:
import pandas as pd

df = pd.DataFrame(data=elements)
df = df[['type', 'text']]
df

Unnamed: 0,type,text
0,UncategorizedText,ALEX TAN HONG PIN Nationality Malaysian D.O.B ...
1,Title,OBJECTIVE
2,Title,"Engineering Manager, potentially CTO in the fu..."
3,Title,EDUCATION
4,UncategorizedText,"The Mannheim University of Applied Sciences, G..."
...,...,...
90,NarrativeText,▪ Able to justify technical decisions and when...
91,NarrativeText,Independent learner; ﬂexible in adapting and a...
92,NarrativeText,GitHub Link Check my GitHub to see what’s my c...
93,Title,Website Link http://alextanhongpin.github.io/


In [94]:
from unstructured.cleaners.core import clean, clean_non_ascii_chars, clean_bullets
from functools import partial

df['cleaned'] = df['text'].apply(partial(clean, bullets=True, dashes=True, extra_whitespace=True, trailing_punctuation=True))
df['cleaned'] = df['cleaned'].apply(clean_non_ascii_chars)
df['cleaned'] = df['cleaned'].str.strip()
df['has_changed'] = df['cleaned'] == df['text']
df.head()

Unnamed: 0,type,text,cleaned,has_changed
0,UncategorizedText,ALEX TAN HONG PIN Nationality Malaysian D.O.B ...,ALEX TAN HONG PIN Nationality Malaysian D.O.B ...,False
1,Title,OBJECTIVE,OBJECTIVE,True
2,Title,"Engineering Manager, potentially CTO in the fu...","Engineering Manager, potentially CTO in the fu...",False
3,Title,EDUCATION,EDUCATION,True
4,UncategorizedText,"The Mannheim University of Applied Sciences, G...","The Mannheim University of Applied Sciences, G...",False


In [97]:
# https://pandas.pydata.org/docs/user_guide/options.html
with pd.option_context('max_colwidth', 1, 'display.expand_frame_repr', True):
    print('before:', df.shape)
    df_len = df[df['cleaned'].str.len() > 0]
    print('after:', df_len.shape)
    print()
    print(df_len['cleaned'].head())

before: (95, 4)
after: (85, 4)

0    ALEX TAN HONG PIN Nationality Malaysian D.O.B 22 September 1990 Marital Status Single Permanent Address Sungai Petani, Malaysia Phone +60 011 24286345 (MY)/+65 82653640 (SG) Email alextan220990@gmail.com
1    OBJECTIVE                                                                                                                                                                                                  
2    Engineering Manager, potentially CTO in the future (5 10 years goal)                                                                                                                                       
3    EDUCATION                                                                                                                                                                                                  
4    The Mannheim University of Applied Sciences, Germany Jan 2011 Dec 2014 Degree in Chemical Engineering (cumulative GPA: 3.28)   

In [100]:
from unstructured.staging.base import convert_to_dataframe
elements = elements_from_json(filename="output.json")

df = convert_to_dataframe(elements)
df.head()

Unnamed: 0,type,element_id,text,coordinates_points,coordinates_system,coordinates_layout_width,coordinates_layout_height,filename,file_directory,last_modified,filetype,page_number,links,parent_id
0,UncategorizedText,a41fb543db4be102c4115032664392a1,ALEX TAN HONG PIN Nationality Malaysian D.O.B ...,"((181.4, 75.0), (181.4, 166.0), (413.6, 166.0)...",PixelSpace,595,842,resume.pdf,.,2022-11-10T00:34:27,application/pdf,1,"[{'text': 'alextan220990 @ gmail .', 'url': 'm...",
1,Title,77a8599f65aab98cebab9a1432031409,OBJECTIVE,"((72.0, 182.0), (72.0, 193.0), (137.4, 193.0),...",PixelSpace,595,842,resume.pdf,.,2022-11-10T00:34:27,application/pdf,1,,
2,Title,d49f1f08b85c6365e016dfb10c786b57,"Engineering Manager, potentially CTO in the fu...","((72.0, 202.6), (72.0, 212.6), (378.1, 212.6),...",PixelSpace,595,842,resume.pdf,.,2022-11-10T00:34:27,application/pdf,1,,
3,Title,25c3308bcdc49ca36826fbe41a38f852,EDUCATION,"((72.0, 231.6), (72.0, 242.6), (136.6, 242.6),...",PixelSpace,595,842,resume.pdf,.,2022-11-10T00:34:27,application/pdf,1,,
4,UncategorizedText,d0b962fe2fd49dcb01a9387c17e312ef,"The Mannheim University of Applied Sciences, G...","((72.0, 252.2), (72.0, 274.2), (653.3, 274.2),...",PixelSpace,595,842,resume.pdf,.,2022-11-10T00:34:27,application/pdf,1,,25c3308bcdc49ca36826fbe41a38f852


In [105]:
from unstructured.chunking.title import chunk_by_title

chunks = chunk_by_title(elements)
for chunk in chunks[:5]:
    print(chunk)
    print("\n\n" + "-"*80)

ALEX TAN HONG PIN Nationality Malaysian D.O.B 22 September 1990 Marital Status Single Permanent Address Sungai Petani, Malaysia Phone +60 011-24286345 (MY)/+65 82653640 (SG) Email alextan220990@gmail.com

OBJECTIVE

Engineering Manager, potentially CTO in the future (5-10 years goal).


--------------------------------------------------------------------------------
EDUCATION

The Mannheim University of Applied Sciences, Germany Jan 2011 – Dec 2014 Degree in Chemical Engineering (cumulative GPA: 3.28)

International Education Centre (INTEC UITM), Shah Alam Jun 2010 – Dec 2011 Edexcel A-Level (Chemistry A*, Math A*, Physics A)

Sekolah Menengah Kebangsaan Ibrahim Graduated in 2006 SPM (Malaysia Certiﬁcate of Education) (12 A1)


--------------------------------------------------------------------------------
EXP E RIENCE

Senior Backend Engineer at RateS (Singapore) January 2021 - present RateS is an Indonesia-focused membership-based social commerce platform that enables small business