In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import ftfy
import textacy
import csv
from gensim.utils import simple_preprocess
import unicodedata
import re
import swifter



In [3]:
# Root folder of the hedwig datasets, relative to the notebook's working
# directory; each dataset lives in its own sub-folder (e.g. PATH_TO_DATASETS/"SST-2").
PATH_TO_DATASETS=Path('../../hedwig-data/datasets')


def is_whitespace(char):
    """Return True if `char` should be treated as whitespace.

    A character counts as whitespace when it is a plain space, a tab, a
    newline, a carriage return, or any character in the Unicode "Zs"
    (space separator) category.
    """
    # Tab, newline and carriage return are technically control characters,
    # but they are handled as whitespace here.
    return char in (" ", "\t", "\n", "\r") or unicodedata.category(char) == "Zs"

def is_control(char):
    """Return True if `char` is a control character.

    Any character whose Unicode category starts with "C" is reported as a
    control character, except tab, newline and carriage return.
    """
    # Tab, newline and carriage return are counted as whitespace rather than
    # control characters, so they are excluded up front.
    if char in ("\t", "\n", "\r"):
        return False
    return unicodedata.category(char).startswith("C")

def clean_text(text):
    """Strip invalid characters from `text` and normalise its whitespace.

    NUL (U+0000), the replacement character (U+FFFD) and every character
    reported by `is_control` are dropped; every character reported by
    `is_whitespace` becomes a single plain space. All other characters are
    kept unchanged.
    """
    kept = (
        " " if is_whitespace(ch) else ch
        for ch in text
        if not (ord(ch) in (0, 0xFFFD) or is_control(ch))
    )
    return "".join(kept)

def fix_contractions(text):
    """Expand English contractions in space-tokenised text.

    The input is expected to have clitics split from their host word, either
    as a single token ("do n't", "we 'll") or with the apostrophe as its own
    token ("do n ' t", "we ' ll "). Before expansion, backticks are turned
    into apostrophes and a few tokenised placeholders ("& amp ;", "@ USER",
    "@ URL", "@ HASHTAG", "*NUMBER*") are rewritten as bracketed tags.
    Finally, a detached possessive (" 's " / " ' s ") is re-attached to the
    preceding word.

    Args:
        text: the string to normalise.

    Returns:
        The string with the supported contractions expanded.
    """
    text = (
        text.replace("`", "'")
        .replace("& amp ;", "[AMP]")
        .replace("@ USER", "[USER]")
        .replace("@ URL", "[URL]")
        .replace("@ HASHTAG", "[HASHTAG]")
        .replace("*NUMBER*", "[NUMBER]")
    )

    # (pattern, replacement) pairs, applied strictly in this order.
    rules = (
        # --- clitic kept as one token: "do n't", "we 'll" -------------------
        # standard
        (
            r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould) n't",
            r"\1\2 not",
        ),
        (
            r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou) 'll",
            r"\1\2 will",
        ),
        (r"(\b)([Tt]here|[Hh]ere) 's", r"\1\2 is"),
        (r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou) 're", r"\1\2 are"),
        (
            r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou) 've",
            r"\1\2 have",
        ),
        (
            r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Yy]ou) 'd",
            r"\1\2 would",
        ),
        # non-standard
        (r"(\b)([Cc]a) n't", r"\1\2n not"),
        (r"(\b)([Ii]) 'm", r"\1\2 am"),
        (r"(\b)([Ll]et) 's", r"\1\2 us"),
        # "wo n't" / "sha n't" follow the same split as "ca n't" above; the
        # original "w on't" / "s han't" spellings are still accepted.
        (r"(\b)([Ww])(?: on't|o n't)", r"\1\2ill not"),
        (r"(\b)([Ss])(?: han't|ha n't)", r"\1\2hall not"),
        (r"(\b)([Yy])(?: 'all|a 'll)", r"\1\2ou all"),
        # --- apostrophe as its own token: "do n ' t", "we ' ll " -------------
        # standard
        # This pattern does not consume a trailing space, so the replacement
        # must not add one (otherwise "do n ' t go" gets a double space).
        (
            r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould) n ' t",
            r"\1\2 not",
        ),
        (
            r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou) ' ll ",
            r"\1\2 will ",
        ),
        # The next two patterns consume the trailing space, so the
        # replacement has to put it back to keep the following word separate.
        (r"(\b)([Tt]here|[Hh]ere) ' s ", r"\1\2 is "),
        (r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou) ' re ", r"\1\2 are "),
        (
            r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou) ' ve ",
            r"\1\2 have ",
        ),
        (
            r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Yy]ou) ' d ",
            r"\1\2 would ",
        ),
        # non-standard
        (r"(\b)([Cc]a) n ' t ", r"\1\2n not "),
        (r"(\b)([Ii]) ' m ", r"\1\2 am "),
        (r"(\b)([Ll]et) ' s ", r"\1\2 us "),
        (r"(\b)([Ww])(?: on ' t |o n ' t )", r"\1\2ill not "),
        (r"(\b)([Ss]ha) n ' t ", r"\1\2ll not "),
        (r"(\b)([Yy])(?: ' all | a ' ll )", r"\1\2ou all "),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    # Re-attach possessives that are still detached from their noun.
    text = text.replace(" 's ", "'s ").replace(" ' s ", "'s ").replace(" i ' m ", " i'm ")
    return text

def hard_preprocess(df, stop_words=None):
    """Aggressively normalise the text in the second column of `df`.

    The second column (``df.iloc[:, 1]``) is rewritten in place by applying,
    in order:

    1. ``ftfy.fix_text``
    2. ``fix_contractions``
    3. ``clean_text``
    4. removal of double quotes and backslashes; newlines become spaces
    5. ``textacy.preprocess_text`` with the ``no_currency_symbols``,
       ``no_urls``, ``no_emails``, ``no_phone_numbers`` and ``no_numbers``
       flags enabled
    6. gensim ``simple_preprocess`` tokenisation, stop-word removal, and
       re-joining the remaining tokens with single spaces

    Args:
        df: DataFrame whose second column holds the text to clean.
        stop_words: optional iterable of tokens to drop in step 6. When
            omitted, a notebook-level ``stop_words`` variable is used if one
            exists; otherwise gensim's built-in ``STOPWORDS`` list is used.

    Returns:
        The same DataFrame object, with its second column cleaned.
    """
    if stop_words is None:
        # The parameter shadows any notebook-level `stop_words`, so look the
        # global up explicitly; this keeps notebooks that define it working.
        stop_words = globals().get("stop_words")
    if stop_words is None:
        # Nothing supplied and nothing defined globally: fall back to
        # gensim's list instead of failing with a NameError.
        from gensim.parsing.preprocessing import STOPWORDS
        stop_words = STOPWORDS
    stop_set = frozenset(stop_words)

    df.iloc[:,1]=df.iloc[:,1].swifter.apply(ftfy.fix_text)
    df.iloc[:,1]=df.iloc[:,1].apply(fix_contractions)
    df.iloc[:,1]=df.iloc[:,1].swifter.apply(clean_text)
    df.iloc[:,1]=df.iloc[:,1].swifter.apply(lambda x: x.replace('"', '').replace("\n", " ").replace("\\",""))
    df.iloc[:,1]=df.iloc[:,1].swifter.apply(lambda text: textacy.preprocess_text(
        text, no_currency_symbols=True,no_urls=True,no_emails=True,no_phone_numbers=True,no_numbers=True))
    df.iloc[:,1] = df.iloc[:,1].swifter.apply(lambda text: " ".join(
        [word for word in simple_preprocess(text) if word not in stop_set]).strip())
    return df

def soft_preprocess(df):
    """Lightly normalise the text in the second column of `df`.

    The second column (``df.iloc[:, 1]``) is rewritten in place by applying,
    in order: ``ftfy.fix_text``, ``clean_text``, a quote/newline rewrite
    (double quotes become single quotes, newlines become spaces) and
    ``textacy.preprocess_text`` with the ``no_currency_symbols``,
    ``no_urls``, ``no_emails``, ``no_phone_numbers`` and ``no_numbers`` flags
    enabled. The same DataFrame object is returned.
    """
    def requote(raw):
        return raw.replace('"', "'").replace("\n", " ")

    def normalise(raw):
        return textacy.preprocess_text(
            raw,
            no_currency_symbols=True,
            no_urls=True,
            no_emails=True,
            no_phone_numbers=True,
            no_numbers=True,
        )

    # Each step runs over the whole column before the next one starts.
    for step in (ftfy.fix_text, clean_text, requote, normalise):
        df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(step)
    return df

def df_to_hedwig_tsv(df, dsname, outfilename, num_labels_in_col, hard_preprocess=True, label_cols=[0], text_col=1, random_state=None):
    """Preprocess `df`, build a label string per row and write a shuffled TSV.

    The text in the second column is cleaned by the module-level
    ``hard_preprocess`` or ``soft_preprocess`` function (both rewrite that
    column of `df` in place). The values of `label_cols` are concatenated
    into one string per row, stored in the first column of `df`, and then
    left-padded with "0" to `num_labels_in_col` characters. The resulting
    two-column frame is shuffled and written as ``label<TAB>text`` rows to
    ``PATH_TO_DATASETS / dsname / outfilename``.

    Args:
        df: DataFrame with labels in the first column(s) and text in the
            second column.
        dsname: dataset sub-folder under ``PATH_TO_DATASETS``.
        outfilename: name of the TSV file to write inside that folder.
        num_labels_in_col: minimum width of the label string; shorter label
            strings are left-padded with "0".
        hard_preprocess: if truthy use the ``hard_preprocess`` function,
            otherwise ``soft_preprocess``.
        label_cols: positional indices of the columns whose values are
            concatenated into the label string.
        text_col: currently unused; the text is always read from the second
            column.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced.

    Returns:
        The shuffled two-column DataFrame that was written to disk.
    """
    def to_tsv(outfpath, labels, texts):
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(outfpath, 'w', newline='', encoding='utf-8') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            for label, text in zip(labels, texts):
                writer.writerow([label, text])

    # The boolean parameter `hard_preprocess` shadows the module-level
    # function of the same name, so calling `hard_preprocess(df)` here would
    # call a bool. Fetch the function from the module namespace instead.
    preprocess = globals()["hard_preprocess"] if hard_preprocess else soft_preprocess
    df = preprocess(df)

    # `.iloc` selects the label columns by position, matching the positional
    # access used for the text column.
    df.iloc[:, 0] = df.swifter.apply(
        lambda row: ''.join(str(lbl) for lbl in row.iloc[label_cols]), axis=1)
    # Copy the two-column slice so the assignment below does not write into
    # a view of the caller's frame.
    df = df.iloc[:, [0, 1]].copy()
    # Left-pad with zeros; strings already at least that long are unchanged.
    df.iloc[:, 0] = df.iloc[:, 0].astype('str').str.rjust(num_labels_in_col, '0')

    dspath = PATH_TO_DATASETS/dsname
    outfpath = dspath/outfilename
    df = df.sample(frac=1.0, random_state=random_state)
    to_tsv(outfpath, df.iloc[:, 0].tolist(), df.iloc[:, 1].tolist())
    return df

In [29]:
# Load the raw e-commerce patent export. The path is built from
# PATH_TO_DATASETS so the dataset root is defined in exactly one place.
df = pd.read_csv(PATH_TO_DATASETS / 'SST-2' / 'ecommerce.csv')
df.head()

Unnamed: 0,AIPS Patent Id,Publication Number,Title,Abstract,All Claims,All CP Classifications,Label,Predicted Label,Confidence
0,172609,US09120015B2,Portable electronic device,"An electronic device (1) has hold parts (1R, 1...","1. A portable electronic device, comprising: ...",A63F13/06 A63F13 A63F13/214 A63F13 A63F13/92 A...,0,0,0.333889
1,198486,US07210620B2,System for facilitating online electronic tran...,The invention provides systems and methods wit...,1. A payment system for facilitating online pu...,G06Q30/00 G06Q30 G06Q20/40 G06Q20,1,1,0.967488
2,198519,US08731700B2,Device for determining build-to-order producti...,A build-to-order production process is determi...,1. An apparatus for determining a make-to-orde...,G05B19/41865 G05B19 G05B19/31389 G05B19 Y02P90...,0,1,0.593787
3,198544,US08736885B2,Multiproduct printing workflow system with dyn...,"A method for end-to-end printing, including en...",1. A computer-implemented method for end-to-en...,G06Q10/06 G06Q10,0,0,0.443172
4,172664,US07962050B2,System and method for controlling ordering of ...,A system and method for managing customer orde...,1. A services manager system associated with a...,G03G15/55 G03G15 G03G15/5079 G03G15 G03G15/553...,0,0,0.424413


In [30]:
# Keep only the text fields, the classification codes and the label, and
# drop every row whose Label equals 2.
# NOTE(review): presumably label 2 is outside the binary task - confirm.
kept_columns = ['Title', 'Abstract', 'All Claims', 'All CP Classifications', 'Label']
df = df.loc[df['Label'] != 2, kept_columns]
df.head()

Unnamed: 0,Title,Abstract,All Claims,All CP Classifications,Label
0,Portable electronic device,"An electronic device (1) has hold parts (1R, 1...","1. A portable electronic device, comprising: ...",A63F13/06 A63F13 A63F13/214 A63F13 A63F13/92 A...,0
1,System for facilitating online electronic tran...,The invention provides systems and methods wit...,1. A payment system for facilitating online pu...,G06Q30/00 G06Q30 G06Q20/40 G06Q20,1
2,Device for determining build-to-order producti...,A build-to-order production process is determi...,1. An apparatus for determining a make-to-orde...,G05B19/41865 G05B19 G05B19/31389 G05B19 Y02P90...,0
3,Multiproduct printing workflow system with dyn...,"A method for end-to-end printing, including en...",1. A computer-implemented method for end-to-en...,G06Q10/06 G06Q10,0
4,System and method for controlling ordering of ...,A system and method for managing customer orde...,1. A services manager system associated with a...,G03G15/55 G03G15 G03G15/5079 G03G15 G03G15/553...,0


In [31]:
# Build one document per patent from title, abstract and claims.
# A new frame is constructed instead of adding columns to the filtered
# slice, which avoids pandas' SettingWithCopyWarning. Missing fields are
# treated as empty strings so a single NaN cannot turn the whole text into
# NaN (which would break the later `str.split`).
df = pd.DataFrame({
    'text': (
        ' Title: ' + df['Title'].fillna('')
        + ' Abstract: ' + df['Abstract'].fillna('')
        + ' Claims: ' + df['All Claims'].fillna('')
    ),
    'label': df['Label'],
})
df.head()

Unnamed: 0,text,label
0,Title: Portable electronic device Abstract: ...,0
1,Title: System for facilitating online electro...,1
2,Title: Device for determining build-to-order ...,0
3,Title: Multiproduct printing workflow system ...,0
4,Title: System and method for controlling orde...,0


In [32]:
# Plain Python lists of the documents and their labels, in row order.
text_list, labels = df['text'].tolist(), df['label'].tolist()

In [33]:
# Empty accumulators for the chunking step.
rows, newlabels = [], []

In [34]:


# Split every document into consecutive chunks of at most CHUNK_TOKENS
# whitespace-delimited tokens; each chunk keeps its document's label.
CHUNK_TOKENS = 200

# Reset the accumulator here so re-running this cell does not append the
# same chunks a second time.
rows = []
for line, lbl in zip(text_list, labels):
    toks = line.split()
    rows.append([(toks[i:i + CHUNK_TOKENS], lbl) for i in range(0, len(toks), CHUNK_TOKENS)])
    

In [35]:
# Flatten the per-document chunk lists into two parallel lists: the chunk
# text (tokens re-joined with single spaces) and the chunk's label.
flat_chunks = [(' '.join(sent).strip(), cls) for row in rows for sent, cls in row]
parsed_rows = [chunk_text for chunk_text, _ in flat_chunks]
parsed_labels = [chunk_label for _, chunk_label in flat_chunks]
print(len(parsed_rows), len(parsed_labels))

9471 9471


In [36]:
# One row per chunk: label first, text second.
newdf = pd.DataFrame(list(zip(parsed_labels, parsed_rows)), columns=['label', 'text'])
newdf.head()

Unnamed: 0,label,text
0,0,Title: Portable electronic device Abstract: An...
1,0,side of the touch panel; at least a second ope...
2,0,part of the area between the first operative s...
3,1,Title: System for facilitating online electron...
4,1,computer is configured to receive a purchase r...


In [37]:
def _normalise_chunk(raw):
    """Fix encoding, strip invalid characters, requote and lower-case one chunk."""
    cleaned = clean_text(ftfy.fix_text(raw))
    return cleaned.replace('"', "'").replace("\n", " ").lower()

# Clean the text column (second column) of every chunk in place.
newdf.iloc[:, 1] = newdf.iloc[:, 1].apply(_normalise_chunk)
newdf.head()

Unnamed: 0,label,text
0,0,title: portable electronic device abstract: an...
1,0,side of the touch panel; at least a second ope...
2,0,part of the area between the first operative s...
3,1,title: system for facilitating online electron...
4,1,computer is configured to receive a purchase r...


In [38]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/dev/test partition is identical on every run.
SPLIT_SEED = 42

# NOTE(review): the split is made over chunks, so chunks of the same patent
# can land in different partitions - confirm whether that is acceptable.
train, test = train_test_split(newdf, random_state=SPLIT_SEED)
train, dev = train_test_split(train, random_state=SPLIT_SEED)
print(len(train), len(test), len(dev))

5327 2368 1776


In [39]:
def to_tsv(outfpath, labels, texts):
    """Write paired labels and texts to `outfpath` as tab-separated rows.

    Args:
        outfpath: path of the file to create (an existing file is overwritten).
        labels: iterable of labels, one per row.
        texts: iterable of texts, paired positionally with `labels`; pairing
            stops at the shorter of the two iterables.
    """
    # UTF-8 is set explicitly so non-ASCII text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(outfpath, 'w', newline='', encoding='utf-8') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for label, text in zip(labels, texts):
            writer.writerow([label, text])

In [40]:

# Write the dev split as label<TAB>text rows to <datasets>/SST-2/dev.tsv.
dspath = PATH_TO_DATASETS / "SST-2"
outfpath = dspath / "dev.tsv"
dev_labels, dev_texts = dev.iloc[:, 0].tolist(), dev.iloc[:, 1].tolist()
to_tsv(outfpath, dev_labels, dev_texts)

In [41]:
# Write the train split as label<TAB>text rows to <datasets>/SST-2/train.tsv.
dspath = PATH_TO_DATASETS / "SST-2"
outfpath = dspath / "train.tsv"
train_labels, train_texts = train.iloc[:, 0].tolist(), train.iloc[:, 1].tolist()
to_tsv(outfpath, train_labels, train_texts)

In [42]:
# Write the test split as label<TAB>text rows to <datasets>/SST-2/test.tsv.
dspath = PATH_TO_DATASETS / "SST-2"
outfpath = dspath / "test.tsv"
test_labels, test_texts = test.iloc[:, 0].tolist(), test.iloc[:, 1].tolist()
to_tsv(outfpath, test_labels, test_texts)