In [2]:
import argparse
import json
import os
import random
import re
import string
from glob import glob
from typing import Any, Dict, List, Tuple, Union

import spacy
from spacy.tokens import Doc, Span, Token
from nltk.lm import MLE, KneserNeyInterpolated, Laplace, WittenBellInterpolated
from nltk.lm.preprocessing import everygrams, padded_everygram_pipeline
from nltk.util import flatten
from tqdm import tqdm

import utils

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# GPU setup
# spaCy does not move data that has already been allocated on the CPU, so the
# GPU has to be activated before any pipeline is loaded. An explicit check is
# used instead of `assert` so the call is not stripped under `python -O`.
if not spacy.prefer_gpu():
    raise AssertionError("Cannot run with gpu")

# Shared spaCy pipeline, loaded with the "ner" and "textcat" components disabled.
NLP = spacy.load('en_core_web_sm', disable=["ner", "textcat"])


# Type definitions
# Any of the nltk language-model classes imported by this notebook.
Model = Union[MLE, Laplace, KneserNeyInterpolated, WittenBellInterpolated]

# Rule definitions
# Every ASCII punctuation character except '_'.
# NOTE(review): '_' presumably marks the blank in the questions -- confirm.
# The misspelled name is kept because other cells reference it.
PUNCTUATON = set(string.punctuation) - {'_'}

# Lower-cased tokens that legitimately contain dots and must not be split on '.'.
EXCEPTION_DOT = {"a.m.", "p.m.", "e.g.",
                 "mr.", "ms.", "mrs.", "dr.", "st.", "u.s."}

# Part-of-speech tags grouped for lemma handling (not used in the cells shown here).
LEMMA_GROUP = {"VERB", "NOUN", "PART"}

# Debug variable
DEBUG_ALL_ZERO = 0

In [4]:
from collections import Counter

# Count how often each candidate option appears across all test questions and
# write the counts, most frequent first, to tmp.txt (one "option count" per line).
tmp = []
# Sorted for reproducibility: glob() returns files in arbitrary order, and
# Counter.most_common() orders equal counts by first occurrence.
test_list = sorted(glob(os.path.join("./hw3/test", "*.json")))
print("- Start Solving")
for file in test_list:
    # Read as UTF-8 explicitly instead of relying on the platform default,
    # matching the encoding used for the output file below.
    with open(file, 'r', encoding='utf-8') as F:
        question: dict = json.load(F)
    # Each value under 'options' is a list of candidate strings.
    for v in question['options'].values():
        tmp.extend(v)

c = Counter(tmp)
with open('tmp.txt', 'w', encoding='utf-8') as G:
    G.writelines(f"{k} {v}\n" for k, v in c.most_common())





- Start Solving


In [5]:
# Patterns used by preprocess(); compiled once because preprocess() is called
# for every input line.
_DIGITS_RE = re.compile(r'\d+')                 # runs of digits
_DISALLOWED_RE = re.compile(r"[^\w' ,._-]")     # anything but word chars, space and ' , . _ -
_DOT_RUN_RE = re.compile(r'(\.){2,}')           # two or more consecutive '.'
_QUOTE_RUN_RE = re.compile(r'(\'){2,}')         # two or more consecutive "'"
_DASH_RUN_RE = re.compile(r'(-){2,}')           # two or more consecutive '-'


def preprocess(context: str, testing: bool = False) -> List[List[Union[str, Any]]]:
    '''
    Clean raw text and tokenize it into lower-cased tokens, one list per sentence.

    Cleaning steps, applied in this order:
      1. Replace every run of digits with a space.
      2. Replace every character that is not a word character, a space or
         one of ' , . _ - with a space.
      3. Replace runs of two or more '.', then "'", then '-' with a space.
      4. Strip leading and trailing '-' from the whole text.

    The cleaned text is parsed with the module-level spaCy pipeline ``NLP``.
    For each sentence, whitespace tokens and tokens whose text is in
    ``PUNCTUATON`` are dropped. Tokens listed in ``EXCEPTION_DOT`` are kept
    as-is; every other token is split on '.', and each non-empty piece is kept
    unless it is a single character other than "i", "a" or "_".

    Args:
        context: Raw input text.
        testing: Currently unused by this function; kept so existing callers
            that pass it continue to work.

    Returns:
        A list with one entry per sentence, each a list of cleaned tokens.
    '''
    result: List[List[Union[str, Any]]] = []

    context = _DIGITS_RE.sub(" ", context)
    context = _DISALLOWED_RE.sub(" ", context)
    context = _DOT_RUN_RE.sub(' ', context)
    context = _QUOTE_RUN_RE.sub(' ', context)
    context = _DASH_RUN_RE.sub(' ', context)
    context = context.strip('-')
    docs = NLP(context)

    for sent in docs.sents:
        tokens = [tok.lower_ for tok in sent
                  if not tok.is_space and tok.text not in PUNCTUATON]

        clean: List[str] = []
        for token in tokens:
            if token in EXCEPTION_DOT:
                # Known abbreviation: keep its dots intact.
                clean.append(token)
                continue
            # Some tokens still contain '.', so split on it and drop the dots.
            for piece in token.split('.'):
                if not piece:
                    continue
                # Drop stray single characters except "i", "a" and "_".
                if len(piece) == 1 and piece not in {"i", "a", "_"}:
                    continue
                # Expand a few contractions; other clitics such as "'re"
                # are left unchanged.
                piece = (piece.replace("'m", 'am')
                              .replace("n't", "not")
                              .replace("'ve", "have"))
                clean.append(piece)
        result.append(clean)

    return result


In [6]:
result: List[str] = []
print("- Start Loading External Training Set [BLOGS]")

# Preprocess the blog corpus line by line and write one sentence per output
# line, tokens separated by single spaces.
# Both files are managed by the same `with`, so the output is flushed and
# closed even if this long-running loop is interrupted. The input is opened
# first so a missing input file does not truncate an existing output file.
with open("./en_US/en_US.blogs.txt", 'r', encoding="utf-8") as F, \
        open("./en_US/en_US_process.blogs.txt", 'w', encoding='utf-8') as fout:
    # The whole file is read up front so tqdm knows the total line count.
    Lines = F.readlines()
    for f in tqdm(Lines):
        for line in preprocess(f.strip()):
            fout.write(' '.join(line) + "\n")

- Start Loading External Training Set [BLOGS]


 18%|█▊        | 165854/899288 [13:48<1:01:05, 200.08it/s]


KeyboardInterrupt: 

In [None]:
# Spot-check how preprocess() handles contractions on two sample sentences.
for sample in ("They're so cute.", "I'm so cute."):
    print(preprocess(sample))

# Parse the first sample directly and display the text spaCy stored for it.
y = NLP("They're so cute.")
y.text

[['they', "'re", 'so', 'cute']]
[['i', 'am', 'so', 'cute']]


"They're so cute."