## patent-keyword vector 생성

In [13]:
# 필요한 라이브러리 import

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer

### 1. 수집된 patent 에서 keyword 추출 전처리

In [14]:
# Load the raw patent data
patent = pd.read_excel("../data/raw/patent_1019.xlsx")

# Keep the patent number, the text fields (title, abstract) and the AAFC column
pat_num = patent['patent_number']
title = patent['patent_title']
abstract = patent['patent_abstract']
aafc = patent['AAFC']

# Combine title and abstract into a single string per patent.
# A missing title or abstract would otherwise turn the whole concatenation
# into NaN (a float), which breaks the string processing applied to `text`
# afterwards, so missing values are treated as empty strings here.
text = title.fillna('').astype(str) + '. ' + abstract.fillna('').astype(str)

In [15]:
# Show one combined title + abstract entry as a sanity check
text[10]

'Healthcare assurance system. Systems and methods for healthcare assurance system are provided. A first set of confidential health information for an identified patient may be accessed, may be derived from a first data source, and may include first medical indicia corresponding to an indication of a first health condition and/or an indication of a first healthcare service. A second set of confidential health information may be accessed, may be derived from a second data source, and may include second medical indicia corresponding to an indication of a second health condition and/or an indication of a second healthcare service. Healthcare rules that include criteria indicating comorbidity conditions may be accessed. The first and second sets of confidential health information may be correlated to the healthcare rules. A comorbidity condition may be identified based on the first medical indicia, the second medical indicia, and the correlating. A prompt regarding the comorbidity condition

In [16]:
## 1. Lowercase the text and replace every character that is not an English letter with a space
non_letter = re.compile('[^a-zA-Z]')
new1 = [non_letter.sub(' ', doc.lower()) for doc in text]

new1[10]

'healthcare assurance system  systems and methods for healthcare assurance system are provided  a first set of confidential health information for an identified patient may be accessed  may be derived from a first data source  and may include first medical indicia corresponding to an indication of a first health condition and or an indication of a first healthcare service  a second set of confidential health information may be accessed  may be derived from a second data source  and may include second medical indicia corresponding to an indication of a second health condition and or an indication of a second healthcare service  healthcare rules that include criteria indicating comorbidity conditions may be accessed  the first and second sets of confidential health information may be correlated to the healthcare rules  a comorbidity condition may be identified based on the first medical indicia  the second medical indicia  and the correlating  a prompt regarding the comorbidity condition

In [17]:
## 2. Remove NLTK stop words and tokens of two characters or fewer
stop_words = stopwords.words('english')
# The documents in new1 are already lowercased, so the mixed-case placeholder
# below can never match a token; the lowercase form is added so that the
# placeholder is actually filtered out.
stop_words.append('NoClaimText')
stop_words.append('noclaimtext')

# Set copy for constant-time membership tests inside the loop
stop_word_set = set(stop_words)

new2 = [
    " ".join(
        token
        for token in doc.split()
        if len(token) > 2 and token not in stop_word_set
    )
    for doc in new1
]

new2[10]

'healthcare assurance system systems methods healthcare assurance system provided first set confidential health information identified patient may accessed may derived first data source may include first medical indicia corresponding indication first health condition indication first healthcare service second set confidential health information may accessed may derived second data source may include second medical indicia corresponding indication second health condition indication second healthcare service healthcare rules include criteria indicating comorbidity conditions may accessed first second sets confidential health information may correlated healthcare rules comorbidity condition may identified based first medical indicia second medical indicia correlating prompt regarding comorbidity condition may provided'

In [18]:
## 3. Lemmatize every token, then POS-tag the lemmatized tokens of each document
n = WordNetLemmatizer()

# One list of (lemma, POS tag) pairs per document
new3 = [
    pos_tag([n.lemmatize(token) for token in word_tokenize(doc)])
    for doc in new2
]

new3[10]

[('healthcare', 'NN'),
 ('assurance', 'NN'),
 ('system', 'NN'),
 ('system', 'NN'),
 ('method', 'NN'),
 ('healthcare', 'NN'),
 ('assurance', 'NN'),
 ('system', 'NN'),
 ('provided', 'VBD'),
 ('first', 'RB'),
 ('set', 'VBN'),
 ('confidential', 'JJ'),
 ('health', 'NN'),
 ('information', 'NN'),
 ('identified', 'VBN'),
 ('patient', 'NN'),
 ('may', 'MD'),
 ('accessed', 'VB'),
 ('may', 'MD'),
 ('derived', 'VB'),
 ('first', 'RB'),
 ('data', 'NNS'),
 ('source', 'NN'),
 ('may', 'MD'),
 ('include', 'VB'),
 ('first', 'JJ'),
 ('medical', 'JJ'),
 ('indicia', 'NN'),
 ('corresponding', 'VBG'),
 ('indication', 'NN'),
 ('first', 'RB'),
 ('health', 'NN'),
 ('condition', 'NN'),
 ('indication', 'NN'),
 ('first', 'RB'),
 ('healthcare', 'JJ'),
 ('service', 'NN'),
 ('second', 'JJ'),
 ('set', 'VBN'),
 ('confidential', 'JJ'),
 ('health', 'NN'),
 ('information', 'NN'),
 ('may', 'MD'),
 ('accessed', 'VB'),
 ('may', 'MD'),
 ('derived', 'VB'),
 ('second', 'JJ'),
 ('data', 'NNS'),
 ('source', 'NN'),
 ('may', 'MD'),
 

In [19]:
## 4. Keep only the nouns (tokens tagged NN or NNP)
noun_tags = ('NN', 'NNP')
new4 = [
    [word for word, tag in tagged_doc if tag in noun_tags]
    for tagged_doc in new3
]

new4[10]

['healthcare',
 'assurance',
 'system',
 'system',
 'method',
 'healthcare',
 'assurance',
 'system',
 'health',
 'information',
 'patient',
 'source',
 'indicia',
 'indication',
 'health',
 'condition',
 'indication',
 'service',
 'health',
 'information',
 'source',
 'indicia',
 'health',
 'condition',
 'indication',
 'healthcare',
 'service',
 'healthcare',
 'rule',
 'criterion',
 'comorbidity',
 'condition',
 'health',
 'information',
 'healthcare',
 'rule',
 'comorbidity',
 'condition',
 'indicia',
 'indicia',
 'comorbidity',
 'condition']

In [20]:
## 5. Join each document's nouns back into one space-separated string
preprocessed_text = [" ".join(nouns) for nouns in new4]

preprocessed_text[10]

'healthcare assurance system system method healthcare assurance system health information patient source indicia indication health condition indication service health information source indicia health condition indication healthcare service healthcare rule criterion comorbidity condition health information healthcare rule comorbidity condition indicia indicia comorbidity condition'

In [21]:
## 6. Merge all documents and count how often each term occurs (term frequency)

# All preprocessed documents joined into one string
join_text = ' '.join(preprocessed_text)

# term -> frequency dict.
# split() without an argument is used on purpose: a document that is empty
# after preprocessing leaves consecutive spaces in join_text, and split(' ')
# would then count an empty-string "term".
termfreq = dict()
for term in join_text.split():
    termfreq[term] = termfreq.get(term, 0) + 1

# One row per term, with a single 'freq' column
termfreq_df = pd.DataFrame(termfreq, index=['freq']).T

# termfreq_df  (uncomment to inspect the full table)

In [23]:
# Keep only the terms whose frequency is at or above the 95th percentile (top 5% by TF)
top5 = termfreq_df['freq'].quantile(0.95)
is_frequent = termfreq_df['freq'] >= top5
word_df = termfreq_df.loc[is_frequent]

# Save the selected terms and keep them as a plain list
word_df.to_csv("../data/termfreq/top5_1019.csv")
word_list = word_df.index.tolist()

word_df

Unnamed: 0,freq
healthcare,1076
information,5551
analysis,987
display,1509
system,11777
...,...
contrast,227
dispenser,185
pixel,158
stimulation,177


In [24]:
# Additional stop words to drop on top of the NLTK list
stopword_list = [
    'wherein',
    'pre',
    'non',
    'thereof',
    'herein',
]

In [25]:
## 7. Keep only the high-frequency terms selected above (word_list, i.e. the
##    terms at or above the 95th-percentile frequency), minus the additional stop words

# Build the lookup set once so each membership test is constant time
keep_words = set(word_list) - set(stopword_list)

# Every element of new4 is a single word produced by word_tokenize, so the
# tokens can be filtered directly; dropped tokens simply do not appear in the result.
tf_text = [
    [token for token in doc_tokens if token in keep_words]
    for doc_tokens in new4
]

tf_text[10]

['healthcare',
 'system',
 'system',
 'method',
 'healthcare',
 'system',
 'health',
 'information',
 'patient',
 'source',
 'indication',
 'health',
 'condition',
 'indication',
 'service',
 'health',
 'information',
 'source',
 'health',
 'condition',
 'indication',
 'healthcare',
 'service',
 'healthcare',
 'rule',
 'criterion',
 'condition',
 'health',
 'information',
 'healthcare',
 'rule',
 'condition',
 'condition']

In [26]:
# cleaned_text: join each document's filtered terms into one space-separated string
cleaned_text = [" ".join(terms) for terms in tf_text]

cleaned_text[10]

'healthcare system system method healthcare system health information patient source indication health condition indication service health information source health condition indication healthcare service healthcare rule criterion condition health information healthcare rule condition condition'

### 2. TF-IDF 값 계산

In [28]:
vect = CountVectorizer()
# Document-term count matrix
document_term_matrix = vect.fit_transform(cleaned_text)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on versions that do not have it yet.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# TF (Term Frequency): one row per document, one column per term
tf = pd.DataFrame(document_term_matrix.toarray(), columns=feature_names)

# IDF (Inverse Document Frequency), smoothed: log((D + 1) / (df + 1)) + 1
D = len(tf)                          # number of documents
df = tf.astype(bool).sum(axis=0)     # number of documents containing each term
idf = np.log((D + 1) / (df + 1)) + 1

# TF-IDF, with each row scaled to unit L2 norm
tfidf = tf * idf
row_norm = np.linalg.norm(tfidf, axis=1, keepdims=True)
# A document with no remaining terms has norm 0; dividing by it would fill the
# row with NaN, so such rows are left as all zeros instead.
row_norm[row_norm == 0] = 1.0
tfidf = tfidf / row_norm

In [29]:
# Display the TF-IDF table
tfidf

Unnamed: 0,access,accordance,account,acquisition,action,activity,adherence,administration,agent,alarm,...,unit,use,user,value,vector,vessel,video,view,volume,wireless
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.148168,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.113798,0.339754,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.090718,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6827,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.241478,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6828,0.0,0.106578,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6829,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6830,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


### 3. PatentNumber, FC(Forward Citation) 열 추가

In [30]:
# Put the patent number in the first column and add the AAFC column
tfidf.insert(loc=0, column='patent_number', value=pat_num)
tfidf = tfidf.assign(**{'AAFC': aafc})

# Write the resulting table to CSV without the index column
tfidf.to_csv("../data/textvector/tfidf.csv", index=False)