# DTM 정리 

# 0. 예제 문장 뽑기

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
import getpass
import pandas as pd
from numpy import dot
from numpy.linalg import norm

def avoid_null(data,header):
  """Return column ``header`` of ``data`` with missing values replaced by ''.

  Args:
    data: pandas DataFrame that contains the column ``header``.
    header: label of the column to clean.

  Returns:
    A new Series (same index and name as ``data[header]``) in which every
    missing value has been replaced by the empty string. ``data`` itself is
    left unmodified.
  """
  # fillna() already returns a new Series, so there is no need to copy the
  # whole DataFrame just to protect the caller's data.
  return data[header].fillna('')

def cos_sim(A,B):
  """Return the cosine similarity between vectors ``A`` and ``B``.

  Args:
    A: first vector (anything ``numpy.dot`` / ``numpy.linalg.norm`` accept).
    B: second vector, same length as ``A``.

  Returns:
    dot(A, B) / (norm(A) * norm(B)), or 0.0 when either vector has zero
    norm (the similarity is undefined there; returning 0.0 avoids the
    division by zero that would otherwise produce nan and a RuntimeWarning).
  """
  denominator = norm(A)*norm(B)
  if denominator == 0:
    return 0.0
  return dot(A,B)/denominator

def tfidf(df):
  """Fit a TF-IDF model on the 'link' column of ``df`` and return the matrix.

  Missing entries in the column are replaced by empty strings (through
  ``avoid_null``) before vectorising.

  Args:
    df: DataFrame with a 'link' column of text.
      NOTE(review): the driver code prints these values as paper titles,
      so the column presumably holds titles rather than URLs - confirm.

  Returns:
    The result of ``TfidfVectorizer().fit_transform`` on that column.
  """
  documents = avoid_null(df,'link')
  vectorizer = TfidfVectorizer()
  return vectorizer.fit_transform(documents)

def top10(data,q_num,top_n=10):
  """Return the row positions of the documents most similar to row ``q_num``.

  Similarity is the linear kernel (dot product) between the query row and
  every row of ``data``.

  Args:
    data: 2-D document-term matrix (dense or sparse), one row per document.
    q_num: position of the query row in ``data``.
    top_n: maximum number of neighbours to return (default 10).

  Returns:
    A list of at most ``top_n`` integer row positions, ordered from most to
    least similar. The query row itself is never included; ties keep
    ascending row order.

  Raises:
    IndexError: if ``q_num`` is outside the rows of ``data``.
  """
  # Only the query row is needed, so compute a 1 x n kernel instead of the
  # full n x n similarity matrix on every call.
  scores = linear_kernel(data[[q_num]],data).ravel()
  n_docs = len(scores)
  own_position = q_num % n_docs  # normalise a negative q_num
  # Exclude the query explicitly: relying on it being ranked first breaks
  # when another row ties with it or when the query row is all zeros.
  candidates = [idx for idx in range(n_docs) if idx != own_position]
  candidates.sort(key=lambda idx: scores[idx],reverse=True)
  return candidates[:top_n]

# Load the paper list, keep the first row for each distinct 'link' value and
# build the TF-IDF matrix for the remaining rows.
df = pd.read_csv('../data/recumbent_fin_327.csv',header=[0])
u_df = df.drop_duplicates(["link"],keep="first")
data_fit = tfidf(u_df)

# drop_duplicates() keeps the original row labels, so the labels of u_df have
# gaps, while the rows of data_fit are numbered by position. Address the
# titles by position (.iloc) so that title i and matrix row i always match
# and a dropped label cannot raise KeyError.
titles = u_df["link"]
# Show the first ten documents (fewer if the frame is shorter).
for i in range(min(10,len(u_df))):
  print(i,'/',len(u_df))
  print(titles.iloc[i])
  fit_10 = titles.iloc[top10(data_fit,i)]
  print(str(i),"번 논문 제목과 유사한 제목을 가진 논문 목록",fit_10)

0 / 284
Effects of passive pedaling exercise on the intracortical inhibition in subjects with spinal cord injury
0 번 논문 제목과 유사한 제목을 가진 논문 목록 7      The effect of active pedaling combined with el...
269    Chapter 7: Advances in the management of spina...
52     Stimulation of Shank Muscles During Functional...
46     Alternating stimulation of synergistic muscles...
55     Upper limb effort does not increase maximal vo...
185    Physical therapy treatment of a pediatric pati...
127    Effects of Different Sleeping Postures on Intr...
320    Kinesiological research: The use of surface el...
103    Effects of supine and lateral recumbent positi...
283    Effects of a mechanical pain stimulus on erect...
Name: link, dtype: object
1 / 284
Cardiovascular and Metabolic Responses During Functional Electric Stimulation Cycling at Different Cadences
1 번 논문 제목과 유사한 제목을 가진 논문 목록 52     Stimulation of Shank Muscles During Functional...
46     Alternating stimulation of synergistic muscles...
35   

In [10]:
# Six example titles used by the cells below.
sample_sentences = [
  "Cardiovascular and Metabolic Responses During Functional Electric "
  "Stimulation Cycling at Different Cadences",
  "Alternating stimulation of synergistic muscles during functional "
  "electrical stimulation cycling improves endurance in persons with "
  "spinal cord injury",
  "Comparison of Cardiovascular Responses Between Upright and Recumbent "
  "Cycle Ergometers in Healthy Young Volunteers Performing Low-Intensity "
  "Exercise: Assessment of Reliability of the Oxygen Uptake Calculated "
  "by Using the ACSM Metabolic Equation",
  "Cycling exercise with functional electrical stimulation improves "
  "postural control in stroke patients",
  "An improved design of home cycling system via functional electrical "
  "stimulation for paraplegics",
  "Metabolic efficiency of volitional and electrically stimulated "
  "cycling in able-bodied subjects",
]
# All sample titles merged into one comma-separated string.
bow_senetneces = ','.join(sample_sentences)

## 단순 단어 맵핑

In [11]:
# Count how often each space-separated token occurs across all sample
# sentences. Tokens are compared verbatim, so case and attached punctuation
# make distinct keys.
space = ' '
word_map_dict = {}
for sentence in sample_sentences:
  for word in sentence.split(space):
    word_map_dict[word] = word_map_dict.get(word,0)+1

print('word map:',word_map_dict)

word map: {'Cardiovascular': 2, 'and': 3, 'Metabolic': 3, 'Responses': 2, 'During': 1, 'Functional': 1, 'Electric': 1, 'Stimulation': 1, 'Cycling': 2, 'at': 1, 'Different': 1, 'Cadences': 1, 'Alternating': 1, 'stimulation': 4, 'of': 6, 'synergistic': 1, 'muscles': 1, 'during': 1, 'functional': 3, 'electrical': 3, 'cycling': 3, 'improves': 2, 'endurance': 1, 'in': 4, 'persons': 1, 'with': 2, 'spinal': 1, 'cord': 1, 'injury': 1, 'Comparison': 1, 'Between': 1, 'Upright': 1, 'Recumbent': 1, 'Cycle': 1, 'Ergometers': 1, 'Healthy': 1, 'Young': 1, 'Volunteers': 1, 'Performing': 1, 'Low-Intensity': 1, 'Exercise:': 1, 'Assessment': 1, 'Reliability': 1, 'the': 2, 'Oxygen': 1, 'Uptake': 1, 'Calculated': 1, 'by': 1, 'Using': 1, 'ACSM': 1, 'Equation': 1, 'exercise': 1, 'postural': 1, 'control': 1, 'stroke': 1, 'patients': 1, 'An': 1, 'improved': 1, 'design': 1, 'home': 1, 'system': 1, 'via': 1, 'for': 1, 'paraplegics': 1, 'efficiency': 1, 'volitional': 1, 'electrically': 1, 'stimulated': 1, 'able-b

### 단순 단어 맵핑 단점 : 각 문장에 어떤 단어가 등장했는지 알 수 없다.
## BoW (Bag of Words)

### BoW 방식은 단순 단어 맵핑의 단점을 보완해 준다.
1. 전처리 (불용어 제거)
2. 단어마다 고유한 인덱스 값 생성
3. 각 문장마다 단어의 등장 횟수를 기록한 벡터 생성

In [13]:
## Counting sentence by sentence is not accurate: every fit_transform call
## refits the vectorizer on a single sentence. -> Join the sentences in the
## list and run it again on the combined text.
vect = CountVectorizer(stop_words="english")
for sentence in sample_sentences:
  sentence_counts = vect.fit_transform([sentence])
  print('bag of words vector :',sentence_counts.toarray())
  print('vocabulary :',vect.vocabulary_)
print('-------------------------------------------------------------------------------')
# Refit once more on the single comma-joined string of all sentences.
joined_counts = vect.fit_transform([bow_senetneces])
print('bow of joined_sentences vector :',joined_counts.toarray())
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1 1 1 1 1 1]]
vocabulary : {'cardiovascular': 1, 'metabolic': 6, 'responses': 7, 'functional': 5, 'electric': 4, 'stimulation': 8, 'cycling': 2, 'different': 3, 'cadences': 0}
bag of words vector : [[1 1 1 1 1 1 1 1 1 1 1 2 1]]
vocabulary : {'alternating': 0, 'stimulation': 11, 'synergistic': 12, 'muscles': 8, 'functional': 5, 'electrical': 3, 'cycling': 2, 'improves': 6, 'endurance': 4, 'persons': 9, 'spinal': 10, 'cord': 1, 'injury': 7}
bag of words vector : [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
vocabulary : {'comparison': 4, 'cardiovascular': 3, 'responses': 17, 'upright': 18, 'recumbent': 15, 'cycle': 5, 'ergometers': 7, 'healthy': 9, 'young': 22, 'volunteers': 21, 'performing': 14, 'low': 11, 'intensity': 10, 'exercise': 8, 'assessment': 1, 'reliability': 16, 'oxygen': 13, 'uptake': 19, 'calculated': 2, 'using': 20, 'acsm': 0, 'metabolic': 12, 'equation': 6}
bag of words vector : [[1 1 1 1 1 1 1 1 1 1]]
vocabulary : {'cycling': 1, 'exercis

In [17]:
# Build the document-term matrix: one row per sample sentence, one column
# per vocabulary term (English stop words removed).
vect = CountVectorizer(stop_words="english")
sample_dtm = vect.fit_transform(sample_sentences).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# tolist() keeps feature_names a plain list of str, as before.
if hasattr(vect,"get_feature_names_out"):
  feature_names = vect.get_feature_names_out().tolist()
else:
  feature_names = vect.get_feature_names()
sample_dtm_df = pd.DataFrame(sample_dtm,columns = feature_names)
sample_dtm_df

Unnamed: 0,cycling,synergistic,cycle,exercise,oxygen,functional,muscles,subjects,stimulation,stroke,...,performing,reliability,design,acsm,able,cardiovascular,cadences,calculated,patients,paraplegics
0,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,0,0,0
2,0,1,0,1,0,0,1,1,1,0,...,0,0,0,0,1,1,1,0,1,1
3,0,0,0,0,0,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [16]:
# Show the ordered feature names next to the term -> column-index mapping.
print('feature_names : ',feature_names)
print('vocabulary :',vect.vocabulary_)
# Check whether both views contain exactly the same set of terms.
vocabulary_terms = set(vect.vocabulary_)
print(set(feature_names) == vocabulary_terms)

feature_names :  ['able', 'acsm', 'alternating', 'assessment', 'bodied', 'cadences', 'calculated', 'cardiovascular', 'comparison', 'control', 'cord', 'cycle', 'cycling', 'design', 'different', 'efficiency', 'electric', 'electrical', 'electrically', 'endurance', 'equation', 'ergometers', 'exercise', 'functional', 'healthy', 'home', 'improved', 'improves', 'injury', 'intensity', 'low', 'metabolic', 'muscles', 'oxygen', 'paraplegics', 'patients', 'performing', 'persons', 'postural', 'recumbent', 'reliability', 'responses', 'spinal', 'stimulated', 'stimulation', 'stroke', 'subjects', 'synergistic', 'upright', 'uptake', 'using', 'volitional', 'volunteers', 'young']
vocabulary : {'cardiovascular': 7, 'metabolic': 31, 'responses': 41, 'functional': 23, 'electric': 16, 'stimulation': 44, 'cycling': 12, 'different': 14, 'cadences': 5, 'alternating': 2, 'synergistic': 47, 'muscles': 32, 'electrical': 17, 'improves': 27, 'endurance': 19, 'persons': 37, 'spinal': 42, 'cord': 10, 'injury': 28, 'com