In [12]:
import pymongo as mg
import pandas as pd

In [13]:
client = mg.MongoClient(host='mongodb://localhost:27017')

In [14]:
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [15]:
database = client['db_NHIS']

In [21]:
collection = database['NSC2_M20']

In [22]:
cursor = collection.find()

In [23]:
list_M20 = list(cursor)

In [24]:
df_M20 = pd.DataFrame(list_M20)

In [25]:
df_M20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137163 entries, 0 to 137162
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   _id                 137163 non-null  object 
 1   RN_INDI             137163 non-null  int64  
 2   RN_KEY              137163 non-null  int64  
 3   RN_INST             137163 non-null  int64  
 4   MDCARE_STRT_DT      137163 non-null  int64  
 5   FORM_CD             137163 non-null  int64  
 6   MCARE_SUBJ_CD       137163 non-null  int64  
 7   SICK_SYM1           137163 non-null  object 
 8   OFIJ_TYPE           134428 non-null  object 
 9   OPRTN_YN            137163 non-null  int64  
 10  MDCARE_DD_CNT       137163 non-null  int64  
 11  VSHSP_DD_CNT        137163 non-null  int64  
 12  TOT_PRSC_DD_CNT     137163 non-null  int64  
 13  MCARE_RSLT_TYPE     133958 non-null  float64
 14  EDC_ADD_RT          137163 non-null  float64
 15  ED_RC_TOT_AMT       137163 non-nul

In [26]:
df_M20.columns

Index(['_id', 'RN_INDI', 'RN_KEY', 'RN_INST', 'MDCARE_STRT_DT', 'FORM_CD',
       'MCARE_SUBJ_CD', 'SICK_SYM1', 'OFIJ_TYPE', 'OPRTN_YN', 'MDCARE_DD_CNT',
       'VSHSP_DD_CNT', 'TOT_PRSC_DD_CNT', 'MCARE_RSLT_TYPE', 'EDC_ADD_RT',
       'ED_RC_TOT_AMT', 'EDC_SBA', 'EDC_INSUR_BRDN_AMT', 'STD_YYYY',
       'HSPTZ_PATH_TYPE', 'SICK_SYM2', 'SPCF_SYM_TYPE', 'FST_HSPTZ_DT'],
      dtype='object')

#### 목표변수 - 연속형
- 목표변수 : TOT_PRSC_DD_CNT 
- 설명변수 : MDCARE_DD_CNT, VSHSP_DD_CNT

#### Preprocessing 전처리

##### 목표변수와 설명변수 추출 

In [32]:
df_M20_extract = df_M20[['TOT_PRSC_DD_CNT', 'MDCARE_DD_CNT', 'VSHSP_DD_CNT' ]]
df_M20_extract[:2]

Unnamed: 0,TOT_PRSC_DD_CNT,MDCARE_DD_CNT,VSHSP_DD_CNT
0,0,1,1
1,2,1,1


#### 결측치나 이상치 확인 

In [33]:
df_M20_extract.isnull().sum()

TOT_PRSC_DD_CNT    0
MDCARE_DD_CNT      0
VSHSP_DD_CNT       0
dtype: int64

In [34]:
df_M20_extract.to_csv('../../datasets/M20_extract.csv')

In [36]:
df_M20_extract = pd.read_csv('../../datasets/M20_extract.csv')
df_M20_extract[-10:]

Unnamed: 0.1,Unnamed: 0,TOT_PRSC_DD_CNT,MDCARE_DD_CNT,VSHSP_DD_CNT
137153,137153,0,11,7
137154,137154,0,2,2
137155,137155,0,10,3
137156,137156,0,2,2
137157,137157,0,9,6
137158,137158,0,7,4
137159,137159,0,27,27
137160,137160,0,11,8
137161,137161,0,33,11
137162,137162,0,28,28


##### Structured data 
- 목표변수, Y, **Target**
- 설명변수, X, **Label** 

In [39]:
target = df_M20_extract['TOT_PRSC_DD_CNT']
labels = df_M20_extract[['MDCARE_DD_CNT', 'VSHSP_DD_CNT']]
target.shape, labels.shape

((137163,), (137163, 2))

#### 모델(알고리즘) 학습 

##### 모델 학습
- target datatype 종속 

In [40]:
from sklearn.linear_model import LinearRegression

In [41]:
model = LinearRegression()
model.fit(labels, target)

#### 평가 

In [46]:
target_predict = model.predict(labels)
len(target_predict)
target_predict

array([8.94833513, 8.94833513, 8.94833513, ..., 7.80871464, 3.68798597,
       6.92538118])

In [47]:
from sklearn.metrics import r2_score
r2_score(target, target_predict)

0.0030173643581679066

#### 미래예측(서비스 개시) 

In [52]:
df_M20_extract[300:305]

Unnamed: 0.1,Unnamed: 0,TOT_PRSC_DD_CNT,MDCARE_DD_CNT,VSHSP_DD_CNT
300,300,0,1,1
301,301,0,1,1
302,302,3,1,1
303,303,3,1,1
304,304,1,1,1


In [53]:
temp_label = [[1,1]] 

In [54]:
model.predict(temp_label)



array([8.94833513])