In [15]:
#-*- coding:utf-8 -*-

# pandas import 
import pandas as pd

# numpy import
import numpy as np

# Graph lib import 
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc

# heatmap
import seaborn as sns

# Pearson correlation coefficient
import scipy.stats as stats

# OLS Regression
import statsmodels.formula.api as smf

# Logistic Regression
import statsmodels.api as sm

# pre-required pip3 install factor-analyzer
from factor_analyzer import FactorAnalyzer

# Use the 'AppleGothic' font family for all Matplotlib text.
# NOTE(review): presumably chosen so Korean labels render; the font is likely
# macOS-only — confirm before running on another platform.
rc('font', family='AppleGothic')
# Turn off Matplotlib's 'axes.unicode_minus' setting.
# NOTE(review): presumably so minus signs still render with this font — confirm.
plt.rcParams['axes.unicode_minus'] = False


def q13a_agg_col(col_name, agg_type, each_cnt, df, surfix):
    """Aggregate every numbered answer column over its own distinct values.

    Columns are addressed as ``col_name`` followed by a zero-padded two-digit
    index that runs from 1 up to ``each_cnt - 1`` (``each_cnt`` is excluded).
    Each column is grouped by its own values and reduced with ``agg_type``.

    Args:
        col_name: Column-name prefix placed in front of the two-digit index.
        agg_type: Aggregation passed to ``agg`` on the grouped column.
        each_cnt: Exclusive upper bound of the column index.
        df: DataFrame that holds the numbered columns.
        surfix: Text appended, after an underscore, to each result key.

    Returns:
        dict mapping ``<column>_<surfix>`` to that column's aggregation result.
    """
    def _grouped_by_own_values(column):
        answers = df[column]
        return answers.groupby(answers).agg(agg_type)

    # f'{number:02}' left-pads the index with a zero to two characters.
    column_names = [col_name + f'{number:02}' for number in range(1, each_cnt)]
    return {
        column + '_' + surfix: _grouped_by_own_values(column)
        for column in column_names
    }

def cronbach_alpha(df):
    """Return the standardized Cronbach's alpha of the columns of ``df``.

    The value is ``N * r / (1 + (N - 1) * r)`` where ``N`` is the number of
    columns in the correlation matrix ``df.corr()`` and ``r`` is the mean of
    its off-diagonal entries (each pair counted once).

    Args:
        df: DataFrame whose columns are the items of the scale.

    Returns:
        The alpha coefficient as a float.  ``nan`` is returned when fewer than
        two columns are available, because no inter-item correlation exists.
    """
    corr = df.corr()
    # Take N from the correlation matrix so it always matches the averaged
    # entries, even if corr() left some columns out.
    n_items = corr.shape[0]
    if n_items < 2:
        return np.nan

    # Entries strictly above the diagonal: one value per pair of items.
    pairwise_r = corr.values[np.triu_indices(n_items, k=1)]
    mean_r = pairwise_r.mean()
    return (n_items * mean_r) / (1 + (n_items - 1) * mean_r)


def pearson_frame(index_col, data_frame, each_cnt, col_list):
    """Build a table of Pearson r and p-value for one column against others.

    ``stats.pearsonr`` is applied to columns of the correlation matrix
    ``data_frame.corr()`` (not to the raw columns): the ``index_col`` column
    is paired with the column of each of the first ``each_cnt - 1`` names in
    ``col_list``.

    Args:
        index_col: Name of the reference column.
        data_frame: DataFrame whose correlation matrix is analysed.
        each_cnt: Exclusive upper bound of the 1-based position in
            ``col_list``; ``each_cnt - 1`` names are used.
        col_list: Names of the columns compared against ``index_col``.

    Returns:
        DataFrame with one column per compared name and two rows: the
        correlation coefficient first, then the p-value.
    """
    # The correlation matrix does not change inside the loop: compute it once.
    corr = data_frame.corr()

    data = {}
    for i in range(1, each_cnt):
        col = col_list[i - 1]
        # Unpack into a plain tuple so the table has rows 0 (r) and 1 (p).
        r_value, p_value = stats.pearsonr(corr[index_col], corr[col])
        data[col] = (r_value, p_value)

    res_data = pd.DataFrame.from_dict(data)
    return res_data.rename(index={0 : '피어슨 상관계수 r', 1 : 'p-value'})

# Binarisation of the raw 5-point-scale answers

def change_one_hot(origin_df, each_cnt, prefix, threshold=3):
    """Binarise the numbered answer columns against a fixed cut-off.

    Columns are addressed as ``prefix`` followed by a zero-padded two-digit
    index from 1 up to ``each_cnt - 1``.  In each of them a value that is
    ``<= threshold`` becomes 0 and every other value becomes 1.

    Args:
        origin_df: Source DataFrame; it is not modified.
        each_cnt: Exclusive upper bound of the column index.
        prefix: Column-name prefix placed in front of the two-digit index.
        threshold: Largest value that maps to 0 (default 3).

    Returns:
        A new DataFrame with the selected columns replaced by 0/1 values.
    """
    # Copy first: plain assignment would alias the caller's frame and
    # overwrite its columns in place.
    convert_df = origin_df.copy()
    for i in range(1, each_cnt):
        col_nm = prefix + f'{i:02}'
        convert_df[col_nm] = convert_df[col_nm].apply(
            lambda x: 0 if x <= threshold else 1
        )
    return convert_df


def change_one_hot_by_median(origin_df, each_cnt, prefix):
    """Binarise the numbered answer columns against each column's median.

    Columns are addressed as ``prefix`` followed by a zero-padded two-digit
    index from 1 up to ``each_cnt - 1``.  In each of them a value less than
    or equal to the column median becomes 0 and every other value becomes 1.

    Args:
        origin_df: Source DataFrame; it is not modified.
        each_cnt: Exclusive upper bound of the column index.
        prefix: Column-name prefix placed in front of the two-digit index.

    Returns:
        A new DataFrame with the selected columns replaced by 0/1 values.
    """
    # Copy first: plain assignment would alias the caller's frame and
    # overwrite its columns in place.
    convert_df = origin_df.copy()
    for i in range(1, each_cnt):
        col_nm = prefix + f'{i:02}'

        # Use the median itself; the previous code took mode() here, which is
        # a different statistic and returns a Series rather than a scalar.
        col_median = convert_df[col_nm].median()
        convert_df[col_nm] = convert_df[col_nm].apply(
            lambda x: 0 if x <= col_median else 1
        )
    return convert_df


def change_one_hot_by_mean(origin_df, each_cnt, prefix):
    """Binarise the numbered answer columns against each column's mean.

    Columns are addressed as ``prefix`` followed by a zero-padded two-digit
    index from 1 up to ``each_cnt - 1``.  The column mean is truncated to an
    integer; a value below that integer becomes 0 and every other value
    becomes 1.

    Args:
        origin_df: Source DataFrame; it is not modified.
        each_cnt: Exclusive upper bound of the column index.
        prefix: Column-name prefix placed in front of the two-digit index.

    Returns:
        A new DataFrame with the selected columns replaced by 0/1 values.

    Raises:
        ValueError: If a column mean is NaN, since ``int`` cannot convert it.
    """
    # Copy first: plain assignment would alias the caller's frame and
    # overwrite its columns in place.
    convert_df = origin_df.copy()
    for i in range(1, each_cnt):
        col_nm = prefix + f'{i:02}'

        # NOTE(review): int() truncates the mean, so values between int(mean)
        # and the mean map to 1 — confirm this cut-off is intended.
        mean_cutoff = int(convert_df[col_nm].mean())
        convert_df[col_nm] = convert_df[col_nm].apply(
            lambda x: 0 if x < mean_cutoff else 1
        )
    return convert_df


def change_one_hot_by_mode(origin_df, each_cnt, prefix):
    """Binarise the numbered answer columns against each column's mode.

    Columns are addressed as ``prefix`` followed by a zero-padded two-digit
    index from 1 up to ``each_cnt - 1``.  In each of them a value below the
    mode becomes 0 and every other value becomes 1.  When a column has
    several modes, the first one returned by ``Series.mode()`` is used.

    Args:
        origin_df: Source DataFrame; it is not modified.
        each_cnt: Exclusive upper bound of the column index.
        prefix: Column-name prefix placed in front of the two-digit index.

    Returns:
        A new DataFrame with the selected columns replaced by 0/1 values.

    Raises:
        IndexError: If a column has no mode (``mode()`` returns no values).
    """
    # Copy first: plain assignment would alias the caller's frame and
    # overwrite its columns in place.
    convert_df = origin_df.copy()
    for i in range(1, each_cnt):
        col_nm = prefix + f'{i:02}'

        # mode() returns a Series; pick one element explicitly, because
        # int(Series) fails as soon as the column has more than one mode.
        col_mode = int(convert_df[col_nm].mode().iloc[0])
        convert_df[col_nm] = convert_df[col_nm].apply(
            lambda x: 0 if x < col_mode else 1
        )
    return convert_df


def view_to_heatmap(x_size, y_size, df_corr):
    """Draw a correlation matrix as a lower-triangle heatmap.

    A new Matplotlib figure of the requested size is created and seaborn
    draws ``df_corr`` on it with the diagonal and upper triangle hidden.

    Args:
        x_size: Figure width passed to ``figsize``.
        y_size: Figure height passed to ``figsize``.
        df_corr: 2-D correlation matrix to plot.

    Returns:
        None.
    """
    plt.figure(figsize=(x_size, y_size))

    # Triangular mask: True on and above the diagonal, False below it.
    # The builtin bool is used because the np.bool alias no longer exists in
    # current NumPy releases.
    mask = np.triu(np.ones(np.shape(df_corr), dtype=bool))

    sns.heatmap(df_corr,
                cmap='RdYlBu_r',
                annot=True,               # write the value in every cell
                mask=mask,                # cells where the mask is True are hidden
                linewidths=.5,            # thin lines between cells
                cbar_kws={"shrink": .5},  # colour bar at half size
                vmin=-1, vmax=1           # colour scale covers -1 .. 1
               )

In [4]:
# Read ./trip.csv into a DataFrame; missing q13a04 values are replaced with 0
# and the column is cast to int.
raw_data = pd.read_csv('./trip.csv').assign(
    q13a04=lambda frame: frame['q13a04'].fillna(0).astype(int)
)

# Later transformations work on a copy so that raw_data itself stays untouched.
statistics_df = raw_data.copy()

In [32]:
# Read the trip data and draw a reproducible 100-row sample from it.
# The path used to be './business.csv'. That contradicts the variable names
# and the 'filter_trip_*' export further down. It also contradicts the
# recorded output of this sample, which does not match the business rows.
# NOTE(review): confirm './trip.csv' is the intended source.
trip_df = pd.read_csv('./trip.csv')
# A fixed random_state makes the sample identical on every run.
trip_raw_data = trip_df.sample(n=100, random_state=42)

In [33]:
# Display the sampled trip rows.
trip_raw_data

Unnamed: 0.1,Unnamed: 0,NO,q1,q1a,q5,q13a01,q13a02,q13a03,q13a04,q13a05,...,q13a12,q13a13,q13a14,chasu,nat,city,sex,edu,job,age
513,3953,3954,2,4,6,5,5,5,5,5,...,5,4,4,9,97,5120,1,1,6,3
401,2839,2840,2,3,6,4,4,4,4,4,...,4,4,4,10,13,1600,1,3,2,9
473,3549,3550,2,4,6,4,5,3,5,4,...,5,5,5,9,11,1101,1,3,2,4
1282,9403,9404,2,3,6,5,5,4,3,4,...,4,4,4,3,16,6296,1,4,5,3
1340,9826,9827,2,2,6,5,4,4,4,4,...,4,4,4,3,4,402,1,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,386,387,1,1,6,5,5,5,5,4,...,5,5,5,12,9,811,2,3,4,3
1646,11987,11988,2,4,6,5,5,4,5,4,...,4,4,5,1,9,802,2,3,5,1
701,5598,5599,2,4,6,3,4,3,5,4,...,4,4,4,7,2,202,1,2,2,4
912,6897,6898,1,1,6,5,5,5,5,5,...,4,4,4,6,2,205,2,2,2,1


In [7]:
# Draw the sample before exporting it. No cell in this notebook defines
# `statistics_df_sample`, so on a fresh kernel the export raised NameError.
# NOTE(review): n=100 mirrors the business sample cell — confirm the size.
statistics_df_sample = statistics_df.sample(n=100, random_state=42)

# Write the sample (index included) to ./trip_sample.csv.
statistics_df_sample.to_csv(r'./trip_sample.csv')

In [16]:
# Read ./business.csv into a DataFrame.
business_raw_data = pd.read_csv('./business.csv')

In [9]:
# Replace missing q13a04 values with 0 and store the column as int.
# This overwrites the column of business_raw_data in place.
business_raw_data['q13a04'] = business_raw_data['q13a04'].fillna(0).astype(int)

In [17]:
# Draw 100 rows; a fixed random_state makes the sample identical on every run.
statistics_business_sample = business_raw_data.sample(n=100, random_state=42)

# Write the sample (index included) to ./business_sample.csv.
statistics_business_sample.to_csv(r'./business_sample.csv')

In [12]:
# Display the business DataFrame.
business_raw_data

Unnamed: 0.1,Unnamed: 0,NO,q1,q1a,q5,q13a01,q13a02,q13a03,q13a04,q13a05,...,q13a12,q13a13,q13a14,chasu,nat,city,sex,edu,job,age
0,1,2,2,3,1,4,4,3,4,4,...,4,4,4,12,3,401,1,1,3,3
1,3,4,1,1,1,4,4,4,5,4,...,4,4,5,12,3,401,1,2,8,1
2,4,5,2,3,1,3,4,5,5,5,...,5,4,4,12,3,401,2,2,3,1
3,8,9,1,1,1,4,5,5,5,5,...,5,5,5,12,14,1703,1,2,2,2
4,10,11,1,1,1,4,5,4,4,5,...,4,4,5,12,6,701,2,2,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5969,11978,11979,2,3,1,4,5,5,4,5,...,4,4,4,1,3,401,2,3,3,2
5970,11979,11980,2,2,1,5,4,4,4,5,...,3,4,4,1,3,401,2,2,8,0
5971,11980,11981,1,1,1,5,5,5,3,4,...,3,3,3,1,3,401,1,2,2,2
5972,11984,11985,1,1,1,4,4,5,5,5,...,5,5,5,1,16,5513,1,1,5,2


In [13]:
# Display the sampled business rows.
statistics_business_sample

Unnamed: 0.1,Unnamed: 0,NO,q1,q1a,q5,q13a01,q13a02,q13a03,q13a04,q13a05,...,q13a12,q13a13,q13a14,chasu,nat,city,sex,edu,job,age
5023,9974,9975,1,1,1,5,5,4,4,4,...,4,3,3,3,12,1303,1,4,5,2
4204,8353,8354,1,1,1,4,5,5,5,4,...,4,4,5,4,9,849,2,4,5,2
3860,7620,7621,1,1,1,5,4,4,4,5,...,4,4,4,5,5,501,2,1,12,1
2330,4531,4532,1,1,1,5,5,5,5,5,...,5,5,5,8,97,6176,1,3,12,1
4011,7991,7992,2,2,1,4,3,5,5,5,...,5,5,5,5,1,7,2,1,9,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3552,6986,6987,2,4,1,5,4,4,4,4,...,4,5,5,6,1,29,1,1,10,4
2783,5372,5373,1,1,1,5,5,5,5,5,...,5,4,5,7,14,1808,2,2,4,1
3628,7104,7105,1,1,1,5,5,5,5,5,...,5,5,5,5,7,5001,2,2,4,2
910,1854,1855,2,2,1,3,4,4,4,4,...,4,3,3,11,6,721,1,2,7,1


In [26]:
# Binarise q13a01..q13a14 of a copy of the business data against each
# column's mode. each_cnt is an exclusive upper bound, hence 15.
one_hot_business_mode = change_one_hot_by_mode(
    origin_df=business_raw_data.copy(),
    each_cnt=15,
    prefix='q13a',
)

# Write the binarised frame (index included) to CSV.
one_hot_business_mode.to_csv(r'./filter_business_one_hot_by_mode.csv')

In [34]:
# Binarise q13a01..q13a14 of a copy of the trip sample against each
# column's mode. each_cnt is an exclusive upper bound, hence 15.
one_hot_trip_mode = change_one_hot_by_mode(
    origin_df=trip_raw_data.copy(),
    each_cnt=15,
    prefix='q13a',
)

# Write the binarised frame (index included) to CSV.
one_hot_trip_mode.to_csv(r'./filter_trip_one_hot_by_mode.csv')

In [35]:
# Display the binarised trip sample.
one_hot_trip_mode

Unnamed: 0.1,Unnamed: 0,NO,q1,q1a,q5,q13a01,q13a02,q13a03,q13a04,q13a05,...,q13a12,q13a13,q13a14,chasu,nat,city,sex,edu,job,age
513,3953,3954,2,4,6,1,1,1,1,1,...,1,1,1,9,97,5120,1,1,6,3
401,2839,2840,2,3,6,0,0,0,0,1,...,1,1,1,10,13,1600,1,3,2,9
473,3549,3550,2,4,6,0,1,0,1,1,...,1,1,1,9,11,1101,1,3,2,4
1282,9403,9404,2,3,6,1,1,0,0,1,...,1,1,1,3,16,6296,1,4,5,3
1340,9826,9827,2,2,6,1,0,0,0,1,...,1,1,1,3,4,402,1,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,386,387,1,1,6,1,1,1,1,1,...,1,1,1,12,9,811,2,3,4,3
1646,11987,11988,2,4,6,1,1,0,1,1,...,1,1,1,1,9,802,2,3,5,1
701,5598,5599,2,4,6,0,0,0,1,1,...,1,1,1,7,2,202,1,2,2,4
912,6897,6898,1,1,6,1,1,1,1,1,...,1,1,1,6,2,205,2,2,2,1


In [None]:
# Read the CSV file and convert it into a DataFrame.
# raw_data_business = pd.read_csv('./business_change.csv')

# raw_data_business['q13a04'] = raw_data_business['q13a04'].fillna(0).astype(int)

# Copy the DataFrame before transforming it so the original data is not contaminated.
# business_statistics_df = raw_data_business.copy()