In [2]:
import pandas as pd
from pathlib import Path
mnt_dir = "/mnt/NAS2-2/data/"
dp_dir = f"{mnt_dir}/SAD_gangnam_DP/"
feature_dir = f"{dp_dir}/dp_features/"
feature_path = Path(feature_dir)
dp_path = Path(dp_dir)

subfolders = [f for f in feature_path.iterdir() if f.is_dir()]

dp_fmri_subject = pd.read_csv(dp_path/"dp_fmri_subject.csv")
dp_types = ["app", "call", "light", "location", "screen"]
group_types = ["HC", "SAD"]
week_types = ["weekdays", "weekends"]


In [3]:
def process_fmri_log_data(dp_fmri_subject, log_data):
    from datetime import datetime, timedelta
    import pandas as pd

    # 날짜 변환 중 오류가 발생한 피험자를 추적하기 위한 리스트
    date_conversion_errors = []

    # 최종 결과를 저장할 리스트
    results = []

    # subjNum의 결측치 확인 및 제거
    dp_fmri_subject = dp_fmri_subject.dropna(subset=['subjNum'])
    dp_fmri_subject['subjNum'] = dp_fmri_subject['subjNum'].astype(int)

    log_data = log_data.dropna(subset=['subjNum'])
    log_data['subjNum'] = log_data['subjNum'].astype(int)

    # 'fmri_date' 열을 datetime 형식으로 변환
    dp_fmri_subject['fmri_date'] = pd.to_datetime(dp_fmri_subject['fmri_date'], errors='coerce')

    # 'local_segment_start_datetime' 열을 datetime 형식으로 변환
    log_data['local_segment_start_datetime'] = pd.to_datetime(log_data['local_segment_start_datetime'], errors='coerce')

    # subjNum, fmri_code, fmri_date별로 그룹화하여 처리
    grouped_subjects = dp_fmri_subject.groupby(['subjNum', 'fmri_code', 'fmri_date'])

    for (subj_num, fmri_code, fmri_date), group in grouped_subjects:
        # fmri_date가 제대로 변환되었는지 확인
        if pd.isnull(fmri_date):
            date_conversion_errors.append(subj_num)
            continue

        # 해당 피험자의 로그 데이터를 필터링
        subj_log_data = log_data[log_data['subjNum'] == subj_num]

        # 로그 데이터의 날짜가 제대로 변환되었는지 확인하고 NaT 제거
        subj_log_data = subj_log_data.dropna(subset=['local_segment_start_datetime'])
        subj_log_dates = subj_log_data['local_segment_start_datetime']

        if subj_log_dates.empty:
            # 로그 데이터가 없을 경우, 해당 피험자를 건너뜁니다
            continue

        # 피험자의 전체 날짜 범위 계산
        min_date = subj_log_dates.min()
        max_date = subj_log_dates.max()

        # 윈도우 크기 설정 (2주)
        window_size = timedelta(weeks=2)

        # 모든 윈도우의 시작 날짜 생성
        window_start_dates = []
        current_start_date = min_date
        while current_start_date <= max_date:
            window_start_dates.append(current_start_date)
            current_start_date += window_size

        # 각 윈도우에 대해 데이터 계산
        window_results = []
        for ws_date in window_start_dates:
            we_date = ws_date + window_size
            window_data = subj_log_data[(subj_log_data['local_segment_start_datetime'] >= ws_date) &
                                        (subj_log_data['local_segment_start_datetime'] < we_date)]

            if not window_data.empty:
                numeric_data = window_data.select_dtypes(include='number').mean()
            else:
                numeric_columns = log_data.select_dtypes(include='number').columns
                numeric_data = pd.Series(0, index=numeric_columns)

            result_row = {
                'subjNum': subj_num,
                'fmri_code': fmri_code,
                'fmri_date': fmri_date,
                'window_start_date': ws_date,
                'window_end_date': we_date
            }
            result_row.update(numeric_data.to_dict())
            window_results.append(result_row)

        # 윈도우의 시작 날짜 중 fmri_date와 ±1주일 이내인 윈도우를 찾음
        fmri_date = pd.to_datetime(fmri_date)
        candidates = [wr for wr in window_results if abs((wr['window_start_date'] - fmri_date).days) <= 7]
        if candidates:
            # 가장 가까운 윈도우를 찾음
            closest_window = min(candidates, key=lambda x: abs((x['window_start_date'] - fmri_date).days))
            chunk_num_0_start_date = closest_window['window_start_date']
        else:
            # 해당하는 윈도우가 없으면 fmri_date와 가장 가까운 윈도우를 찾음
            closest_window = min(window_results, key=lambda x: abs((x['window_start_date'] - fmri_date).days))
            chunk_num_0_start_date = closest_window['window_start_date']

        # 각 윈도우에 chunk_num 할당
        for wr in window_results:
            # 윈도우의 시작 날짜와 chunk_num 0 윈도우의 시작 날짜 차이를 계산하여 chunk_num 할당
            days_diff = (wr['window_start_date'] - chunk_num_0_start_date).days
            wr['chunk_num'] = days_diff // 14  # 14일 단위로 나눠서 chunk_num 계산
            results.append(wr)

    # 결과를 데이터프레임으로 변환
    final_window_df = pd.DataFrame(results)

    # window_start_date나 window_end_date가 NaT인 행을 제거
    final_window_df = final_window_df.dropna(subset=['window_start_date', 'window_end_date'])

    # subjNum을 정수형으로 변환하여 소수점 제거
    final_window_df['subjNum'] = final_window_df['subjNum'].astype(int)

    # chunk_num을 정렬
    final_window_df = final_window_df.sort_values(by=['subjNum', 'chunk_num'])

    # 최종 데이터프레임과 날짜 변환 오류가 발생한 피험자 목록 반환
    return final_window_df, date_conversion_errors

In [4]:
for dp_type in dp_types:
    for group in group_types:
        for week_type in week_types:
            if group == "HC":
                group_specifier = "hc"
            elif group == "SAD":
                group_specifier = "sad"
            intput_file_name = f"{dp_type}_{week_type}_{group_specifier}.csv"
            output_file_name = f"{dp_type}_{week_type}_{group_specifier}_window.csv"
            target_df = pd.read_csv(feature_path / dp_type / group / intput_file_name)
            final_window_df, date_conversion_errors = process_fmri_log_data(dp_fmri_subject, target_df)
            final_window_df.to_csv(feature_path / dp_type / group / output_file_name)
            

In [6]:
from scipy.stats import ttest_ind

def find_significant_features(data1, data2, chunk_num=0, p_value_threshold=0.05):
    """
    This function compares two datasets, applies a t-test on matching numeric columns,
    and returns the features where p < 0.05.
    
    Parameters:
    - file1: path to the first CSV file (e.g., SAD group)
    - file2: path to the second CSV file (e.g., HC group)
    - chunk_num: chunk number to filter on (default is 0)
    - p_value_threshold: the p-value threshold to consider a feature significant (default is 0.05)
    
    Returns:
    - A dictionary of significant features and their p-values.
    """
    # Filter for the specified chunk number
    data1_chunk = data1[data1['chunk_num'] == chunk_num]
    data2_chunk = data2[data2['chunk_num'] == chunk_num]
    
    # Extract numeric columns
    numeric_columns1 = data1_chunk.select_dtypes(include='number').columns
    numeric_columns2 = data2_chunk.select_dtypes(include='number').columns
    
    # Ensure both datasets have matching numeric columns
    common_columns = numeric_columns1.intersection(numeric_columns2)
    print(len(common_columns))
    # Extract only the relevant numeric columns for the chunk
    data1_numeric = data1_chunk[common_columns]
    data2_numeric = data2_chunk[common_columns]
    
    # Perform t-tests and store p-values
    p_values = {}
    for col in common_columns:
        stat, p_val = ttest_ind(data1_numeric[col], data2_numeric[col], nan_policy='omit')
        p_values[col] = p_val
    
    # Filter for significant features
    significant_metrics = {k: v for k, v in p_values.items() if v < p_value_threshold}
    
    return significant_metrics

for dp_type in dp_types:
    for week_type in week_types:
        
        sad_data_file = f"{dp_type}_{week_type}_sad_window.csv"
        hc_data_file = f"{dp_type}_{week_type}_hc_window.csv"
        sad_data_df = pd.read_csv(feature_path / dp_type / "SAD" / sad_data_file)
        hc_data_df = pd.read_csv(feature_path / dp_type / "HC" / hc_data_file)
        significant_features = find_significant_features(sad_data_df, hc_data_df)
        print(f"{dp_type} {week_type}")
        print(significant_features)

108
app weekdays
{'subjNum': 0.003875052035449824, 'phone_applications_foreground_rapids_countepisodesns_custom': 0.025383931223577936, 'phone_applications_foreground_rapids_countepisodecommunication_custom_category': 0.018444863650960288, 'phone_applications_foreground_rapids_countepisodesocialmedia_custom_category': 0.020621960147809763, 'phone_applications_foreground_rapids_maxdurationall': 0.046328856350739904, 'phone_applications_foreground_rapids_counteventsns_custom': 0.025383931223577936, 'phone_applications_foreground_rapids_counteventcommunication_custom_category': 0.018444863650960288, 'phone_applications_foreground_rapids_counteventsocialmedia_custom_category': 0.020621960147809763, 'phone_applications_foreground_rapids_frequencyentropysns_custom': 0.028037361996839576, 'phone_applications_foreground_rapids_counteventratio_sns': 0.03431836516697072, 'phone_applications_foreground_rapids_counteventratio_communication': 0.02529162314092532, 'phone_applications_foreground_rapi

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
