# Remove First N Seconds Data per User

이 노트북은 train_ratings.csv 파일에서 각 user별로 최초 N초간의 데이터를 조건부로 제거하고
train_ratings_del_{N}sec.csv 파일로 저장합니다.

## 제거 조건
아래 두 조건을 모두 만족하는 user에 한해서만 초기 N초간의 데이터를 제거합니다.
1. 초기 N초간의 데이터 개수가 해당 user 전체 데이터의 `REMOVAL_RATIO_THRESHOLD`(기본값 50%) 미만일 것
2. 제거 후 남은 데이터 개수가 `MIN_REMAINING_COUNT`(기본값 50개) 이상일 것

In [27]:
import pandas as pd
import numpy as np
from pathlib import Path

In [28]:
# Tunable parameters
TIME_THRESHOLD_SECONDS = 300    # length of each user's initial window, in seconds (300 s = 5 min)
MIN_REMAINING_COUNT = 50        # a user must keep at least this many rows after removal
REMOVAL_RATIO_THRESHOLD = 0.5   # the initial window must hold less than this share of the user's rows

# File locations; the output name records the window length that was used
DATA_DIR = "~/juik/data/train"
input_path = f"{DATA_DIR}/train_ratings.csv"
output_path = f"{DATA_DIR}/train_ratings_del_{TIME_THRESHOLD_SECONDS}sec.csv"

In [29]:
# Read the raw interaction log and preview it
print("Loading data...")
df = pd.read_csv(input_path)
print(f"Original data shape: {df.shape}")
print("\nFirst few rows:")
df.head(10)

Loading data...
Original data shape: (5154471, 3)

First few rows:


Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
5,11,2722,1230782583
6,11,2313,1230782646
7,11,2688,1230782656
8,11,2428,1230782694
9,11,3113,1230782719


In [30]:
# Column dtypes and basic cardinalities
print("Data types:")
print(df.dtypes)

n_users = df['user'].nunique()
n_items = df['item'].nunique()
print(f"\nNumber of unique users: {n_users}")
print(f"Number of unique items: {n_items}")
print(f"Total interactions: {len(df)}")

Data types:
user    int64
item    int64
time    int64
dtype: object

Number of unique users: 31360
Number of unique items: 6807
Total interactions: 5154471


In [31]:
# Conditional removal of each user's initial activity window
def remove_initial_data_conditional(df, time_threshold=300, min_remaining=50, removal_ratio_threshold=0.5):
    """
    Conditionally drop each user's first ``time_threshold`` seconds of data.

    For every user, the "initial" rows are those whose ``time`` is no later
    than the user's earliest ``time`` plus ``time_threshold`` (the bound is
    inclusive). Those rows are dropped only when BOTH conditions hold:

    1. initial rows / all rows of the user  <  ``removal_ratio_threshold``
    2. rows left after the removal          >= ``min_remaining``

    Otherwise all of the user's rows are kept. The input frame is not modified.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame with at least ``user`` and ``time`` columns (the notebook passes
        user, item, time). Other columns are carried through unchanged.
    time_threshold : int
        Length of the initial window, in the same unit as ``time``
        (seconds in this notebook). Default 300 (5 minutes).
    min_remaining : int
        Minimum number of rows a user must keep after removal. Default 50.
    removal_ratio_threshold : float
        Upper bound (exclusive) on the share of a user's rows that may fall in
        the initial window for the removal to happen. Default 0.5.

    Returns:
    --------
    pd.DataFrame
        Surviving rows, grouped by user in sorted key order and time-ordered
        within each user, with a fresh 0..n-1 index. Rows of one user that
        share a timestamp keep their input order. Empty (same columns) when
        the input yields no user groups.
    dict
        Summary with keys ``total_users``, ``users_with_removal``,
        ``users_without_removal``, ``total_interactions_before``,
        ``total_interactions_after``, ``total_removed_interactions``,
        ``removal_percentage`` (0.0 for an empty input) and ``user_stats``.
        ``user_stats`` maps each user id to a dict with ``removed``,
        ``total``, ``initial``, ``remaining``, ``removal_ratio`` and, for
        users left untouched, ``reason`` ('condition1_failed' or
        'condition2_failed'; when both conditions fail, condition 1 is the
        one reported).
    """
    total_before = len(df)

    # Sort once instead of once per user. The sort is stable and groupby
    # preserves row order inside each group, so every group below is already
    # time-ordered and ties keep their input order.
    ordered = df.sort_values('time', kind='stable')

    kept_frames = []
    stats = {}

    for user_id, group in ordered.groupby('user'):
        times = group['time']
        total_count = len(group)

        # Inclusive upper bound of this user's initial window
        cutoff = times.min() + time_threshold

        initial_count = int((times <= cutoff).sum())
        remaining_data = group[times > cutoff]
        remaining_count = len(remaining_data)
        removal_ratio = initial_count / total_count

        # Condition 1: the initial window is a small enough share of the user's rows
        ratio_ok = removal_ratio < removal_ratio_threshold
        # Condition 2: enough rows survive the removal
        enough_left = remaining_count >= min_remaining
        removed = bool(ratio_ok and enough_left)

        user_stats = {
            'removed': removed,
            'total': total_count,
            'initial': initial_count,
            'remaining': remaining_count,
            'removal_ratio': removal_ratio,
        }
        if removed:
            kept_frames.append(remaining_data)
        else:
            # Only the first failing condition is recorded
            user_stats['reason'] = 'condition2_failed' if ratio_ok else 'condition1_failed'
            kept_frames.append(group)
        stats[user_id] = user_stats

    if kept_frames:
        df_filtered = pd.concat(kept_frames, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the input's columns instead.
        df_filtered = df.iloc[0:0].reset_index(drop=True)

    # Aggregate statistics
    total_users = len(stats)
    removed_users = sum(1 for s in stats.values() if s['removed'])
    total_removed_interactions = sum(s['initial'] for s in stats.values() if s['removed'])
    # Guard the percentage against an empty input (division by zero)
    removal_percentage = (total_removed_interactions / total_before) * 100 if total_before else 0.0

    summary_stats = {
        'total_users': total_users,
        'users_with_removal': removed_users,
        'users_without_removal': total_users - removed_users,
        'total_interactions_before': total_before,
        'total_interactions_after': len(df_filtered),
        'total_removed_interactions': total_removed_interactions,
        'removal_percentage': removal_percentage,
        'user_stats': stats
    }

    return df_filtered, summary_stats


# Run the filter with the notebook-level parameters
banner = '=' * 60

print("\n".join([
    f"Removing initial {TIME_THRESHOLD_SECONDS} seconds of data per user...",
    "Conditions:",
    f"  - Initial data must be < {REMOVAL_RATIO_THRESHOLD*100}% of user's total data",
    f"  - Remaining data must be >= {MIN_REMAINING_COUNT} interactions",
    "",
]))

df_filtered, stats = remove_initial_data_conditional(
    df,
    time_threshold=TIME_THRESHOLD_SECONDS,
    min_remaining=MIN_REMAINING_COUNT,
    removal_ratio_threshold=REMOVAL_RATIO_THRESHOLD,
)

# Headline numbers, emitted as one block of text
summary_lines = [
    f"\n{banner}",
    "SUMMARY STATISTICS",
    banner,
    f"Total users: {stats['total_users']}",
    f"Users with removal: {stats['users_with_removal']}",
    f"Users without removal: {stats['users_without_removal']}",
    f"\nTotal interactions before: {stats['total_interactions_before']:,}",
    f"Total interactions after: {stats['total_interactions_after']:,}",
    f"Total removed interactions: {stats['total_removed_interactions']:,}",
    f"Removal percentage: {stats['removal_percentage']:.2f}%",
    banner,
]
print("\n".join(summary_lines))

Removing initial 300 seconds of data per user...
Conditions:
  - Initial data must be < 50.0% of user's total data
  - Remaining data must be >= 50 interactions


SUMMARY STATISTICS
Total users: 31360
Users with removal: 26890
Users without removal: 4470

Total interactions before: 5,154,471
Total interactions after: 4,906,369
Total removed interactions: 248,102
Removal percentage: 4.81%


In [32]:
# Before/after comparison for one sample user whose initial window was removed
per_user = stats['user_stats']

removed_user = next((uid for uid, s in per_user.items() if s['removed']), None)

# Compare against None rather than truthiness: a user id of 0 is a valid match.
if removed_user is not None:
    info = per_user[removed_user]
    print(f"Sample user with removal: {removed_user}")
    print(f"  Total interactions: {info['total']}")
    print(f"  Initial {TIME_THRESHOLD_SECONDS}s interactions: {info['initial']}")
    print(f"  Remaining interactions: {info['remaining']}")
    print(f"  Removal ratio: {info['removal_ratio']:.2%}")
    print("\nOriginal data (first 10 rows):")
    print(df[df['user'] == removed_user].sort_values('time').head(10))
    print("\nFiltered data (first 10 rows):")
    print(df_filtered[df_filtered['user'] == removed_user].sort_values('time').head(10))
else:
    print("No users had data removed.")

# One sample user whose data was left untouched
print(f"\n{'='*60}\n")
not_removed_user = next((uid for uid, s in per_user.items() if not s['removed']), None)

if not_removed_user is not None:
    info = per_user[not_removed_user]
    print(f"Sample user without removal: {not_removed_user}")
    print(f"  Total interactions: {info['total']}")
    print(f"  Initial {TIME_THRESHOLD_SECONDS}s interactions: {info['initial']}")
    print(f"  Would-be remaining: {info['remaining']}")
    print(f"  Removal ratio: {info['removal_ratio']:.2%}")
    print(f"  Reason for not removing: {info['reason']}")
    print("\nData kept intact (first 10 rows):")
    print(df_filtered[df_filtered['user'] == not_removed_user].sort_values('time').head(10))
else:
    print("All users had data removed.")

Sample user with removal: 11
  Total interactions: 376
  Initial 300s interactions: 12
  Remaining interactions: 364
  Removal ratio: 3.19%

Original data (first 10 rows):
   user  item        time
0    11  4643  1230782529
1    11   170  1230782534
2    11   531  1230782539
3    11   616  1230782542
4    11  2140  1230782563
5    11  2722  1230782583
6    11  2313  1230782646
7    11  2688  1230782656
8    11  2428  1230782694
9    11  3113  1230782719

Filtered data (first 10 rows):
   user   item        time
0    11   8169  1230783004
1    11   2572  1230783041
2    11  58293  1230783046
3    11   7541  1230783053
4    11   1367  1230783065
5    11     32  1230783095
6    11   4792  1230783100
7    11   7444  1230783123
8    11  53953  1230783161
9    11  56949  1230783384


Sample user without removal: 43
  Total interactions: 59
  Initial 300s interactions: 10
  Would-be remaining: 49
  Removal ratio: 16.95%
  Reason for not removing: condition2_failed

Data kept intact (first 10 

In [33]:
# Detailed statistics
print("\nDetailed Statistics by Removal Reason:")
print('=' * 60)

# Tally the recorded reason for every user whose data was left untouched
skip_reasons = [s.get('reason') for s in stats['user_stats'].values() if not s['removed']]
condition1_failed = skip_reasons.count('condition1_failed')
condition2_failed = skip_reasons.count('condition2_failed')

print(f"\nUsers with data removed: {stats['users_with_removal']}")
print("\nUsers without removal breakdown:")
print(f"  - Condition 1 failed (initial data >= {REMOVAL_RATIO_THRESHOLD*100}%): {condition1_failed}")
print(f"  - Condition 2 failed (remaining < {MIN_REMAINING_COUNT}): {condition2_failed}")
print(f"  - Total: {stats['users_without_removal']}")

# Distribution of the initial-window share across all users (removed or not)
removal_ratios = [s['removal_ratio'] for s in stats['user_stats'].values()]
print("\nRemoval ratio distribution (initial data / total data):")
ratio_summary = (
    ("Min", min(removal_ratios)),
    ("Max", max(removal_ratios)),
    ("Mean", np.mean(removal_ratios)),
    ("Median", np.median(removal_ratios)),
)
for label, value in ratio_summary:
    print(f"  - {label}: {value:.2%}")

print("\n" + '=' * 60)


Detailed Statistics by Removal Reason:

Users with data removed: 26890

Users without removal breakdown:
  - Condition 1 failed (initial data >= 50.0%): 246
  - Condition 2 failed (remaining < 50): 4224
  - Total: 4470

Removal ratio distribution (initial data / total data):
  - Min: 0.06%
  - Max: 100.00%
  - Mean: 9.14%
  - Median: 6.22%



In [34]:
# Save the result: write the filtered frame to output_path as CSV,
# without the row index (index=False)
print(f"\nSaving filtered data to {output_path}...")
df_filtered.to_csv(output_path, index=False)
print("Done!")


Saving filtered data to ~/juik/data/train/train_ratings_del_300sec.csv...
Done!


In [35]:
# Read the saved CSV back as a sanity check
df_check = pd.read_csv(output_path)
print("\nVerification - Loaded saved file:")
print(f"Shape: {df_check.shape}")
print("\nFirst few rows:")
df_check.head()


Verification - Loaded saved file:
Shape: (4906369, 3)

First few rows:


Unnamed: 0,user,item,time
0,11,8169,1230783004
1,11,2572,1230783041
2,11,58293,1230783046
3,11,7541,1230783053
4,11,1367,1230783065
