# Packages (imports)

In [9]:
import pandas as pd
import numpy as np
import pickle
import json
from datetime import datetime
from collections import Counter
from tqdm import tqdm
import os
np.random.seed(42) # Set random seed for reproducibility

print("✅ Libraries imported successfully!")


✅ Libraries imported successfully!


# paths

In [4]:
main_data_path = '' # Base data directory; the raw/ and processed/ paths below are built from it. Adjust as needed

In [5]:
# Folder holding the raw MovieLens .dat files.
# os.path.join keeps the path relative when main_data_path is empty; plain
# string formatting would otherwise produce "/raw/" in the filesystem root.
data_path = os.path.join(main_data_path, "raw")

In [None]:
# Save processed data
# Built with os.path.join so an empty main_data_path yields a relative
# "processed" folder instead of "/processed/" in the filesystem root, and so
# the parquet path does not end up with a doubled separator.
output_path = os.path.join(main_data_path, "processed")
seq_data_file_path_parquet = os.path.join(output_path, "seq_data.parquet")

# Create directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

## Read data --> Load MovieLens-1M data

In [None]:
# All three .dat files use "::" as field separator; the multi-character
# separator is parsed with pandas' python engine.
dat_options = dict(sep="::", engine='python')

# Ratings: UserID / MovieID / Rating / Timestamp
ratings_df = pd.read_csv(
    f"{data_path}/ratings.dat",
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **dat_options,
)

# Movies: the only file read with an explicit latin1 encoding
movies_df = pd.read_csv(
    f"{data_path}/movies.dat",
    names=['MovieID', 'Title', 'Genres'],
    encoding='latin1',
    **dat_options,
)

# Users: UserID / Gender / Age / Occupation / Zip
users_df = pd.read_csv(
    f"{data_path}/users.dat",
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip'],
    **dat_options,
)

# Row counts per table
print("📊 Loaded data:")
for label, frame in (("Ratings", ratings_df), ("Movies", movies_df), ("Users", users_df)):
    print(f"  - {label}: {len(frame):,} records")

# Quick look at the first rows
print("\n🎬 Sample ratings:")
print(ratings_df.head())

print("\n🍿 Sample movies:")
print(movies_df.head())


📊 Loaded data:
  - Ratings: 1,000,209 records
  - Movies: 3,883 records
  - Users: 6,040 records

🎬 Sample ratings:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

🍿 Sample movies:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


# Step 1: Create User Sequences


In [None]:
def create_user_sequences(ratings_df, movies_df, min_rate_for_data=1):
    """Build one chronologically ordered interaction record per user.

    Args:
        ratings_df: DataFrame with 'UserID', 'MovieID', 'Rating' and
            'Timestamp' columns.
        movies_df: DataFrame with 'MovieID', 'Title' and 'Genres' columns.
        min_rate_for_data: Rows whose rating is below this value are dropped.

    Returns:
        dict mapping each user id to a dict with the keys 'user_id',
        'movie_ids', 'ratings', 'timestamps', 'genres_list' and 'titles'.
        The lists are aligned with each other and ordered by timestamp.
    """
    # Attach title/genres to every rating, drop low ratings, then order each
    # user's interactions by time.
    interactions = ratings_df.merge(movies_df, on='MovieID', how='left')
    keep_mask = interactions['Rating'] >= min_rate_for_data
    interactions = (
        interactions[keep_mask]
        .reset_index(drop=True)
        .sort_values(['UserID', 'Timestamp'])
    )

    # Output key -> source column, in the order the keys appear in each record
    field_to_column = {
        'movie_ids': 'MovieID',
        'ratings': 'Rating',
        'timestamps': 'Timestamp',
        'genres_list': 'Genres',
        'titles': 'Title',
    }

    print("Creating user sequences...")
    user_sequences = {}
    for user_id, user_rows in tqdm(interactions.groupby('UserID')):
        record = {'user_id': user_id}
        for field, column in field_to_column.items():
            record[field] = user_rows[column].tolist()
        user_sequences[user_id] = record

    return user_sequences

# Build the per-user sequences, keeping only ratings of 3 and above
user_sequences = create_user_sequences(ratings_df, movies_df, min_rate_for_data=3)

print(f"\n✅ Created sequences for {len(user_sequences):,} users")

# Inspect the first user in the dictionary as a sanity check
example_user = list(user_sequences)[0]
example_seq = user_sequences[example_user]
example_ratings = example_seq['ratings']
print(f"\n🎯 Example sequence for User {example_user}:")
print(f"  - Number of ratings: {len(example_ratings)}")
print(f"  - First 5 ratings: {example_ratings[:5]}")
print(f"  - First 5 movies: {example_seq['titles'][:5]}")


Creating user sequences...


100%|██████████| 6039/6039 [00:00<00:00, 7760.25it/s]



✅ Created sequences for 6,039 users

🎯 Example sequence for User 1:
  - Number of ratings: 53
  - First 5 ratings: [4, 5, 4, 5, 3]
  - First 5 movies: ['Girl, Interrupted (1999)', 'Back to the Future (1985)', 'Titanic (1997)', 'Cinderella (1950)', 'Meet Joe Black (1998)']


# Step 2: Generate Action Sequences (seq_actions)


In [None]:
def generate_action_sequences(user_sequences):
    """Turn each user's movie ids into "movie_<id>" action tokens.

    Args:
        user_sequences: dict mapping user id to a record that has
            'movie_ids' and 'ratings' lists.

    Returns:
        Tuple ``(action_sequences, all_movie_ids, rating_sequences)``:
        per-user token lists, the set of all tokens seen, and per-user
        rating lists (the same list objects stored in ``user_sequences``).
    """
    print("Generating movie ID sequences (TiSASRec style)...")

    action_sequences, rating_sequences = {}, {}
    all_movie_ids = set()

    for user_id, seq in tqdm(user_sequences.items()):
        # The movie id itself is the action; it is stored as a string token
        tokens = [f"movie_{movie_id}" for movie_id in seq['movie_ids']]
        action_sequences[user_id] = tokens
        rating_sequences[user_id] = seq['ratings']
        all_movie_ids |= set(tokens)

    print(f"\n✅ Generated movie ID sequences")
    print(f"📊 Total unique movies: {len(all_movie_ids)}")

    return action_sequences, all_movie_ids, rating_sequences

# Tokenise every user's sequence
action_sequences, all_movie_ids, rating_sequences = generate_action_sequences(user_sequences)

# Sanity check on the example user
example_actions = action_sequences[example_user]
print(f"\n🎯 Example actions for User {example_user}:")
print(f"  - Total actions: {len(example_actions)}")
print(f"  - First 10 actions: {example_actions[:10]}")

# Count how often each movie token occurs across all users
movie_counts = Counter(
    token
    for user_actions in action_sequences.values()
    for token in user_actions
)

print(f"\n📈 Most popular movies:")
for movie, count in movie_counts.most_common(10):
    print(f"  - {movie}: {count:,} times")


Generating movie ID sequences (TiSASRec style)...


100%|██████████| 6039/6039 [00:00<00:00, 38588.95it/s]


✅ Generated movie ID sequences
📊 Total unique movies: 3628

🎯 Example actions for User 1:
  - Total actions: 53
  - First 10 actions: ['movie_3186', 'movie_1270', 'movie_1721', 'movie_1022', 'movie_2340', 'movie_1836', 'movie_3408', 'movie_2804', 'movie_1207', 'movie_1193']






📈 Most popular movies:
  - movie_2858: 3,211 times
  - movie_260: 2,910 times
  - movie_1196: 2,885 times
  - movie_1210: 2,716 times
  - movie_2028: 2,561 times
  - movie_589: 2,509 times
  - movie_593: 2,498 times
  - movie_1198: 2,473 times
  - movie_1270: 2,460 times
  - movie_2571: 2,434 times


# Step 3: Calculate Time Intervals (seq_times)


In [None]:
def calculate_time_intervals(user_sequences):
    """Calculate time intervals from first interaction (TiSASRec compatible).

    For every user, each timestamp is converted to the number of whole days
    elapsed since that user's first listed timestamp.

    Args:
        user_sequences: dict mapping user id to a record with a 'timestamps'
            list. The first element is treated as the first interaction, so
            the list is expected to be in chronological order.

    Returns:
        dict mapping user id to a list of integer day offsets, one per
        timestamp. A user with no timestamps gets an empty list.
    """
    seconds_per_day = 24 * 3600

    time_intervals = {}

    print("Calculating time intervals...")
    for user_id in tqdm(user_sequences.keys()):
        timestamps = user_sequences[user_id]['timestamps']

        # An empty sequence has no first interaction to measure from
        if len(timestamps) == 0:
            time_intervals[user_id] = []
            continue

        first_timestamp = timestamps[0]  # First interaction

        # int() truncates the fractional day, so anything under 24h counts as 0
        time_intervals[user_id] = [
            int((timestamp - first_timestamp) / seconds_per_day)
            for timestamp in timestamps
        ]

    return time_intervals

# Day offsets relative to each user's first interaction
time_intervals = calculate_time_intervals(user_sequences)

# Sanity check on the example user
example_times = time_intervals[example_user]
print(f"\n🎯 Example time intervals for User {example_user}:")
print(f"  - Total intervals: {len(example_times)}")
print(f"  - First 10 intervals: {example_times[:10]}")
print(f"  - Time span: {max(example_times)} days")

# Pool every user's offsets into one flat list for global statistics
all_intervals = [
    day_offset
    for user_intervals in time_intervals.values()
    for day_offset in user_intervals
]

print(f"\n📊 Time interval statistics:")
print(f"  - Max time span: {max(all_intervals)} days")
print(f"  - Average interval: {np.mean(all_intervals):.1f} days")
print(f"  - Median interval: {np.median(all_intervals):.1f} days")


Calculating time intervals...


100%|██████████| 6039/6039 [00:00<00:00, 51123.93it/s]


🎯 Example time intervals for User 1:
  - Total intervals: 53
  - First 10 intervals: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  - Time span: 6 days

📊 Time interval statistics:
  - Max time span: 1032 days
  - Average interval: 33.7 days
  - Median interval: 0.0 days





# Step 4: Create Targets


In [None]:
def create_next_item_targets(user_sequences):
    """Build per-user next-item targets (TiSASRec compatible).

    The target list for a user is the user's movie sequence shifted by one:
    every movie except the first, as "movie_<id>" tokens.

    Args:
        user_sequences: dict mapping user id to a record with a 'movie_ids'
            list.

    Returns:
        dict mapping user id to the list of target tokens. Users with zero
        or one interaction get an empty list.
    """
    print("Creating next item prediction targets...")

    targets = {}
    for user_id, seq in tqdm(user_sequences.items()):
        # Slicing from index 1 is already empty for sequences of length <= 1
        upcoming = seq['movie_ids'][1:]
        targets[user_id] = [f"movie_{movie_id}" for movie_id in upcoming]

    return targets

def create_sequence_level_targets(user_sequences):
    """Build one sequence-level target per user: the last movie in the sequence.

    Args:
        user_sequences: dict mapping user id to a record with a 'movie_ids'
            list.

    Returns:
        dict mapping user id to the "movie_<id>" token of the user's last
        movie, or None when the user has no movies.
    """
    print("Creating sequence-level targets...")

    targets = {}
    for user_id, seq in tqdm(user_sequences.items()):
        movie_ids = seq['movie_ids']
        # Final interaction is the target; an empty sequence has none
        targets[user_id] = f"movie_{movie_ids[-1]}" if len(movie_ids) > 0 else None

    return targets

# Build both target variants; next-item prediction is the main task
next_item_targets = create_next_item_targets(user_sequences)
sequence_level_targets = create_sequence_level_targets(user_sequences)

# Sanity check on the example user
example_next_targets = next_item_targets[example_user]
example_seq_target = sequence_level_targets[example_user]

print(f"\n🎯 Example targets for User {example_user}:")
print(f"  - Next item targets: {example_next_targets[:10]}")
print(f"  - Sequence-level target: {example_seq_target}")

# Flatten all per-user target lists for global statistics
all_next_items = [
    target_token
    for user_targets in next_item_targets.values()
    for target_token in user_targets
]
users_with_targets = sum(1 for user_targets in next_item_targets.values() if len(user_targets) > 0)

print(f"\n📊 Target statistics:")
print(f"  - Total next-item predictions: {len(all_next_items):,}")
print(f"  - Unique target movies: {len(set(all_next_items)):,}")
print(f"  - Users with valid targets: {users_with_targets:,}")

# Which movies show up most often as a target
target_counts = Counter(all_next_items)
print(f"\n📈 Most common target movies:")
for movie, count in target_counts.most_common(10):
    print(f"  - {movie}: {count:,} times")


Creating next item prediction targets...


100%|██████████| 6039/6039 [00:00<00:00, 61801.34it/s]


Creating sequence-level targets...


100%|██████████| 6039/6039 [00:00<00:00, 777047.02it/s]


🎯 Example targets for User 1:
  - Next item targets: ['movie_1270', 'movie_1721', 'movie_1022', 'movie_2340', 'movie_1836', 'movie_3408', 'movie_2804', 'movie_1207', 'movie_1193', 'movie_720']
  - Sequence-level target: movie_48

📊 Target statistics:
  - Total next-item predictions: 830,439
  - Unique target movies: 3,623
  - Users with valid targets: 6,038






📈 Most common target movies:
  - movie_2858: 3,197 times
  - movie_260: 2,765 times
  - movie_1196: 2,759 times
  - movie_1210: 2,509 times
  - movie_2028: 2,507 times
  - movie_589: 2,490 times
  - movie_2571: 2,430 times
  - movie_2762: 2,381 times
  - movie_608: 2,368 times
  - movie_593: 2,345 times


# Step 5: Create Final Dataset


In [None]:
def create_final_dataset(action_sequences, time_intervals, rating_sequences, targets, min_length=3, max_length=int(1e10)):
    """Assemble the final next-item dataset (MUTTI format, TiSASRec compatible).

    For each kept user the last action becomes the single prediction target
    and everything before it becomes the input sequence.

    Args:
        action_sequences: dict user id -> list of action tokens.
        time_intervals: dict user id -> list of time offsets, aligned with
            the actions.
        rating_sequences: dict user id -> list of ratings, aligned with the
            actions.
        targets: dict user id -> list of next-item targets. Only used to skip
            users whose list is empty.
        min_length: Minimum full sequence length (inclusive) for a user to
            be kept.
        max_length: Maximum full sequence length (inclusive) for a user to
            be kept.

    Returns:
        list of dicts with keys 'user_id', 'seq_actions', 'seq_times',
        'seq_ratings', 'target' and 'sequence_length' (length of the input
        sequence, i.e. full length minus one).

    Raises:
        AssertionError: if a kept user's times or ratings differ in length
            from the actions.
    """
    print("Creating final dataset for next-item prediction...")

    final_dataset = []
    for user_id in tqdm(action_sequences.keys()):
        seq_actions = action_sequences[user_id]
        seq_times = time_intervals[user_id]
        seq_ratings = rating_sequences[user_id]
        next_targets = targets[user_id]

        # Users without any valid target are dropped
        if len(next_targets) == 0:
            continue

        # Length filter applies to the full sequence (inputs + target)
        total_length = len(seq_actions)
        if total_length < min_length or total_length > max_length:
            continue

        # The three parallel lists must line up item by item
        assert total_length == len(seq_times), f"Length mismatch for user {user_id}"
        assert total_length == len(seq_ratings), f"Length mismatch  with ratings for user {user_id}"

        # Hold out the last item as the target; the rest is the model input
        input_actions = seq_actions[:-1]
        record = {
            'user_id': user_id,
            'seq_actions': input_actions,
            'seq_times': seq_times[:-1],
            'seq_ratings': seq_ratings[:-1],
            'target': seq_actions[-1],  # Single target item (not sequence)
            'sequence_length': len(input_actions),
        }
        final_dataset.append(record)

    return final_dataset

# Create final dataset using next-item prediction format
final_dataset = create_final_dataset(action_sequences, time_intervals, rating_sequences, next_item_targets)

# Collect the input lengths once instead of rebuilding the list per statistic
sequence_lengths = [record['sequence_length'] for record in final_dataset]

print(f"\n✅ Created final dataset:")
print(f"  - Total sequences: {len(final_dataset):,}")
print(f"  - Average sequence length: {np.mean(sequence_lengths):.1f}")
print(f"  - Min sequence length: {min(sequence_lengths)}")
print(f"  - Max sequence length: {max(sequence_lengths)}")

# Show example final data
example_final = final_dataset[0]
print(f"\n🎯 Example final data structure:")
print(f"  - user_id: {example_final['user_id']}")
print(f"  - seq_actions (first 10): {example_final['seq_actions'][:10]}")
print(f"  - seq_ratings (first 10): {example_final['seq_ratings'][:10]}")
print(f"  - seq_times (first 10): {example_final['seq_times'][:10]}")
# 'target' is a single token string, not a list: slicing it with [:10] would
# cut the string to its first 10 characters, so it is printed whole.
print(f"  - target: {example_final['target']}")
print(f"  - sequence_length: {example_final['sequence_length']}")


Creating final dataset for next-item prediction...


100%|██████████| 6039/6039 [00:00<00:00, 66222.56it/s]


✅ Created final dataset:
  - Total sequences: 6,038
  - Average sequence length: 137.5
  - Min sequence length: 6
  - Max sequence length: 1967

🎯 Example final data structure:
  - user_id: 1
  - seq_actions (first 10): ['movie_3186', 'movie_1270', 'movie_1721', 'movie_1022', 'movie_2340', 'movie_1836', 'movie_3408', 'movie_2804', 'movie_1207', 'movie_1193']
  - seq_ratings (first 10): [4, 5, 4, 5, 3, 5, 4, 5, 4, 5]
  - seq_times (first 10): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  - target (first 10): movie_48
  - sequence_length: 52





# Step 6: Save Processed Data


In [None]:
# Save final dataset
# os.path.join is used for every output file so the result is correct whether
# or not output_path ends with a path separator.
with open(os.path.join(output_path, "movielens_final_dataset.pkl"), 'wb') as f:
    pickle.dump(final_dataset, f)

# Save vocabulary (all unique movie IDs)
# Ids 0-3 are reserved for the special tokens; movie tokens follow in sorted
# (string) order.
vocab = {
    '[PAD]': 0,
    '[UNK]': 1,
    '[CLS]': 2,
    '[SEP]': 3
}

for token_id, movie_id in enumerate(sorted(all_movie_ids), start=len(vocab)):
    vocab[movie_id] = token_id

with open(os.path.join(output_path, "movielens_vocab.json"), 'w') as f:
    json.dump(vocab, f, indent=2)

# Input-sequence lengths, computed once and reused for all length statistics
sequence_lengths = [record['sequence_length'] for record in final_dataset]

# Save metadata
metadata = {
    'dataset_name': 'MovieLens-1M',
    'task_type': 'next_item_recommendation',
    'task_description': 'Sequential recommendation (TiSASRec compatible)',
    'num_sequences': len(final_dataset),
    'vocab_size': len(vocab),
    'avg_sequence_length': float(np.mean(sequence_lengths)),
    'min_sequence_length': min(sequence_lengths),
    'max_sequence_length': max(sequence_lengths),
    'num_unique_movies': len(all_movie_ids),
    # Interactions stored in the saved dataset: each record holds its input
    # items plus one held-out target. Derived from final_dataset itself
    # rather than from the next-item target list of an earlier statistics
    # cell, which counted targets and not interactions.
    'total_interactions': sum(length + 1 for length in sequence_lengths),
    'evaluation_metrics': ['Hit Rate (HR)', 'NDCG'],
    'preprocessing_date': datetime.now().isoformat()
}

with open(os.path.join(output_path, "movielens_metadata.json"), 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✅ Saved processed data to {output_path}")
print(f"📁 Files created:")
print(f"  - movielens_final_dataset.pkl ({len(final_dataset):,} sequences)")
print(f"  - movielens_vocab.json ({len(vocab):,} tokens)")
print(f"  - movielens_metadata.json")

# Display final summary
print(f"\n🎉 MovieLens-1M preprocessing completed!")
print(f"📊 Final dataset summary:")
print(f"  - Format: seq_actions, seq_times, target")
print(f"  - Sequences: {len(final_dataset):,}")
print(f"  - Vocabulary size: {len(vocab):,}")
print(f"  - Average length: {metadata['avg_sequence_length']:.1f} movies")
print(f"  - Task: Next-item recommendation (TiSASRec compatible)")
print(f"  - Evaluation: Hit Rate (HR), NDCG")
print(f"  - Ready for MUTTI vs TiSASRec comparison! 🚀")


✅ Saved processed data to /content/drive/MyDrive/Colab Notebooks/Thesis coding/public datasets try/movielens/data/processed/
📁 Files created:
  - movielens_final_dataset.pkl (6,038 sequences)
  - movielens_vocab.json (3,632 tokens)
  - movielens_metadata.json

🎉 MovieLens-1M preprocessing completed!
📊 Final dataset summary:
  - Format: seq_actions, seq_times, target
  - Sequences: 6,038
  - Vocabulary size: 3,632
  - Average length: 137.5 movies
  - Task: Next-item recommendation (TiSASRec compatible)
  - Evaluation: Hit Rate (HR), NDCG
  - Ready for MUTTI vs TiSASRec comparison! 🚀


In [None]:
# Tabular view of the final dataset: one row per record in final_dataset
d = pd.DataFrame(final_dataset)
d.head()

Unnamed: 0,user_id,seq_actions,seq_times,seq_ratings,target,sequence_length
0,1,"[movie_3186, movie_1270, movie_1721, movie_102...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 5, 4, 5, 3, 5, 4, 5, 4, 5, 3, 4, 4, 4, 4, ...",movie_48,52
1,2,"[movie_1198, movie_1210, movie_1217, movie_271...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 3, 3, 5, 4, 5, 5, 5, 4, 4, 5, 4, 5, 4, ...",movie_1917,115
2,3,"[movie_593, movie_2858, movie_3534, movie_1968...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 4, 3, 4, 3, 4, 5, 5, 4, 5, 4, 5, 4, 4, 5, ...",movie_2081,45
3,4,"[movie_1210, movie_1097, movie_3468, movie_480...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 4, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 4, 4, 4, ...",movie_1954,18
4,5,"[movie_908, movie_919, movie_1250, movie_2858,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 5, 4, 3, 5, 4, 5, 4, 3, 4, 3, 5, 3, 3, ...",movie_1884,142


In [None]:
# Write the DataFrame to the parquet path defined in the paths section, without the index
d.to_parquet(seq_data_file_path_parquet, index=False)