# Data Processing

This notebook contains the code for reading in and processing the data.

In [2]:
# Import Packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from sklearn.model_selection import train_test_split


First we will read in the data:

In [3]:
# Read User Info Data
# Tab-separated file; rows whose JOB_TITLE is missing or an empty string are
# discarded as part of the load.
user_info = (
    pd.read_csv('data/raw/user_info.txt', sep='\t')
    .loc[lambda df: df['JOB_TITLE'].notna() & df['JOB_TITLE'].ne('')]
)

# print(user_info.head(10))
# print(len(user_info))

In [4]:
# Read Tenant Info Data
# Tab-separated file; loaded without any filtering.
tenant_info = pd.read_csv(
    'data/raw/tenant_info.txt',
    sep='\t',
)

# print(tenant_info.head(10))
# print(len(tenant_info))

In [5]:
# Read Interaction Data
# Tab-separated file; loaded without any filtering.
interactions = pd.read_csv(
    'data/raw/interactions.txt',
    sep='\t',
)

# print(interactions.head(10))
# print(len(interactions))

In [6]:
# Read Topic Data
# Tab-separated file; rows whose TOPIC is missing or an empty string are
# discarded as part of the load.
content_topics = (
    pd.read_csv('data/raw/content_topics.txt', sep='\t')
    .loc[lambda df: df['TOPIC'].notna() & df['TOPIC'].ne('')]
)

# print(content_topics.head(10))
# print(len(content_topics))

  content_topics = pd.read_csv('data/raw/content_topics.txt', sep='\t')


In [7]:
# Read Tag Data
# Tab-separated file; rows whose TAG is missing or an empty string are
# discarded as part of the load.
content_tags = (
    pd.read_csv('data/raw/content_tags.txt', sep='\t')
    .loc[lambda df: df['TAG'].notna() & df['TAG'].ne('')]
)

# print(content_tags.head(10))
# print(len(content_tags))

In [8]:
# Read Keywords Data
# Tab-separated file; rows whose KEYWORD is missing or an empty string are
# discarded as part of the load.
content_keywords = (
    pd.read_csv('data/raw/content_keywords.txt', sep='\t')
    .loc[lambda df: df['KEYWORD'].notna() & df['KEYWORD'].ne('')]
)

# print(content_keywords.head(10))
# print(len(content_keywords))

I will create and export samples for quick local viewing:

In [9]:
# Create and Export Sample Sets
# Reproducible (random_state=42) samples of up to 100 rows per table for quick
# local viewing. The size is capped at the table length because
# DataFrame.sample raises ValueError when n exceeds the row count
# (sampling without replacement).
user_info_sample = user_info.sample(n=min(100, len(user_info)), random_state=42)
tenant_info_sample = tenant_info.sample(n=min(100, len(tenant_info)), random_state=42)
interactions_sample = interactions.sample(n=min(100, len(interactions)), random_state=42)
content_topics_sample = content_topics.sample(n=min(100, len(content_topics)), random_state=42)
content_tags_sample = content_tags.sample(n=min(100, len(content_tags)), random_state=42)
content_keywords_sample = content_keywords.sample(n=min(100, len(content_keywords)), random_state=42)

# Export (CSV, index column omitted)
export_path = 'data/sample/'
user_info_sample.to_csv(f'{export_path}user_info_sample.csv', index=False)
tenant_info_sample.to_csv(f'{export_path}tenant_info_sample.csv', index=False)
interactions_sample.to_csv(f'{export_path}interactions_sample.csv', index=False)
content_topics_sample.to_csv(f'{export_path}content_topics_sample.csv', index=False)
content_tags_sample.to_csv(f'{export_path}content_tags_sample.csv', index=False)
content_keywords_sample.to_csv(f'{export_path}content_keywords_sample.csv', index=False)

Create aggregated lists of Topics, Tags, and Keywords for each piece of content:

In [10]:
# Aggregate Topic Data
# One row per CONTENT_ID. Each text column is collapsed into a ', '-separated
# string of its distinct non-null values; named aggregation sets the output
# column names directly.
# NOTE(review): 'SUPTOPIC' is the source column name this code has always read —
# presumably it matches the raw file's header; confirm before "correcting" it.
content_topics_agg = (
    content_topics
    .groupby('CONTENT_ID')
    .agg(
        TOPICS=('TOPIC', lambda values: ', '.join(values.dropna().astype(str).unique())),
        SUBTOPICS=('SUPTOPIC', lambda values: ', '.join(values.dropna().astype(str).unique())),
        TOPIC_SUBTOPICS=('TOPIC_SUBTOPIC', lambda values: ', '.join(values.dropna().astype(str).unique())),
    )
    .reset_index()
)
# Source: https://stackoverflow.com/questions/27298178/concatenate-strings-from-several-rows-using-pandas-groupby

# print(content_topics_agg.head(10))

In [11]:
# Aggregate Tag Data
# One row per CONTENT_ID; TAGS is a ', '-separated string of the distinct
# non-null TAG values for that content.
content_tags_agg = (
    content_tags
    .groupby('CONTENT_ID')
    .agg(TAGS=('TAG', lambda values: ', '.join(values.dropna().astype(str).unique())))
    .reset_index()
)

# print(content_tags_agg.head(20))

In [12]:
# Aggregate Keyword Data
# One row per CONTENT_ID; KEYWORDS is a ', '-separated string of the distinct
# non-null KEYWORD values for that content.
content_keywords_agg = (
    content_keywords
    .groupby('CONTENT_ID')
    .agg(KEYWORDS=('KEYWORD', lambda values: ', '.join(values.dropna().astype(str).unique())))
    .reset_index()
)

# print(content_keywords_agg.head(10))

Export and Re-import if needed to save time:

In [46]:
# Export Aggregated Content Data
# export_path = 'data/processed/' 
# content_topics_agg.to_csv(f'{export_path}content_topics_agg.csv', index=False)
# content_tags_agg.to_csv(f'{export_path}content_tags_agg.csv', index=False)
# content_keywords_agg.to_csv(f'{export_path}content_keywords_agg.csv', index=False)

In [47]:
# Import Aggregated Content Data
# content_topics_agg = pd.read_csv('data/processed/content_topics_agg.csv', sep=',')
# content_tags_agg = pd.read_csv('data/processed/content_tags_agg.csv', sep=',')
# content_keywords_agg = pd.read_csv('data/processed/content_keywords_agg.csv', sep=',')

Aggregate the grouped topics, tags, and keywords data into interactions. Join Tenant and User info:

In [13]:
# Copy and Aggregate Interactions Data
# Enrich the interactions with user, tenant and per-content text columns.
# Every join is a left join, so no interaction is dropped when a lookup has no
# match. merge() returns a new frame, so `interactions` itself is not modified.
interactions_agg = (
    interactions
    .merge(user_info, how='left', left_on='USER_ID', right_on='USERID')
    .drop(columns='USERID')  # Drop duplicate ID
    .merge(tenant_info, how='left', on='TENANT_ID')
    .merge(content_topics_agg, how='left', on='CONTENT_ID')
    .merge(content_tags_agg, how='left', on='CONTENT_ID')
    .merge(content_keywords_agg, how='left', on='CONTENT_ID')
)
# Source: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

# print(interactions_agg.head(10))

In [14]:
# Display Shape and Columns
print(f"Shape: {interactions_agg.shape}")
print(f"Columns: {list(interactions_agg.columns)}")

# Show Statistics
# Fill rate = share of rows holding a non-empty string. Missing values yield
# NaN from .str.len(), which compares False and therefore counts as unfilled.
print("\n")
print("Aggregation Stats:")
cols = ['INDUSTRY', 'SEGMENT', 'JOB_TITLE', 'TOPICS', 'SUBTOPICS', 'TOPIC_SUBTOPICS', 'TAGS', 'KEYWORDS']
for col in cols:
    filled_share = interactions_agg[col].str.len().gt(0).mean()
    print(f"{col}: {filled_share:.1%} filled")


Shape: (3727152, 14)
Columns: ['INTERACTION_ID', 'INTERACTION_DATE', 'INTERACTION_TYPE', 'TENANT_ID', 'USER_ID', 'CONTENT_ID', 'JOB_TITLE', 'INDUSTRY', 'SEGMENT', 'TOPICS', 'SUBTOPICS', 'TOPIC_SUBTOPICS', 'TAGS', 'KEYWORDS']


Aggregation Stats:
INDUSTRY: 84.7% filled
SEGMENT: 91.9% filled
JOB_TITLE: 23.9% filled
TOPICS: 94.2% filled
SUBTOPICS: 89.5% filled
TOPIC_SUBTOPICS: 94.2% filled
TAGS: 17.3% filled
KEYWORDS: 38.7% filled


Since job title, tags, and keywords aren't heavily populated, we will drop them. We will also drop subtopics, as we will only use them through the combined TOPIC_SUBTOPICS field. I'll also pull out 10 random users to use for validating the recommendations after the models are trained.

In [None]:
# Create Subset and Drop JOB_TITLE, TAGS, and KEYWORDS
# (SUBTOPICS is left out of the column list as well.)
# NOTE: this cell overwrites interactions_agg. Re-running it without first
# re-running the merge cell samples users from the already reduced frame.
interactions_agg = interactions_agg[['INTERACTION_ID', 'INTERACTION_DATE', 'INTERACTION_TYPE', 'TENANT_ID', 'USER_ID', 'CONTENT_ID', 'INDUSTRY', 'SEGMENT', 'TOPICS', 'TOPIC_SUBTOPICS']]

# Select 10 random users for validation set
# The count is capped at the number of distinct users so sample() cannot raise
# ValueError on a very small dataset.
unique_users = interactions_agg['USER_ID'].drop_duplicates()
random_users = unique_users.sample(n=min(10, len(unique_users)), random_state=42)

# One membership mask drives both sides of the split:
# validation set = all interactions of the selected users, main set = the rest.
is_validation_user = interactions_agg['USER_ID'].isin(random_users)
interactions_agg_val = interactions_agg[is_validation_user]
interactions_agg = interactions_agg[~is_validation_user]

# Export dataset
export_path = 'data/processed/' 
interactions_agg.to_csv(f'{export_path}interactions_agg.csv', index=False)
interactions_agg_val.to_csv(f'{export_path}interactions_agg_val.csv', index=False)

# print(interactions_agg.head(10))

# Create and Export Sample Set (up to 100 rows, capped at the frame length)
interactions_agg_sample = interactions_agg.sample(n=min(100, len(interactions_agg)), random_state=42)

# Export
export_path = 'data/sample/'
interactions_agg_sample.to_csv(f'{export_path}interactions_agg_sample.csv', index=False)

For the purpose of content recommendations, it might be better to train on a list without aggregated values. I will create a 'long' dataset with an entry for each aggregate combo:

In [None]:
# Create 'Long' Dataset
# One row per (interaction, topic row): content_topics is joined un-aggregated,
# so an interaction whose content has several topic rows appears several times.
# The whole transformation is a single chain that always yields a fresh frame,
# which avoids assigning a column on a filtered slice (SettingWithCopyWarning).
# NOTE(review): JOB_TITLE is not among the kept columns, so the user_info join
# presumably only matters if USERID is not unique in user_info — confirm whether
# the join is still needed here.
long_columns = ['INTERACTION_ID', 'INTERACTION_DATE', 'INTERACTION_TYPE', 'TENANT_ID', 'USER_ID', 'CONTENT_ID', 'INDUSTRY', 'SEGMENT', 'TOPIC', 'TOPIC_SUBTOPIC']

interactions_agg_long = (
    interactions
    .merge(user_info, how='left', left_on='USER_ID', right_on='USERID')
    .drop(columns='USERID')  # Drop duplicate ID
    .merge(tenant_info, how='left', on='TENANT_ID')
    .merge(content_topics, how='left', on='CONTENT_ID')
    # Grab relevant columns
    .loc[:, long_columns]
    # Remove rows with missing data in any of the modelling columns
    .dropna(subset=['INDUSTRY', 'SEGMENT', 'TOPIC', 'TOPIC_SUBTOPIC'])
    # Replace 'Engineering' and 'Architecture' with 'Multidiscipline (Arch & Eng)'
    .assign(
        INDUSTRY=lambda df: df['INDUSTRY'].replace(
            ['Engineering', 'Architecture'], 'Multidiscipline (Arch & Eng)'
        )
    )
    # Remove rows where INDUSTRY is 'Unknown'
    .loc[lambda df: df['INDUSTRY'] != 'Unknown']
)
# Source: https://www.geeksforgeeks.org/python/drop-rows-from-pandas-dataframe-with-missing-values-or-nan-in-columns/

# Split off the same 10 validation users chosen earlier (random_users);
# a single mask drives both sides of the split.
is_validation_user = interactions_agg_long['USER_ID'].isin(random_users)
interactions_agg_long_val = interactions_agg_long[is_validation_user]
interactions_agg_long = interactions_agg_long[~is_validation_user]

# Export
export_path = 'data/processed/'
interactions_agg_long.to_csv(f'{export_path}interactions_agg_long.csv', index=False)
interactions_agg_long_val.to_csv(f'{export_path}interactions_agg_long_val.csv', index=False)

# print(interactions_agg_long.head(10))

In [17]:
# Display Shape and Columns
print(f"Shape: {interactions_agg_long.shape}")
print(f"Columns: {list(interactions_agg_long.columns)}")

# Show Statistics
# Fill rate = share of rows holding a non-empty string in each column.
print("\n")
print("Aggregation Stats:")
cols = ['INDUSTRY', 'SEGMENT', 'TOPIC', 'TOPIC_SUBTOPIC']
for col in cols:
    filled_share = interactions_agg_long[col].str.len().gt(0).mean()
    print(f"{col}: {filled_share:.1%} filled")

Shape: (4919195, 10)
Columns: ['INTERACTION_ID', 'INTERACTION_DATE', 'INTERACTION_TYPE', 'TENANT_ID', 'USER_ID', 'CONTENT_ID', 'INDUSTRY', 'SEGMENT', 'TOPIC', 'TOPIC_SUBTOPIC']


Aggregation Stats:
INDUSTRY: 100.0% filled
SEGMENT: 100.0% filled
TOPIC: 100.0% filled
TOPIC_SUBTOPIC: 100.0% filled


In [18]:
# Create and Export Sample Set
# Up to 100 rows; capped at the frame length because DataFrame.sample raises
# ValueError when n exceeds the number of rows.
interactions_agg_long_sample = interactions_agg_long.sample(
    n=min(100, len(interactions_agg_long)),
    random_state=42,
)

# Export Sample
export_path = 'data/sample/'
interactions_agg_long_sample.to_csv(f'{export_path}interactions_agg_long_sample.csv', index=False)

# Export Full
export_path = 'data/processed/'
interactions_agg_long.to_csv(f'{export_path}interactions_agg_long.csv', index=False)

We will encode each of the values for use later on.

In [29]:
export_path = 'data/processed/'


def build_label_encoding(frame, column):
    """Fit a LabelEncoder on the distinct non-null values of ``frame[column]``.

    Parameters:
        frame: DataFrame that contains ``column``.
        column: name of the column to encode.

    Returns:
        A tuple ``(unique_values, encoder, codes, lookup)``:
        ``unique_values`` are the distinct non-null values, ``encoder`` is the
        fitted LabelEncoder, ``codes`` is the output of ``fit_transform`` on
        those values, and ``lookup`` is a DataFrame with the columns
        ``column`` and ``f'{column}_encoded'``, sorted by ``column`` with a
        fresh index.
    """
    unique_values = frame[column].dropna().unique()
    encoder = LabelEncoder()
    codes = encoder.fit_transform(unique_values)
    lookup = (
        pd.DataFrame({column: unique_values, f'{column}_encoded': codes})
        .sort_values(by=column)
        .reset_index(drop=True)
    )
    return unique_values, encoder, codes, lookup


# NOTE(review): interactions_agg_long no longer contains the 10 validation users
# at this point, so a value that occurs only for those users gets no code and
# shows up as NaN in later left-join lookups — confirm this is intended.

# INDUSTRY Encoding
industry_unique, industry_le, industry_encoded, industry_df = build_label_encoding(interactions_agg_long, 'INDUSTRY')
industry_df.to_csv(f'{export_path}industry_enc.csv', index=False)

# SEGMENT Encoding
segment_unique, segment_le, segment_encoded, segment_df = build_label_encoding(interactions_agg_long, 'SEGMENT')
segment_df.to_csv(f'{export_path}segment_enc.csv', index=False)

# TOPIC Encoding
topic_unique, topic_le, topic_encoded, topic_df = build_label_encoding(interactions_agg_long, 'TOPIC')
topic_df.to_csv(f'{export_path}topic_enc.csv', index=False)

# TOPIC_SUBTOPIC Encoding
topic_subtopic_unique, topic_subtopic_le, topic_subtopic_encoded, topic_subtopic_df = build_label_encoding(interactions_agg_long, 'TOPIC_SUBTOPIC')
topic_subtopic_df.to_csv(f'{export_path}topic_subtopic_enc.csv', index=False)

Create KNN Dataset: 
This dataset will have a row for every user and a column for each topic. Each cell holds the percentage of that user's interactions that fall under the topic (num_interactions / total_interactions * 100).

In [None]:
# Combine the training and validation frames so every user gets a row; the 10
# validation users are split off again at the end of this cell.
interactions_knn = pd.concat([interactions_agg_long, interactions_agg_long_val])

# Group by user (first observed INDUSTRY / SEGMENT / TENANT_ID per user), then
# attach the encoded values from the lookup tables.
# NOTE(review): the lookup tables were built without the validation users, so
# their INDUSTRY_encoded / SEGMENT_encoded may be NaN for unseen values — confirm.
grouped_users = (
    interactions_knn
    .groupby('USER_ID')
    .agg({'INDUSTRY': 'first', 'SEGMENT': 'first', 'TENANT_ID': 'first'})
    .reset_index()
    .merge(industry_df[['INDUSTRY', 'INDUSTRY_encoded']], on='INDUSTRY', how='left')
    .merge(segment_df[['SEGMENT', 'SEGMENT_encoded']], on='SEGMENT', how='left')
)

# Order columns
grouped_users = grouped_users[['USER_ID', 'TENANT_ID', 'INDUSTRY', 'INDUSTRY_encoded', 'SEGMENT', 'SEGMENT_encoded']]

# Count rows per (user, topic); users become the index, topics the columns
topic_pivot = interactions_knn.groupby(['USER_ID', 'TOPIC']).size().unstack(fill_value=0)
# NOTICE: Claude assisted me with this line of code. I was originally trying a loop, but this wasn't working. Claude suggested group_by + .size()

# Grab topic column names
topic_cols = list(topic_pivot.columns)

# Calculate total interactions per user
total_interactions = topic_pivot.sum(axis=1).replace(0, 1)  # replace 0 with 1 to avoid division by zero

# Convert counts to percentages: one vectorized row-wise division instead of a
# Python loop over the topic columns
topic_pivot = topic_pivot.div(total_interactions, axis=0).mul(100).round(2)

# Merge grouped users with topic percentages
knn_dataset = grouped_users.merge(
    topic_pivot,
    how='left',
    on='USER_ID'
)

# Fill null values with 0
knn_dataset[topic_cols] = knn_dataset[topic_cols].fillna(0)

# Configure validation set with the 10 random users (one mask, both sides)
is_validation_user = knn_dataset['USER_ID'].isin(random_users)
knn_dataset_val = knn_dataset[is_validation_user]
knn_dataset = knn_dataset[~is_validation_user]

# print(knn_dataset.head(1))

In [None]:
# Export
# Write the validation and training KNN tables as CSV without the index column.
export_path = 'data/processed/'
knn_dataset_val.to_csv(f'{export_path}knn_dataset_val.csv', index=False)
knn_dataset.to_csv(f'{export_path}knn_dataset.csv', index=False)

Create Neural Network Dataset:
Each row in this dataset will be a collection of 5 sequential content interactions for a user. The interactions will be represented by their topic_subtopic, with a target for classification. The actual content_id of the target will also be included as this will be used for judging recommendation accuracy. 

In [None]:
# Work on a separate frame so interactions_agg_long keeps its original
# INTERACTION_DATE column; assign() returns a new frame with the date parsed.
neural_network_set = interactions_agg_long.assign(
    INTERACTION_DATE=lambda df: pd.to_datetime(df['INTERACTION_DATE'])
)

# Order each user's rows chronologically and renumber the index
neural_newtork_sorted = (
    neural_network_set
    .sort_values(by=['USER_ID', 'INTERACTION_DATE'])
    .reset_index(drop=True)
)

# Function to create sequences for a single user
def create_user_sequences(user_data, sequence_length):
    """Turn one user's rows into sliding-window sequence records.

    Parameters:
        user_data: DataFrame with the columns USER_ID, TENANT_ID, INDUSTRY,
            SEGMENT, TOPIC_SUBTOPIC, CONTENT_ID and TOPIC. Rows are used in the
            order given; the caller is expected to have sorted them.
        sequence_length: number of consecutive TOPIC_SUBTOPIC values per record.

    Returns:
        A list of dicts, one per window. Each dict holds ``user_id``,
        ``tenant_id``, ``industry`` and ``segment`` (all read from the first
        row), ``topic_subtopic_1`` .. ``topic_subtopic_<sequence_length>``, and
        the row right after the window as ``target_topic_subtopic``,
        ``target_content_id`` and ``target_topic``. The list is empty when
        there are fewer than ``sequence_length + 1`` rows.
    """
    rows = user_data.reset_index(drop=True)

    # A window needs sequence_length history rows plus one target row
    window_count = len(rows) - sequence_length
    if window_count < 1:
        return []

    # Per-user constants, read once from the first row
    shared_fields = {
        'user_id': rows['USER_ID'].iloc[0],
        'tenant_id': rows['TENANT_ID'].iloc[0],
        'industry': rows['INDUSTRY'].iloc[0],
        'segment': rows['SEGMENT'].iloc[0],
    }

    subtopic_values = rows['TOPIC_SUBTOPIC'].values
    content_values = rows['CONTENT_ID'].values
    topic_values = rows['TOPIC'].values

    # Sliding window
    # Source: https://www.geeksforgeeks.org/dsa/window-sliding-technique/
    records = []
    for start in range(window_count):
        target_pos = start + sequence_length
        record = dict(shared_fields)
        record.update({
            f'topic_subtopic_{offset + 1}': subtopic_values[start + offset]
            for offset in range(sequence_length)
        })
        # Target fields: the row immediately after the window
        record['target_topic_subtopic'] = subtopic_values[target_pos]
        record['target_content_id'] = content_values[target_pos]
        record['target_topic'] = topic_values[target_pos]
        records.append(record)

    return records

# Process each user and create sequences
sequence_length = 5  # history steps per row; also fixes the topic_subtopic_1..5 column names
all_sequences = []
for user_id, user_group in neural_newtork_sorted.groupby('USER_ID'):
    user_sequences = create_user_sequences(user_group, sequence_length)
    all_sequences.extend(user_sequences)

# Create DF from sequences
neural_network_dataset = pd.DataFrame(all_sequences)

# Store the original target_topic as target_topic_eng before encoding
neural_network_dataset['target_topic_eng'] = neural_network_dataset['target_topic'].copy()

# Map encoded values - NOTICE: Claude helped me implement this code. I knew conceptually what I wanted to achieve (as done so for the KNN dataset), Claude gave me the skeleton to incorporate here.
# Each entry is (lookup table, label column in that table, dataset columns to
# encode). The labels are replaced in place by their codes with Series.map,
# which avoids a full-frame merge per column. Labels missing from a lookup
# table become NaN, exactly as a left merge would leave them.
history_columns = [f'topic_subtopic_{step}' for step in range(1, sequence_length + 1)]
encoding_plan = [
    (industry_df, 'INDUSTRY', ['industry']),
    (segment_df, 'SEGMENT', ['segment']),
    (topic_subtopic_df, 'TOPIC_SUBTOPIC', history_columns + ['target_topic_subtopic']),
    (topic_df, 'TOPIC', ['target_topic']),
]
for lookup_table, label_column, columns_to_encode in encoding_plan:
    # Series indexed by label, holding that label's code
    label_to_code = lookup_table.set_index(label_column)[f'{label_column}_encoded']
    for column in columns_to_encode:
        neural_network_dataset[column] = neural_network_dataset[column].map(label_to_code)

# print(neural_network_dataset.head(1))

In [39]:
# Export
# Write the training sequences as CSV without the index column.
export_path = 'data/processed/'
neural_network_dataset.to_csv(
    f'{export_path}neural_network_dataset.csv',
    index=False,
)

In [40]:
# Create and Export Sample Set
# Up to 100 rows each; capped at the frame length because DataFrame.sample
# raises ValueError when n exceeds the number of rows.
knn_dataset_sample = knn_dataset.sample(n=min(100, len(knn_dataset)), random_state=42)
neural_network_dataset_sample = neural_network_dataset.sample(
    n=min(100, len(neural_network_dataset)),
    random_state=42,
)

# Export
export_path = 'data/sample/'
knn_dataset_sample.to_csv(f'{export_path}knn_dataset_sample.csv', index=False)
neural_network_dataset_sample.to_csv(f'{export_path}neural_network_dataset_sample.csv', index=False)

In [None]:
# Repeat for Validation dataset
# Separate frame for the validation users with INTERACTION_DATE parsed to
# datetime; assign() returns a new frame, so interactions_agg_long_val is untouched.
neural_network_set_val = interactions_agg_long_val.assign(
    INTERACTION_DATE=lambda df: pd.to_datetime(df['INTERACTION_DATE'])
)

# Order each user's rows chronologically and renumber the index
neural_newtork_sorted_val = (
    neural_network_set_val
    .sort_values(by=['USER_ID', 'INTERACTION_DATE'])
    .reset_index(drop=True)
)

# Function to create sequences for a single user
def create_user_sequences(user_data, sequence_length):
    """Turn one user's rows into sliding-window sequence records.

    NOTE: this notebook already defines a function with this name and the same
    behaviour in the training-set cell; this copy shadows it when the cell runs.

    Parameters:
        user_data: DataFrame with the columns USER_ID, TENANT_ID, INDUSTRY,
            SEGMENT, TOPIC_SUBTOPIC, CONTENT_ID and TOPIC. Rows are used in the
            order given; the caller is expected to have sorted them.
        sequence_length: number of consecutive TOPIC_SUBTOPIC values per record.

    Returns:
        A list of dicts, one per window. Each dict holds ``user_id``,
        ``tenant_id``, ``industry`` and ``segment`` (all read from the first
        row), ``topic_subtopic_1`` .. ``topic_subtopic_<sequence_length>``, and
        the row right after the window as ``target_topic_subtopic``,
        ``target_content_id`` and ``target_topic``. The list is empty when
        there are fewer than ``sequence_length + 1`` rows.
    """
    rows = user_data.reset_index(drop=True)

    # A window needs sequence_length history rows plus one target row
    window_count = len(rows) - sequence_length
    if window_count < 1:
        return []

    # Per-user constants, read once from the first row
    shared_fields = {
        'user_id': rows['USER_ID'].iloc[0],
        'tenant_id': rows['TENANT_ID'].iloc[0],
        'industry': rows['INDUSTRY'].iloc[0],
        'segment': rows['SEGMENT'].iloc[0],
    }

    subtopic_values = rows['TOPIC_SUBTOPIC'].values
    content_values = rows['CONTENT_ID'].values
    topic_values = rows['TOPIC'].values

    # Sliding window
    # Source: https://www.geeksforgeeks.org/dsa/window-sliding-technique/
    records = []
    for start in range(window_count):
        target_pos = start + sequence_length
        record = dict(shared_fields)
        record.update({
            f'topic_subtopic_{offset + 1}': subtopic_values[start + offset]
            for offset in range(sequence_length)
        })
        # Target fields: the row immediately after the window
        record['target_topic_subtopic'] = subtopic_values[target_pos]
        record['target_content_id'] = content_values[target_pos]
        record['target_topic'] = topic_values[target_pos]
        records.append(record)

    return records

# Process each user and create sequences
sequence_length = 5  # history steps per row; also fixes the topic_subtopic_1..5 column names
all_sequences_val = []
for user_id, user_group in neural_newtork_sorted_val.groupby('USER_ID'):
    user_sequences = create_user_sequences(user_group, sequence_length)
    all_sequences_val.extend(user_sequences)

# Create DF from sequences
neural_network_dataset_val = pd.DataFrame(all_sequences_val)

# Store the original target_topic as target_topic_eng before encoding
neural_network_dataset_val['target_topic_eng'] = neural_network_dataset_val['target_topic'].copy()

# Map encoded values - NOTICE: Claude helped me implement this code. I knew conceptually what I wanted to achieve (as done so for the KNN dataset), Claude gave me the skeleton to incorporate here.
# Each entry is (lookup table, label column in that table, dataset columns to
# encode). The labels are replaced in place by their codes with Series.map,
# which avoids a full-frame merge per column. Labels missing from a lookup
# table become NaN, exactly as a left merge would leave them.
history_columns = [f'topic_subtopic_{step}' for step in range(1, sequence_length + 1)]
encoding_plan = [
    (industry_df, 'INDUSTRY', ['industry']),
    (segment_df, 'SEGMENT', ['segment']),
    (topic_subtopic_df, 'TOPIC_SUBTOPIC', history_columns + ['target_topic_subtopic']),
    (topic_df, 'TOPIC', ['target_topic']),
]
for lookup_table, label_column, columns_to_encode in encoding_plan:
    # Series indexed by label, holding that label's code
    label_to_code = lookup_table.set_index(label_column)[f'{label_column}_encoded']
    for column in columns_to_encode:
        neural_network_dataset_val[column] = neural_network_dataset_val[column].map(label_to_code)

# Export
export_path = 'data/processed/'
neural_network_dataset_val.to_csv(f'{export_path}neural_network_dataset_val.csv', index=False)

Split knn_dataset and neural_network_dataset into test and train:
* Update, this is now being done in the model_training.ipynb file due to some additional feature engineering that takes place.

In [None]:
# # Basic 80/20 Split
# knn_train, knn_test = train_test_split(knn_dataset, test_size=0.2, random_state=42,)
# neural_network_train, neural_network_test = train_test_split(neural_network_dataset, test_size=0.2, random_state=42,)
# # Source: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

# # Export
# export_path = 'data/processed/'
# knn_train.to_csv(f'{export_path}knn_train.csv', index=False)
# knn_test.to_csv(f'{export_path}knn_test.csv', index=False)
# neural_network_train.to_csv(f'{export_path}neural_network_train.csv', index=False)
# neural_network_test.to_csv(f'{export_path}neural_network_test.csv', index=False)