# Assignment 2
## Twitch Recommendation System
Authors: Alex Cojocaru, Kyle Jorrin, Diego Otero-Caldwell, Alec Drumm

Research articles to get started:\
[YouTube Recommendation Algorithm](https://dl.acm.org/doi/abs/10.1145/1864708.1864770)\
[Overview Of Recommender Systems](https://search-library.ucsd.edu/permalink/01UCS_SDI/1vtf07t/cdi_doaj_primary_oai_doaj_org_article_e1fff15ae9b64b96915b66bc5dc81ac5)\
[FPMC](https://dl.acm.org/doi/abs/10.1145/1772690.1772773)\
...




In [20]:
# If first time, run this script to set up virtual environment
# Requires python3.11
!chmod +x scripts/setup_env.sh
!./scripts/setup_env.sh

./scripts/setup_env.sh: line 3: python3.11: command not found


Data link: https://cseweb.ucsd.edu/~jmcauley/datasets.html#twitch

Start and stop times are provided as integers and represent periods of 10 minutes. Stream ID could be used to retrieve a single broadcast segment from a streamer (not used in our work).

    User ID (anonymized)
    Stream ID
    Streamer username
    Time start
    Time stop

[Original research paper of the data](https://search-library.ucsd.edu/permalink/01UCS_SDI/1vtf07t/cdi_unpaywall_primary_10_1145_3460231_3474267)


Load in the data

In [21]:
# Imports
import pandas as pd
import numpy as np

In [22]:
# Data with header names
data = pd.read_csv('100k_a.csv', names=['user_id', 'stream_id', 'streamer_username', 'time_start', 'time_stop'])
print(type(data.iloc[5,1]))
data.head()

<class 'numpy.int64'>


Unnamed: 0,user_id,stream_id,streamer_username,time_start,time_stop
0,1,33842865744,mithrain,154,156
1,1,33846768288,alptv,166,169
2,1,33886469056,mithrain,587,588
3,1,33887624992,wtcn,589,591
4,1,33890145056,jrokezftw,591,594


In [23]:
# Create train and test splits temporally sorted by time_start
data = data.sort_values('time_start').reset_index(drop=True)
split_point = int(len(data) * 0.8)

train_data = data.iloc[:split_point].sample(frac=1)
test_data = data.iloc[split_point:].sample(frac=1)

print('train_data entries:', len(train_data), train_data.head(), sep='\n')
print('test_data entries:', len(test_data), test_data.head(), sep='\n')

# Count novel user-streamer pairs in test data
tst_sample = test_data.sample(frac=1/10000)
trn_sample = train_data.sample(frac=1/10000)

# Build a fast lookup set
train_pairs = set(zip(train_data['user_id'], train_data['streamer_username']))

novel_pairs = 0

for row in tst_sample.itertuples(index=False):
    if (row.user_id, row.streamer_username) not in train_pairs:
        novel_pairs += 1

# for row in tst_sample.itertuples(index=True):
#     # print(row)
#     test_user = row.user_id
#     test_streamer = row.streamer_username
    
#     if train_data[(trn_sample['user_id'] == test_user) & 
#                   (trn_sample['streamer_username'] == test_streamer)].empty:
#         novel_pairs += 1

print(f'Number of test rows with novel user-streamer pairs: {novel_pairs}')
print(f'Percentage of novel pairs in test data: {novel_pairs / len(test_data) * 100:.2f}%')

train_data entries:
2441386
         user_id    stream_id streamer_username  time_start  time_stop
1953051    45782  34209914672          mrsavage        4021       4027
1421947     4180  34116179072         immortoru        2990       2992
332954     70831  33898739136         uccleague         710        712
2195019    52677  34247211760        teamsp00ky        4484       4485
1613237    15702  34145283280       thedarkness        3357       3359
test_data entries:
610347
         user_id    stream_id  streamer_username  time_start  time_stop
2994499     9878  34405885312  avoidingthepuddle        6028       6033
2661542    83721  34346879136        djboucherie        5417       5419
2455752    86418  34308943008     montanablack88        5016       5017
2929563    58962  34396443312         banditgang        5906       5910
2748913    43311  34362497184       darkest_mage        5574       5576
Number of test rows with novel user-streamer pairs: 30
Percentage of novel pairs in test

In [24]:
# ----BROKEN DON'T USE----
# find the ratio of user-streamer pairs that are repeated from the training set in the test set
novel_views = 0

for row in test_data:
    #if not train_data[train_data['streamer_username'] == row['streamer_username'] and train_data['user_id'] == row['user_id']].empty:
    print(row)
    test_user_id = row[0]
    test_streamer = row[2]
    temp_df = train_data[train_data['user_id'] == test_user_id]
    temp_df = temp_df[temp_df['streamer_username'] == test_streamer]

    if temp_df.empty:
        novel_views += 1
    
print('percentage of novel views in test data:', novel_views/len(test_data))

user_id
stream_id
streamer_username
time_start
time_stop
percentage of novel views in test data: 8.19206123729616e-06


### Training the model (Part 3)
#### From section 4.2 of the paper

In [25]:
import pandas as pd
from scipy.sparse import csr_matrix

# Group by user_id and streamer_username, count interactions
interaction_counts = train_data.groupby(['user_id', 'streamer_username']).size().reset_index(name='count')

# Map user_id and streamer_username to indices for the matrix
user_ids = interaction_counts['user_id'].unique()
streamer_usernames = interaction_counts['streamer_username'].unique()

user_to_idx = {user: idx for idx, user in enumerate(user_ids)}
streamer_to_idx = {streamer: idx for idx, streamer in enumerate(streamer_usernames)}
idx_to_streamer = {idx: streamer for idx, streamer in enumerate(streamer_usernames)}


# Prepare data for sparse matrix
rows = interaction_counts['user_id'].map(user_to_idx)
cols = interaction_counts['streamer_username'].map(streamer_to_idx)
values = interaction_counts['count']

# Create sparse matrix (users x streamers)
user_streamer_matrix = csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(streamer_usernames)))
user_streamer_matrix.shape

(98184, 143670)

In [26]:
# Replace surprise with implicit
import implicit

# Train ALS model
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(user_streamer_matrix)

# Recommend for user 0
recommendations, scores = model.recommend(0, user_streamer_matrix[0])
print("Recommended items:", recommendations)
print("Scores:", scores)

ModuleNotFoundError: No module named 'implicit'

In [None]:
# Testing the model - get testing recs
test_users = test_data['user_id']
user_set = set()

for user in test_users:
  if user in user_to_idx:
    user_set.add(user_to_idx[user])

# Convert user_set to a list for iteration and get recommendations for each user
all_recommendations = []
all_scores = []

for user_idx in user_set:
    recs, scores = model.recommend(user_idx, user_streamer_matrix[user_idx])
    all_recommendations.append((user_idx, recs))
    all_scores.append((user_idx, scores))

print("Recommended items:", all_recommendations)
print("Scores:", all_scores)

In [None]:
# Evaluating the model with hit@1
total_num_recs = len(all_recommendations)
num_hit_at_1 = 0

for uid, rec in all_recommendations:
  # filter the test data for only rows with the given user, then filter that for rows with the top reccomended steamer
  temp_df = test_data[test_data['user_id'] == uid]
  temp_df = temp_df[temp_df['streamer_username'] == idx_to_streamer[rec[0]]]
  if not temp_df.empty:
    num_hit_at_1 += 1

print('Hit@1 prediction accuracy:', num_hit_at_1 / total_num_recs)

In [None]:
# Evaluating the model with hit@10
num_hit_at_10 = 0
# i=0

for uid, rec in all_recommendations:
  # filter the test data for only rows with the given user, then filter that for rows with the top reccomended steamer
  temp_df = test_data[test_data['user_id'] == uid]
  for r in rec:
    # if i<10:
    #     print('idx_to_streamer[r]', idx_to_streamer[r], "temp_df['streamer_username']", temp_df['streamer_username'].to_numpy())
    #     i += 1
    if idx_to_streamer[r] in temp_df['streamer_username'].to_numpy():
      num_hit_at_10 += 1
      break


print('Hit@10 prediction accuracy:', num_hit_at_10 / total_num_recs)


### REP baseline (repeat-interaction model)


In [28]:
# Build REP rankings: score equals repeated views in the training sequence
rep_counts = (
    train_data.groupby(['user_id', 'streamer_username'])
    .size()
    .reset_index(name='views')
)

rep_rankings = (
    rep_counts.sort_values(['user_id', 'views'], ascending=[True, False])
    .groupby('user_id')['streamer_username']
    .apply(list)
    .to_dict()
)


def evaluate_rep(rankings, test_df, k=10):
    user_targets = test_df.groupby('user_id')['streamer_username'].apply(set).to_dict()
    total_users = hit_at_1 = hit_at_k = 0

    for user, ordered_streamers in rankings.items():
        targets = user_targets.get(user)
        if not targets:
            continue

        total_users += 1
        topk = ordered_streamers[:k]
        hit_at_1 += bool(topk) and topk[0] in targets
        hit_at_k += any(streamer in targets for streamer in topk)

    return {
        'users_evaluated': total_users,
        'hit@1': hit_at_1 / total_users if total_users else 0.0,
        f'hit@{k}': hit_at_k / total_users if total_users else 0.0,
    }

rep_metrics = evaluate_rep(rep_rankings, test_data, k=10)
pd.Series(rep_metrics)



users_evaluated    76144.000000
hit@1                  0.513369
hit@10                 0.800720
dtype: float64