### Load u.data into Python

In [1]:
import pandas as pd

# Read the raw ratings file. It is tab-separated and has no header row,
# so the four column names are supplied by hand.
cols = ["user_id", "item_id", "rating", "timestamp"]
df = pd.read_csv(
    "u.data",
    sep="\t",
    header=None,  # first line is data, not column names
    names=cols,
)

# Quick look at the first five rows
print(df.head(5))

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


### Pivot to User × Item matrix

In [2]:
# Reshape the long ratings table into a wide grid: one row per user_id,
# one column per item_id, each cell holding that user's rating.
ratings_wide = df.pivot(index="user_id", columns="item_id", values="rating")

# (user, item) pairs without a rating come out of the pivot as NaN;
# store them as 0 instead.
R_full = ratings_wide.fillna(0)

print(R_full.shape)  # (943, 1682): 943 users × 1682 movies

(943, 1682)


### Subsample to 100 × 20

In [3]:
# Subsample parameters, named so they can be tuned in one place.
N_TOP_MOVIES = 20   # how many of the most-rated movies to keep as columns
N_USERS = 100       # how many users the assignment asks for
SAMPLE_SEED = 42    # fixed seed so the same users are drawn on every run

# Select the N_TOP_MOVIES most rated movies
# (value_counts() sorts by count, most frequent first)
top20_movies = df["item_id"].value_counts().head(N_TOP_MOVIES).index

# Filter matrix to those movies
R_top20 = R_full[top20_movies]

# Keep only users who rated at least one of these movies.
# R_full uses 0 as the fill value for "no rating", so a user qualifies
# when any entry in their row is greater than 0.
valid_users = R_top20.index[(R_top20 > 0).any(axis=1)]
R_top20 = R_top20.loc[valid_users]

# Draw N_USERS random users from the eligible ones.
# To keep every eligible user instead (936 in this dataset), pass
# n=len(R_top20) to sample().
R_subset = R_top20.sample(n=N_USERS, random_state=SAMPLE_SEED)

print(R_subset.shape)  # (100, 20)

(100, 20)


### Get your matrix R

In [4]:
# Replace the original item_id / user_id labels with positional ones:
# columns become "course_0", "course_1", ... and rows become "user_0", "user_1", ...
n_users, n_courses = R_subset.shape
R_subset.columns = [f"course_{i}" for i in range(n_courses)]
R_subset.index = [f"user_{i}" for i in range(n_users)]

print(R_subset.shape)  # (100, 20)
print(R_subset.head())

(100, 20)
        course_0  course_1  course_2  course_3  course_4  course_5  course_6  \
user_0       5.0       0.0       0.0       4.0       0.0       0.0       0.0   
user_1       2.0       0.0       5.0       1.0       0.0       0.0       0.0   
user_2       5.0       4.0       5.0       4.0       3.0       3.0       4.0   
user_3       3.0       0.0       5.0       0.0       0.0       3.0       0.0   
user_4       4.0       0.0       0.0       0.0       4.0       0.0       0.0   

        course_7  course_8  course_9  course_10  course_11  course_12  \
user_0       3.0       0.0       0.0        4.0        1.0        2.0   
user_1       4.0       0.0       3.0        5.0        5.0        5.0   
user_2       2.0       0.0       5.0        5.0        5.0        5.0   
user_3       0.0       0.0       0.0        0.0        0.0        0.0   
user_4       5.0       0.0       0.0        0.0        0.0        4.0   

        course_13  course_14  course_15  course_16  course_17  course_

In [5]:
# Write R_subset to disk as CSV; index=True keeps the row labels
# as the first column of the file.
output_file = "user_course_ratings.csv"
R_subset.to_csv(output_file, index=True)
