# Data Loading & Processing (Setup)

## Data Loading

### Import libraries

In [1]:
import kagglehub
import os
import shutil

### Download datasets

In [2]:
# Download the MovieLens 100K dataset via KaggleHub and copy its files into
# the project's raw-data folder.
dest_folder = "../data/raw"
print("Downloading MovieLens 100K from KaggleHub...")
dataset_path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")

# The data files are expected inside an "ml-100k" subfolder of the download location.
dataset_subfolder = os.path.join(dataset_path, "ml-100k")
if not os.path.isdir(dataset_subfolder):
    # FileNotFoundError is a subclass of Exception, so handlers written for the
    # previous generic Exception keep working while the failure is more specific.
    raise FileNotFoundError(f"Expected 'ml-100k' folder not found in {dataset_path}")

os.makedirs(dest_folder, exist_ok=True)

# Copy each regular file inside ml-100k to dest_folder.
# os.listdir also returns subdirectories, and shutil.copy2 raises on a directory,
# so anything that is not a regular file is skipped instead of aborting the copy.
for filename in os.listdir(dataset_subfolder):
    src_file = os.path.join(dataset_subfolder, filename)
    if not os.path.isfile(src_file):
        continue
    dst_file = os.path.join(dest_folder, filename)
    shutil.copy2(src_file, dst_file)  # copy2 also attempts to preserve file metadata

print(f"Dataset successfully copied to: {dest_folder}")

Downloading MovieLens 100K from KaggleHub...
Dataset successfully copied to: ../data/raw


## Data Preprocessing

### Setup folders and import

In [6]:
import pandas as pd

# Create the output folder for the converted CSV files.
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(name="../data/processed", exist_ok=True)

### Convert u.data, u.item, u.user, u.occupation, and u.genre to CSV files

In [7]:
# Ratings: tab-separated file with no header row, so column names are supplied here.
ratings = pd.read_csv("../data/raw/u.data", sep='\t', header=None,
                      names=["user_id", "item_id", "rating", "timestamp"])
# index=False keeps the DataFrame's positional index out of the CSV.
ratings.to_csv("../data/processed/ratings.csv", index=False)
# No `ratings.head()` here: a notebook cell renders only its final expression,
# so a preview placed mid-cell was evaluated and silently discarded.

# Column names for u.item (the file has no header row): five descriptive fields
# followed by 19 columns named after genres.
# NOTE(review): the genre columns are presumably 0/1 membership flags — confirm
# against the dataset README.
movie_columns = [
    "item_id", "title", "release_date", "video_release_date", "IMDb_URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Movies: pipe-separated and decoded as latin-1 rather than pandas' default UTF-8.
movies = pd.read_csv("../data/raw/u.item", sep='|', encoding='latin-1', header=None, names=movie_columns)
# index=False keeps the DataFrame's positional index out of the CSV.
movies.to_csv("../data/processed/movies.csv", index=False)
# No `movies.head()` here: a notebook cell renders only its final expression,
# so a preview placed mid-cell was evaluated and silently discarded.

# Users: pipe-separated file with no header row, so column names are supplied here.
# zip_code is read as text: postal codes are identifiers, and letting pandas infer
# an integer dtype would drop leading zeros.
users = pd.read_csv("../data/raw/u.user", sep='|', header=None,
                    names=["user_id", "age", "gender", "occupation", "zip_code"],
                    dtype={"zip_code": str})
# index=False keeps the DataFrame's positional index out of the CSV.
users.to_csv("../data/processed/users.csv", index=False)
# No `users.head()` here: a notebook cell renders only its final expression,
# so a preview placed mid-cell was evaluated and silently discarded.

# Occupations: read with pandas' default comma separator into a single
# "occupation" column; the file has no header row.
occupations = pd.read_csv("../data/raw/u.occupation", header=None, names=["occupation"])
# index=False keeps the DataFrame's positional index out of the CSV.
occupations.to_csv("../data/processed/occupations.csv", index=False)
# No `occupations.head()` here: a notebook cell renders only its final expression,
# so a preview placed mid-cell was evaluated and silently discarded.

# Genres: pipe-separated (genre, genre_id) pairs with no header row.
genres = pd.read_csv(
    "../data/raw/u.genre",
    sep="|",
    header=None,
    names=["genre", "genre_id"],
)
genres.to_csv(path_or_buf="../data/processed/genres.csv", index=False)

# Final expression of the cell, so this preview is what the notebook renders.
genres.head()

Unnamed: 0,genre,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
