### Install missing packages to kernel...


### Import packages...

# CSDS-435: Project #3
# Adam Brennan and Madi Edge

## Helper Functions...

## Random

In [9]:
from surprise import NormalPredictor 
from surprise import Dataset 
from surprise.model_selection import cross_validate
import numpy as np
import pandas as pd

# Load Surprise's built-in MovieLens 100K ratings dataset.
# `data` is read as a module-level global by analyze_rand_hyperparameters() below.
# NOTE(review): the KNN cell later rebinds `data` to a pandas DataFrame, so this cell
# must be re-run before calling analyze_rand_hyperparameters() again.
data = Dataset.load_builtin('ml-100k')

# Hyperparameter analysis function 
def analyze_rand_hyperparameters(seed_values=None, dataset=None, cv=5):
    """Cross-validate NormalPredictor under several NumPy seeds and report the RMSE per seed.

    NormalPredictor has no tunable hyperparameters, so the seed is the only knob
    varied here. The spread of 'Mean RMSE' across seeds is best read as the
    run-to-run noise of the random baseline; the "best" seed is not a setting
    that generalizes.

    Parameters
    ----------
    seed_values : iterable of int, optional
        Seeds to try. Defaults to [1, 2, 42, 102, 163, 251, 337, 1024].
    dataset : surprise Dataset, optional
        Dataset to evaluate on. Defaults to the module-level `data`.
    cv : optional
        Forwarded unchanged to `cross_validate(cv=...)`. Defaults to 5 folds.

    Returns
    -------
    pandas.DataFrame
        One row per seed with columns 'seed' and 'Mean RMSE'.

    Side effects
    ------------
    Reseeds NumPy's global RNG once per seed (it is left seeded with the last
    value), prints the per-fold tables from `cross_validate(verbose=True)`, and
    prints the seed with the lowest mean RMSE.
    """
    if seed_values is None:
        seed_values = [1, 2, 42, 102, 163, 251, 337, 1024]
    if dataset is None:
        # Fall back to the notebook-level dataset loaded alongside this function.
        dataset = data

    results = []
    for seed in seed_values:
        # No random_state is passed to Surprise, so the global NumPy seed is what
        # makes each run repeatable.
        # NOTE(review): per the Surprise docs this governs both the fold shuffle and
        # NormalPredictor's sampling - confirm for the installed version.
        np.random.seed(seed)

        predictor = NormalPredictor()
        cv_results = cross_validate(
            predictor, dataset, measures=['RMSE', 'MAE'], cv=cv, verbose=True
        )

        results.append({
            'seed': seed,
            'Mean RMSE': np.mean(cv_results['test_rmse'])
        })

    # Explicit columns keep the frame well-formed even when no seeds were given.
    results_df = pd.DataFrame(results, columns=['seed', 'Mean RMSE'])
    if results_df.empty:
        return results_df

    # Select the winner column-wise. Pulling a whole row with .iloc[0] would upcast
    # the integer seed to float (printing e.g. "42.0") because the row mixes int
    # and float values.
    best_seed = results_df.loc[results_df['Mean RMSE'].idxmin(), 'seed']
    print(f'Best seed: {best_seed}')

    return results_df

# Run the seed sweep for the random baseline; the returned DataFrame holds one row
# per seed with its mean cross-validated RMSE.
analysis_results = analyze_rand_hyperparameters() 

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5114  1.5198  1.5269  1.5113  1.5249  1.5189  0.0065  
MAE (testset)     1.2144  1.2199  1.2237  1.2151  1.2270  1.2200  0.0049  
Fit time          0.03    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.03    0.02    0.02    0.02    0.08    0.04    0.02    
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5199  1.5182  1.5132  1.5242  1.5018  1.5155  0.0077  
MAE (testset)     1.2216  1.2161  1.2119  1.2254  1.2071  1.2164  0.0066  
Fit time          0.03    0.04    0.03    0.03    0.03    0.03    0.00    
Test time         0.02    0.02    0.02    0.09    0.03    0.04    0.02    
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fo

## KNN

In [3]:
from surprise import KNNBasic
from surprise import Dataset 
from surprise import Reader
from surprise.model_selection import cross_validate
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the MovieLens tag assignments; tags.csv must provide 'movieId' and 'tag' columns.
# NOTE(review): this rebinds the global `data` (a Surprise Dataset in the Random
# section) to a pandas DataFrame. Also, `tag_features` is not referenced by
# analyze_knn_hyperparameters() below - confirm a later section consumes it.
data = pd.read_csv('tags.csv')

# Build one space-separated "document" of tags per movie.
# Rows with a missing tag are dropped first: NaN is a float, and ' '.join would
# raise TypeError on it.
movie_tags = (
    data.dropna(subset=['tag'])
    .groupby('movieId')['tag']
    .apply(' '.join)
    .reset_index()
)

# Bag-of-words encoding of each movie's tag document (English stop words removed).
vectorizer = CountVectorizer(stop_words='english')
tag_features = vectorizer.fit_transform(movie_tags['tag'].values)

# Hyperparameter analysis function 
def analyze_knn_hyperparameters(k_values=None, ratings_path='ratings.csv', cv=5):
    """Cross-validate KNNBasic for several neighbourhood sizes and report the RMSE per k.

    Parameters
    ----------
    k_values : iterable of int, optional
        Values of `k` to try. Defaults to [1, 5, 10, 20, 50, 100].
    ratings_path : str, optional
        CSV file with 'userId', 'movieId' and 'rating' columns.
        Defaults to 'ratings.csv'.
    cv : optional
        Forwarded unchanged to `cross_validate(cv=...)`. Defaults to 5 folds.

    Returns
    -------
    pandas.DataFrame
        One row per k with columns 'k' and 'Mean RMSE'.

    Side effects
    ------------
    Reads `ratings_path`, prints the per-fold tables from
    `cross_validate(verbose=True)`, and prints the k with the lowest mean RMSE.
    No random seed is set here, so fold assignment (and therefore the scores)
    can differ between runs and between k values.
    """
    if k_values is None:
        k_values = [1, 5, 10, 20, 50, 100]

    ratings = pd.read_csv(ratings_path)

    # Take the rating scale from the data rather than hard-coding (1, 5). Surprise
    # clips predictions to this range, so a scale narrower than the real ratings
    # (e.g. half-star data that starts at 0.5) would bias the error metrics.
    rating_scale = (ratings['rating'].min(), ratings['rating'].max())
    reader = Reader(rating_scale=rating_scale)

    # Named distinctly so it cannot be confused with the notebook-level `data`.
    ratings_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

    results = []
    for k in k_values:
        algo = KNNBasic(k=k)
        cv_results = cross_validate(
            algo, ratings_data, measures=['RMSE', 'MAE'], cv=cv, verbose=True
        )

        results.append({
            'k': k,
            'Mean RMSE': np.mean(cv_results['test_rmse'])
        })

    # Explicit columns keep the frame well-formed even when no k values were given.
    results_df = pd.DataFrame(results, columns=['k', 'Mean RMSE'])
    if results_df.empty:
        return results_df

    # Select the winner column-wise. Pulling a whole row with .iloc[0] would upcast
    # the integer k to float (printing e.g. "5.0") because the row mixes int and
    # float values.
    best_k = results_df.loc[results_df['Mean RMSE'].idxmin(), 'k']
    print(f'Best k: {best_k}')

    return results_df

# Run the k sweep for KNNBasic; the returned DataFrame holds one row per k with its
# mean cross-validated RMSE.
# NOTE(review): this overwrites the `analysis_results` produced in the Random section.
analysis_results = analyze_knn_hyperparameters()    



Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1865  1.1680  1.1572  1.1570  1.1605  1.1658  0.0111  
MAE (testset)     0.8845  0.8699  0.8639  0.8650  0.8677  0.8702  0.0074  
Fit time          0.03    0.04    0.04    0.04    0.04    0.04    0.00    
Test time         0.26    0.21    0.23    0.19    0.19    0.21    0.03    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing s

## SVD

## Matrix Factorization W/ Regularization

## Deep Neural Network

## Compare Algorithms 