## 1. 분류 모형 활용 사례

## 2. Smote + MLP

In [None]:
!pip install pycaret

In [1]:
!unzip caravan.zip

Archive:  caravan.zip
replace caravan.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: caravan.csv             


In [2]:
import pandas as pd
data = pd.read_csv("caravan.csv")
print( data.Purchase.value_counts() )

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X  = data.drop("Purchase", axis=1)
X_scaled = StandardScaler().fit_transform( X )
y = data.Purchase

X_train, X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.2, stratify=y)

0    5474
1     348
Name: Purchase, dtype: int64


In [3]:
from imblearn.over_sampling import SMOTE
X_train_res, y_train_res = SMOTE(sampling_strategy = 0.25).fit_resample( X_train, y_train )

In [4]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier( hidden_layer_sizes=(24,12 ) )
model.fit( X_train_res, y_train_res)



In [5]:
from sklearn.metrics import classification_report
print( classification_report( y_test, model.predict( X_test)) )

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1095
           1       0.11      0.11      0.11        70

    accuracy                           0.89      1165
   macro avg       0.53      0.53      0.53      1165
weighted avg       0.89      0.89      0.89      1165



- pycaret 비교

In [7]:
from pycaret.classification import *

In [None]:
tmp = pd.DataFrame( X_train_res)
tmp["Purchase"] = y_train_res

In [16]:
clf = setup(data=tmp, target='Purchase', train_size=0.8, session_id=10, normalize=True)

Unnamed: 0,Description,Value
0,Session id,10
1,Target,Purchase
2,Target type,Binary
3,Original data shape,"(5473, 86)"
4,Transformed data shape,"(5473, 86)"
5,Transformed train set shape,"(4378, 86)"
6,Transformed test set shape,"(1095, 86)"
7,Numeric features,85
8,Preprocess,True
9,Imputation type,simple


In [17]:
models = compare_models(sort='Recall', n_select=3, fold=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.4159,0.6656,0.9087,0.2549,0.3935,0.1163,0.1842,0.13
xgboost,Extreme Gradient Boosting,0.9326,0.9339,0.7474,0.8983,0.816,0.7751,0.7801,0.72
lightgbm,Light Gradient Boosting Machine,0.9301,0.9414,0.7337,0.8981,0.8074,0.7652,0.7712,1.13
dt,Decision Tree Classifier,0.8748,0.8139,0.7075,0.68,0.6934,0.6148,0.615,0.195
knn,K Neighbors Classifier,0.7997,0.8442,0.6915,0.499,0.5796,0.4526,0.463,0.28
gbc,Gradient Boosting Classifier,0.9324,0.9398,0.6892,0.9619,0.803,0.7635,0.779,0.98
et,Extra Trees Classifier,0.9153,0.9369,0.6709,0.876,0.7597,0.7093,0.7187,0.525
rf,Random Forest Classifier,0.9121,0.939,0.6343,0.8949,0.7413,0.6904,0.7058,1.16
ada,Ada Boost Classifier,0.8968,0.9105,0.6115,0.8286,0.7029,0.6422,0.6535,0.385
svm,SVM - Linear Kernel,0.759,0.0,0.3109,0.3802,0.3417,0.1959,0.1977,0.235


Processing:   0%|          | 0/67 [00:00<?, ?it/s]

In [32]:
tmp_test = pd.DataFrame(X_test)

model1 = tune_model(models[2], optimize="Prec.")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8973,0.9524,0.4943,0.9773,0.6565,0.6036,0.6522
1,0.9132,0.9031,0.5747,0.9804,0.7246,0.6773,0.7113
2,0.8973,0.9183,0.4828,1.0,0.6512,0.5993,0.6541
3,0.8904,0.9209,0.4545,1.0,0.625,0.5711,0.6322
4,0.8881,0.9027,0.4432,1.0,0.6142,0.5599,0.6235
5,0.8858,0.9449,0.4318,1.0,0.6032,0.5485,0.6147
6,0.8973,0.9165,0.4886,1.0,0.6565,0.6043,0.658
7,0.9041,0.9325,0.5227,1.0,0.6866,0.6364,0.6832
8,0.8879,0.9104,0.4483,0.975,0.6142,0.5588,0.6168
9,0.8719,0.9089,0.3563,1.0,0.5254,0.47,0.5542


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [34]:
predicted =  predict_model(model1, tmp_test)
print(classification_report( y_test, predicted.prediction_label))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1095
           1       0.00      0.00      0.00        70

    accuracy                           0.94      1165
   macro avg       0.47      0.50      0.48      1165
weighted avg       0.88      0.94      0.91      1165



## 3. MLP + GridSearch

In [36]:
from sklearn import model_selection
from sklearn import metrics

param_grid = {
    "hidden_layer_sizes":[(24,12),(12,6)]
}
#declaring GridSearchCV model
grid = model_selection.GridSearchCV(
    estimator = MLPClassifier(), #모형
    param_grid = param_grid,  #
    scoring = 'accuracy', #정분류율
    verbose = 10,
    n_jobs = 1,
    cv = 2      #k-fold CV, k값.
)
#fitting values to the gridsearchcv model
grid.fit(X_train_res, y_train_res)

#printing the best possible values to enhance accuracy
print(grid.best_params_)
print(grid.best_estimator_)
#printing the best score
print(grid.best_score_)



Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START hidden_layer_sizes=(24, 12).................................
[CV 1/2; 1/2] END ..hidden_layer_sizes=(24, 12);, score=0.891 total time=  10.8s
[CV 2/2; 1/2] START hidden_layer_sizes=(24, 12).................................
[CV 2/2; 1/2] END ..hidden_layer_sizes=(24, 12);, score=0.918 total time=  14.6s
[CV 1/2; 2/2] START hidden_layer_sizes=(12, 6)..................................
[CV 1/2; 2/2] END ...hidden_layer_sizes=(12, 6);, score=0.850 total time=   6.8s
[CV 2/2; 2/2] START hidden_layer_sizes=(12, 6)..................................
[CV 2/2; 2/2] END ...hidden_layer_sizes=(12, 6);, score=0.865 total time=   4.6s
{'hidden_layer_sizes': (24, 12)}
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(24, 12), learning_rate='constant',
              learning_rate_init=0.001, max_fun=

## 4. Keras Tuner 활용 하이퍼파라미터 튜닝

 - DNN의 하이터파라미터: 층의 수, 노드의 수, 학습율, 배치크기 등

In [37]:
import tensorflow as tf
from tensorflow import keras

In [38]:
!pip install -q -U keras-tuner

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/128.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/128.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [39]:
import keras_tuner as kt

In [40]:
(img_train, label_train), (img_test, label_test) = keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [42]:
img_train.shape

(60000, 28, 28)

In [44]:
# Normalize pixel values between 0 and 1
img_train = img_train.astype('float32').reshape(60000, 784) / 255.0
img_test = img_test.astype('float32').reshape(10000, 784) / 255.0

- 하이퍼튜닝을 위한 모형 구축 시 하이퍼 파라미터 탐색 공간 정의
- 하이퍼 모델: 하이퍼튜닝을 위해 설정하는 모델

- 하이퍼 모형의 설정
 - 모델 빌더 함수 사용: Keras Tuner API의 HyperModel 클래스를 하위 클래스화
 - 컴파일된 모델을 반환하고 인라인으로 정의한 하이퍼파라미터를 사용하여 모델을 하이퍼튜닝

In [45]:
def model_builder(hp):
  model = keras.Sequential()
  model.add(keras.layers.Flatten(input_shape=(784, )))

  # Choose an optimal value between 32-96
  hp_units = hp.Int('units', min_value=32, max_value=96, step=32)
  model.add(keras.layers.Dense(units=hp_units, activation='relu'))
  model.add(keras.layers.Dense(10))

  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

  return model

- tuner 객체 만들고 튜닝 수행
 - randomsearch, hyperband, bayesianoptimization, sklearn 등의 tuner 가능
 - Hyperband 사용 시: objective와 epoch 지정

In [46]:
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

In [48]:
class ClearTrainingOutput(tf.keras.callbacks.Callback): #조기 학습 중단을 위해 사용
  def on_train_end(*args, **kwargs):
    IPython.display.clear_output(wait = True)

In [47]:
#하이퍼 파라미터 탐색 실행
tuner.search(img_train, label_train, epochs=50, validation_split=0.2, callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
optimal number of units:  {best_hps.get('units')}
optimal learning rate for the optimizer:  {best_hps.get('learning_rate')}
""")

Trial 8 Complete [00h 00m 16s]
val_accuracy: 0.09833333641290665

Best val_accuracy So Far: 0.12333333492279053
Total elapsed time: 00h 02m 13s

optimal number of units:  64
optimal learning rate for the optimizer:  0.001



In [50]:
#탐색을 통해 찾은 하이퍼파라미터로 훈련 시 최적의 epoch을 발견
model = tuner.hypermodel.build(best_hps)
history = model.fit(img_train, label_train, epochs=10, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best epoch: 10


In [51]:
#최적 모형 객체 만들기
hypermodel = tuner.hypermodel.build(best_hps)

# 최적 epoch으로 학습
hypermodel.fit(img_train, label_train, epochs=best_epoch, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b6b03759f00>

In [52]:
eval_result = hypermodel.evaluate(img_test, label_test)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [2.294664144515991, 0.2549999952316284]


## 6. 유사도와 추천

In [None]:
import pandas as pd
df = pd.read_csv('movies.csv')

In [None]:
df.head()

Unnamed: 0,userId,title,rating,timestamp
0,1,American Pie,4.0,1260759139
1,4,American Pie,4.0,949896114
2,15,American Pie,4.0,1052896867
3,30,American Pie,2.0,994439964
4,34,American Pie,4.0,973747765


In [None]:
df = df.pivot_table('rating', index = 'userId', columns = 'title')
df.head()

title,10 Things I Hate About You,12 Angry Men,1408,15 Minutes,16 Blocks,"20,000 Leagues Under the Sea",2001: A Space Odyssey,2046,21 Grams,25th Hour,...,Willy Wonka & the Chocolate Factory,World Trade Center,X-Men Origins: Wolverine,Y Tu Mamá También,You Only Live Twice,"You, Me and Dupree",Young Frankenstein,Zodiac,eXistenZ,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,3.0,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,,...,,,5.0,,,,5.0,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
df.shape

(670, 856)

- user들별로 평점을 주는 범위가 다르기 때문에 scaling하여 조정

In [None]:
import numpy as np
df_scaled = df.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

### Collaborative Filtering (협업 필터링) 리뷰
- 추천 시스템: 고객의 선호, 관심, 구매경력과 같은 개인화 정보를 기초로 고객에게 가장 알맞은 구매정보 제공
- 방식
  - Item-based collaborative filtering: item 간 similarity를 기반으로 추천
  - User-based collaborative filtering: user 간 similarity를 기반으로 추천
    - 성능이 다른 방식에 비해 떨어지는 것으로 알려져있음
  - Matrix factorization collaborative filtering: 잠재요인이 있다고 가정하여, 그 잠재요인을 행렬분해를 통해 찾아냄
    - 일반적으로 SVD(singular value decomposition)을 활용

- Input: user-item matrix (preference matrix)
  - 일반적으로 5점 척도로 구성된 rating + cosine similarity
  - binary인 경우 jaccard similarity 사용

- 한계
  - 단순 matrix를 사용하여 추천하기 때문에 context/content를 고려하지 않음
    - 최근에는 item의 text 정보, user의 context 정보들을 반영하여 딥러닝 기반의 추천 시스템 등장

### Item-based collaborative filtering
- Item*user으로 구성된 matrix를 사용하여 유사도 측정

In [None]:
df_scaled = df_scaled.transpose().fillna(0)
df_scaled.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.060526,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15 Minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Blocks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- user들 간 평점이 비슷한 정도를 기반으로 영화들의 유사도 평가

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
movie_sim = cosine_similarity(df_scaled, df_scaled)
print(movie_sim.shape)

(856, 856)


In [None]:
movie_sim

In [None]:
movie_sim = pd.DataFrame(movie_sim, index = df_scaled.index, columns = df_scaled.index)

In [None]:
movie_sim['Snow White and the Seven Dwarfs'].sort_values(ascending=False)[:10]

title
Garfield                           1.000000
The Matrix                         1.000000
Snow White and the Seven Dwarfs    1.000000
Highlander                         1.000000
Alvin and the Chipmunks            0.922721
Prom Night                         0.913674
They Live                          0.558437
Zodiac                             0.546653
The Dark Knight                    0.491283
Shine a Light                      0.463028
Name: Snow White and the Seven Dwarfs, dtype: float64

## 7 .추천 라이브러리

In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163484 sha256=5d5d56e7c17e7cc8172636ef5676a513f38a2f27ee0d9faa70561eca1cb427ea
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
df = pd.read_csv('movies.csv')
df

Unnamed: 0,userId,title,rating,timestamp
0,1,American Pie,4.0,1260759139
1,4,American Pie,4.0,949896114
2,15,American Pie,4.0,1052896867
3,30,American Pie,2.0,994439964
4,34,American Pie,4.0,973747765
...,...,...,...,...
18566,652,Lion of the Desert,5.0,1440269953
18567,652,Mud,5.0,1439586954
18568,659,K-PAX,4.0,836137550
18569,659,Starship Troopers,3.0,834694187


In [None]:
from surprise import Reader, Dataset, KNNBasic, SVD
from surprise.model_selection import train_test_split, cross_validate
import pandas as pd

# Prepare the data to be used in Surprise
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(df[['userId', 'title', 'rating']], reader=reader)


In [None]:
data

<surprise.dataset.DatasetAutoFolds at 0x7dc4f2a7f3d0>

In [None]:
sim_options = {
    'name': 'cosine',
    'user_based': True
}
algo = KNNBasic(sim_options=sim_options)

# Retrieve the trainset.
trainset, testset = train_test_split(data)
algo.fit(trainset)

# Predict
print(algo.predict(1, 5, r_ui=None, verbose=True))

Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 1          item: 5          r_ui = None   est = 3.54   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1          item: 5          r_ui = None   est = 3.54   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


In [None]:
prediction = algo.test(testset)
prediction[:5]

[Prediction(uid=472, iid='Boogie Nights', r_ui=4.0, est=4.201173462840423, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=517, iid='Metropolis', r_ui=3.0, est=2.6268575319505496, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=23, iid='Being John Malkovich', r_ui=4.5, est=3.5271392246906523, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid=523, iid='A Nightmare on Elm Street', r_ui=4.0, est=3.4626506634583705, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=105, iid='20,000 Leagues Under the Sea', r_ui=2.0, est=2.887541215716813, details={'actual_k': 40, 'was_impossible': False})]

- Surprise 지원 알고리즘
 - Random : 랜덤한 추천
 - Baseline : ALS(Alternating Least Square), SGD(Stochastic Gradient Descent)
 - Matrix factorization: SVD, SVD++
 - KNNs
 - Slope one, Co clustering

 - https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

In [None]:
from surprise import BaselineOnly
from surprise import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import SVD
from surprise import SVDpp

from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt


In [None]:
trainset, testset = train_test_split(data, test_size = 0.25)

In [None]:
recom = KNNBasic()  #CF
recom.fit(trainset)
predictions = recom.test(testset)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9505


0.9505290217572812

In [None]:
recom = SVD()  #matrix 분해 (SVD) 이용
recom.fit(trainset)
predictions = recom.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9034


0.9034162368979297

- ml-100k에 대해서 여러 추천 알고리즘을 수행하고 비교

In [None]:
# MovieLens 100K
data = Dataset.load_builtin(name=u'ml-100k', prompt = False)
trainset, testset = train_test_split(data, test_size = 0.25)

Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [None]:
algorithms = [KNNBasic, SVD]

algos=[]
rmses=[]

for i in algorithms:
  algo = i()
  algos.append(i.__name__)
  algo.fit(trainset)
  predictions = algo.test(testset)
  rmses.append(accuracy.rmse(predictions))

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9837
RMSE: 0.9371
