In [1]:
%load_ext autoreload
%autoreload 2
import warnings 
warnings.simplefilter(action='ignore')
!python --version

Python 3.6.9 :: Anaconda, Inc.


In [2]:
%%writefile conda_dependencies.yml
dependencies:
    - python=3.6.9
    - numpy=1.18.*
    - pandas=0.25.*
    - pip:
        - azureml-defaults==1.34.0
        - confuse==1.6.0
        - icecream==2.1.1
        - plotly==5.3.1
        - scikit-surprise==1.1.1
        - kaggle==1.5.12

Overwriting conda_dependencies.yml


In [3]:
%%writefile requirements.txt
confuse==1.6.0
icecream==2.1.1
plotly==5.3.1
kaleido==0.2.1
scikit-surprise==1.1.1
kaggle==1.5.12

Overwriting requirements.txt


### Leverage Kaggle API for Subreddit Dataset

In [2]:
!pip3 install -r requirements.txt



In [2]:
import confuse

config = confuse.Configuration('redditrec',__name__)
config.set_file('config-subreddit.yaml')

1. ```mkdir ~/.kaggle```
2. upload ```kaggle.json``` and ```mv <source> ~/.kaggle/kaggle.json```
3. ```chmod 600 ~/.kaggle/kaggle.json```

In [4]:
from kaggle.api.kaggle_api_extended import KaggleApi

k_api = KaggleApi()
k_api.authenticate()

In [5]:
from icecream import ic
import os

data_folder_name = config['folder_paths']['data_folder_name'].get()
if not (os.path.exists(data_folder_name)):
    os.mkdir(data_folder_name)
os.chdir(data_folder_name)

In [21]:
# reference = k_api.datasets_list(search='Subreddit Recommender')[0]['ref']
reference = 'timschaum/subreddit-recommender'
ic(reference);
k_api.dataset_download_files(reference)

ic| reference: 'timschaum/subreddit-recommender'


In [22]:
from zipfile import ZipFile

zf_name = reference.split('/')[1] + '.zip'
zf = ZipFile(zf_name)
zf.extractall()
zf.close()

In [6]:
original_dir = os.path.dirname(os.getcwd())
os.chdir(original_dir)

In [7]:
import pandas as pd
from IPython.display import display
from icecream import ic

subreddit_df = pd.read_csv(os.path.join(data_folder_name,'reddit_user_data_count.csv'),nrows=int(5e4))
# limiting nrows to 50000 for compute limitations
display(subreddit_df.head())
ic(subreddit_df.shape);

Unnamed: 0,user,subreddit,count
0,------Username------,AskReddit,20
1,------Username------,Barca,9
2,------Username------,FIFA,4
3,------Username------,MMA,5
4,------Username------,RioGrandeValley,3


ic| subreddit_df.shape: (50000, 3)


In [8]:
import plotly.graph_objects as go

plot_output_folder = config['folder_paths']['plot_output_folder_name'].get()
if not os.path.exists(plot_output_folder):
    os.mkdir(plot_output_folder)
group_col, count_col, top_n = 'subreddit', 'count', 5
hist_df = subreddit_df.groupby(by=[group_col])[count_col].sum()\
.sort_values(ascending=False)[0:top_n]
hist_trace = go.Bar(
    x = hist_df.index,
    text = [f'{val:.1f}' for val in hist_df.values],
    textposition = 'auto',
    textfont = dict(color='blue'),
    y = hist_df.values
)
hist_layout = dict(
    title = f'Distribution of Top {top_n} {group_col}',
    xaxis = dict(title=f'{group_col}'),
    yaxis = dict(title='count')
)
fig = go.Figure(data=[hist_trace],layout=hist_layout)
fig.write_image(os.path.join(plot_output_folder,f"top_{top_n}_eda.png"))

### Azureml Workspace and Experiment

In [9]:
import azureml.core
import numpy as np
import logging
from azureml.core import Workspace, Dataset, Experiment, Environment, Run

ws = Workspace.from_config()
experiment_name = config['azureml']['experiment_name'].get()
experiment = Experiment(ws,name=experiment_name)
env_name = config['azureml']['environment_name'].get()
subreddit_env = Environment.from_conda_specification(name=env_name,file_path='./conda_dependencies.yml')

### Register dataset

In [10]:
datastore = ws.get_default_datastore()
dataset_name = config['azureml']['dataset_name'].get()
dataset = Dataset.Tabular.register_pandas_dataframe(
    subreddit_df,
    datastore,
    dataset_name,
    show_progress=True
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/fe001378-4c3b-4426-b466-78b69fb8fa4a/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


### Test registered dataset

In [11]:
subreddit_df = Dataset.get_by_name(ws,name=dataset_name).to_pandas_dataframe()
display(subreddit_df.head())

Unnamed: 0,user,subreddit,count
0,------Username------,AskReddit,20
1,------Username------,Barca,9
2,------Username------,FIFA,4
3,------Username------,MMA,5
4,------Username------,RioGrandeValley,3


### Fit models leveraging **scikit-surprise**

In [13]:
from surprise import (
    # collaborative filtering models
    NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
    KNNWithZScore, KNNBaseline, SVD, SVDpp, SlopeOne, CoClustering,
    # utility functions
    Reader, Dataset
)
from surprise.accuracy import (
    rmse, mae, mse, fcp
)
from surprise.model_selection import train_test_split

random_state = config['surprise']['random_state'].get()
verbose_bool = config['surprise']['verbose_bool'].get()
test_size = config['surprise']['test_size'].get()
max_count, min_count = max(subreddit_df['count']), min(subreddit_df['count'])
reader = Reader(rating_scale=(min_count,max_count))
full_data = Dataset.load_from_df(subreddit_df,reader)
train_data, test_data = train_test_split(full_data,test_size=test_size)
model_list = [
    NormalPredictor(),
    BaselineOnly(verbose=verbose_bool),
    KNNBasic(k=40, min_k=1, verbose=verbose_bool),
    KNNWithMeans(k=40, min_k=1, verbose=verbose_bool),
    KNNWithZScore(k=40, min_k=1, verbose=verbose_bool),
    KNNBaseline(k=40, min_k=1, verbose=verbose_bool),
    SVD(n_factors=100, n_epochs=20, biased=True, init_mean=0,
    init_std_dev=.1, lr_all=.005,reg_all=.02, lr_bu=None, lr_bi=None, 
    lr_pu=None, lr_qi=None,reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None,
    random_state=random_state, verbose=verbose_bool),
    SVDpp(n_factors=20, n_epochs=20, init_mean=0, init_std_dev=.1,
    lr_all=.007, reg_all=.02, lr_bu=None, lr_bi=None, lr_pu=None,
    lr_qi=None, lr_yj=None, reg_bu=None, reg_bi=None, reg_pu=None,
    reg_qi=None, reg_yj=None, random_state=None, verbose=verbose_bool),
    SlopeOne(),
    CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=random_state,
    verbose=verbose_bool)
]
benchmark = []
for model in model_list:
    model.fit(train_data)
    y_pred = model.test(test_data)
    model_dict = {
        'model': str(model).split(' ')[0].split('.')[-1],
        'rmse': rmse(y_pred,verbose=verbose_bool),
        'mse': mse(y_pred,verbose=verbose_bool),
        'mae': mae(y_pred,verbose=verbose_bool),
        'fcp': fcp(y_pred,verbose=verbose_bool)
    }
    benchmark.append(model_dict)
benchmark_df = pd.DataFrame(benchmark)
display(benchmark_df)

Unnamed: 0,model,rmse,mse,mae,fcp
0,NormalPredictor,59.838843,3580.687149,29.929177,0.433464
1,BaselineOnly,47.308215,2238.067234,14.010158,0.49855
2,KNNBasic,54.200222,2937.664116,14.883123,0.522803
3,KNNWithMeans,54.604941,2981.699585,16.603188,0.427097
4,KNNWithZScore,57.31238,3284.708916,16.433479,0.487474
5,KNNBaseline,53.918238,2907.176366,15.3899,0.433184
6,SVD,989.90343,979908.801,988.690172,0.0
7,SVDpp,989.90343,979908.801,988.690172,0.0
8,SlopeOne,50.222751,2522.324678,15.524001,0.491153
9,CoClustering,64.915045,4213.963032,17.734558,0.427848
