In [1]:
# Load the autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
import warnings 
# Install an 'ignore' filter so Python warnings are not shown in cell outputs.
warnings.simplefilter(action='ignore')
# Print the version of the `python` executable found on PATH.
!python --version

Python 3.6.9 :: Anaconda, Inc.


In [2]:
%%writefile conda_dependencies.yml
# Conda specification loaded later in this notebook to build the Azure ML environment.
dependencies:
    - python=3.6.9
    - numpy=1.18.*
    - pandas=0.25.*
    # pip is listed explicitly because the nested pip section below depends on it.
    - pip
    - pip:
        - azureml-defaults==1.34.0
        - confuse==1.6.0
        - icecream==2.1.1
        - plotly==5.3.1
        - scikit-surprise==1.1.1
        - kaggle==1.5.12

Overwriting conda_dependencies.yml


In [3]:
%%writefile requirements.txt
# Pinned packages for the notebook kernel; installed from this file in a later cell.
confuse==1.6.0
icecream==2.1.1
plotly==5.3.1
# NOTE(review): kaleido is presumably needed for plotly's static image export — confirm.
kaleido==0.2.1
scikit-surprise==1.1.1
kaggle==1.5.12

Overwriting requirements.txt


### Leverage Kaggle API for Subreddit Dataset

In [2]:
!pip3 install -r requirements.txt



In [2]:
import confuse

# Project settings are layered from a YAML file given by relative path.
config = confuse.Configuration('redditrec', modname=__name__)
config.set_file('config-subreddit.yaml')

# Name of the folder used for the downloaded dataset.
folder_paths = config['folder_paths']
data_folder_name = folder_paths['data_folder_name'].get()

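Before authenticating, place the Kaggle API credentials where the client expects them:

1. Create the configuration directory: `mkdir ~/.kaggle`
2. Upload `kaggle.json` and move it into place: `mv <source> ~/.kaggle/kaggle.json`
3. Restrict access to the file: `chmod 600 ~/.kaggle/kaggle.json`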

In [4]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client and authenticate it; the markdown steps above
# describe placing the credentials in ~/.kaggle/kaggle.json.
k_api = KaggleApi()
k_api.authenticate()

In [5]:
from icecream import ic
import os

# Create the data folder if it is missing. exist_ok=True replaces the separate
# existence check (no check-then-create race) and missing parents are created too.
os.makedirs(data_folder_name, exist_ok=True)
# Switch into the data folder; the following cells work with paths relative to it.
os.chdir(data_folder_name)

In [21]:
# Kaggle dataset reference in "<owner>/<dataset>" form. It is hard-coded; the
# commented-out line is an alternative that takes the first search hit instead.
# reference = k_api.datasets_list(search='Subreddit Recommender')[0]['ref']
reference = 'timschaum/subreddit-recommender'
ic(reference);
# Download the dataset files. NOTE(review): no target path is passed, so the
# Kaggle client picks the directory — the next cell assumes it is the cwd; confirm.
k_api.dataset_download_files(reference)

ic| reference: 'timschaum/subreddit-recommender'


In [22]:
from zipfile import ZipFile

# The archive is looked up in the current directory under "<dataset>.zip",
# where <dataset> is the part of the reference after the slash.
zf_name = reference.split('/')[1] + '.zip'
# The context manager closes the archive even when extraction raises.
with ZipFile(zf_name) as zf:
    zf.extractall()

In [6]:
# Move up to the parent of the current working directory.
# NOTE(review): this is relative to wherever the kernel currently is, so every
# re-run of this cell climbs one directory further.
original_dir = os.path.dirname(os.getcwd())
os.chdir(original_dir)

In [3]:
import pandas as pd
from IPython.display import display
from icecream import ic

# Read the per-user subreddit counts from the data folder.
csv_path = os.path.join(data_folder_name, 'reddit_user_data_count.csv')
subreddit_df = pd.read_csv(csv_path)

# Compute limitation: keep only rows of the first 100 distinct users
# (in the order they appear in the file).
kept_users = subreddit_df['user'].unique()[:100]
is_kept_user = subreddit_df['user'].isin(kept_users)
subreddit_df = subreddit_df[is_kept_user]

display(subreddit_df.head())
ic(subreddit_df.shape);

Unnamed: 0,user,subreddit,count
0,------Username------,AskReddit,20
1,------Username------,Barca,9
2,------Username------,FIFA,4
3,------Username------,MMA,5
4,------Username------,RioGrandeValley,3


ic| subreddit_df.shape: (4201, 3)


In [6]:
import plotly.graph_objects as go

plot_output_folder = config['folder_paths']['plot_output_folder_name'].get()
# exist_ok=True replaces the separate existence check and also creates missing parents.
os.makedirs(plot_output_folder, exist_ok=True)

group_col, count_col, top_n = 'subreddit', 'count', 5
# Total count per subreddit, largest first, truncated to the top_n entries.
hist_df = (
    subreddit_df.groupby(by=[group_col])[count_col]
    .sum()
    .sort_values(ascending=False)
    .head(top_n)
)
hist_trace = go.Bar(
    x=hist_df.index,
    y=hist_df.values,
    # Bar labels show the summed count with one decimal place.
    text=[f'{val:.1f}' for val in hist_df.values],
    textposition='auto',
    textfont=dict(color='blue'),
)
hist_layout = dict(
    title=f'Distribution of Top {top_n} {group_col}',
    xaxis=dict(title=f'{group_col}'),
    # Axis label follows the aggregated column rather than repeating a literal.
    yaxis=dict(title=count_col),
)
fig = go.Figure(data=[hist_trace], layout=hist_layout)
# Save the figure as a PNG in the plot folder.
# NOTE(review): static export presumably relies on the kaleido package pinned in requirements.txt — confirm.
fig.write_image(os.path.join(plot_output_folder, f"top_{top_n}_eda.png"))

### Azure ML Workspace and Experiment

In [16]:
import azureml.core
import numpy as np
import json
import logging
from azureml.core import Workspace, Dataset, Experiment, Environment, Run

# Experiment and environment names are taken from the project configuration.
experiment_name = config['azureml']['experiment_name'].get()
env_name = config['azureml']['environment_name'].get()

# Connect to the workspace and open the experiment under the configured name.
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=experiment_name)

# Build the run environment from the conda specification file in the working directory.
subreddit_env = Environment.from_conda_specification(
    file_path='./conda_dependencies.yml',
    name=env_name,
)

### Register dataset

In [5]:
# Register the in-memory frame as a tabular dataset, stored on the workspace's
# default datastore under the configured dataset name.
dataset_name = config['azureml']['dataset_name'].get()
datastore = ws.get_default_datastore()
dataset = Dataset.Tabular.register_pandas_dataframe(
    dataframe=subreddit_df,
    target=datastore,
    name=dataset_name,
    show_progress=True,
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/d76ea2f3-1e88-4769-9788-cd485240bb43/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


### Test registered dataset

In [6]:
# Round-trip check: fetch the dataset back by name and replace the in-memory
# frame with the registered copy before showing its first rows.
registered_dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
subreddit_df = registered_dataset.to_pandas_dataframe()
display(subreddit_df.head())

Unnamed: 0,user,subreddit,count
0,------Username------,AskReddit,20
1,------Username------,Barca,9
2,------Username------,FIFA,4
3,------Username------,MMA,5
4,------Username------,RioGrandeValley,3


### Fit models leveraging **scikit-surprise**

In [5]:
import surprise
from surprise import (
    # collaborative filtering models
    NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
    KNNWithZScore, KNNBaseline, SVD, SlopeOne, CoClustering
)
from surprise.accuracy import (
    rmse, mae, mse, fcp
)
from surprise.model_selection import train_test_split

# Seed, verbosity and hold-out fraction come from the project configuration.
random_state = config['surprise']['random_state'].get()
verbose_bool = config['surprise']['verbose_bool'].get()
test_size = config['surprise']['test_size'].get()

# The interaction counts are used as ratings, so the rating scale spans the observed range.
max_count, min_count = subreddit_df['count'].max(), subreddit_df['count'].min()
reader = surprise.Reader(rating_scale=(min_count,max_count))
# load_from_df reads its columns by position as (user, item, rating). Selecting
# them explicitly keeps that mapping correct if the frame gains or reorders columns.
full_data = surprise.Dataset.load_from_df(subreddit_df[['user', 'subreddit', 'count']],reader)
train_data, test_data = train_test_split(full_data,test_size=test_size,random_state=random_state)

# Candidate algorithms, all benchmarked on the same train/test split.
model_list = [
    NormalPredictor(),
    KNNBasic(k=40, min_k=1, verbose=verbose_bool),
    KNNWithMeans(k=40, min_k=1, verbose=verbose_bool),
    KNNWithZScore(k=40, min_k=1, verbose=verbose_bool),
    KNNBaseline(k=40, min_k=1, verbose=verbose_bool),
    SVD(n_factors=100, n_epochs=20, biased=True, init_mean=0,
    init_std_dev=.1, lr_all=.005,reg_all=.02, lr_bu=None, lr_bi=None, 
    lr_pu=None, lr_qi=None,reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None,
    random_state=random_state, verbose=verbose_bool),
    SlopeOne(),
    CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=random_state,
    verbose=verbose_bool)
]

# Fit each algorithm on the training split and score its test-split predictions.
benchmark = []
for model in model_list:
    model.fit(train_data)
    y_pred = model.test(test_data)
    model_dict = {
        # Class name taken from the type itself instead of parsing the repr string.
        'model': type(model).__name__,
        'rmse': rmse(y_pred,verbose=verbose_bool),
        'mse': mse(y_pred,verbose=verbose_bool),
        'mae': mae(y_pred,verbose=verbose_bool),
        'fcp': fcp(y_pred,verbose=verbose_bool)
    }
    benchmark.append(model_dict)
benchmark_df = pd.DataFrame(benchmark)
display(benchmark_df)

Unnamed: 0,model,rmse,mse,mae,fcp
0,NormalPredictor,63.453432,4026.338093,31.997717,0.444297
1,KNNBasic,63.050422,3975.35576,17.601135,0.441253
2,KNNWithMeans,60.240092,3628.868744,18.342302,0.398839
3,KNNWithZScore,56.452901,3186.930065,17.430292,0.417661
4,KNNBaseline,61.730841,3810.6967,17.32874,0.375936
5,SVD,950.393987,903248.731087,945.669678,0.011393
6,SlopeOne,61.845622,3824.880995,17.950825,0.402733
7,CoClustering,68.400337,4678.606141,19.032222,0.370516


### KNNWithZScore performs best on the test data, based on:

* rmse
* mse

In [6]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Cluster name and sizing come from the project configuration.
amlcompute_cluster_name = config['azureml']['compute_cluster']['name'].get()
provisioning_config = AmlCompute.provisioning_configuration(
    vm_size=config['azureml']['compute_cluster']['vm_size'].get(),
    max_nodes=config['azureml']['compute_cluster']['max_nodes'].get()
)
try:
    # Reuse the cluster when one with this name is already in the workspace,
    # so the cell can be re-run without attempting to create it again.
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    # No such cluster yet: provision it with the configured size.
    compute_target = ComputeTarget.create(ws,amlcompute_cluster_name,provisioning_config)
# Block until provisioning has finished (at most 20 minutes), printing progress.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20
)

SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [17]:
from azureml.core import ScriptRunConfig

# Command-line arguments handed to the training script.
# NOTE(review): verbose_bool is passed through as-is; how the script parses it
# is not visible from this notebook — confirm.
train_args = ['--k', 40, '--min_k', 1, '--verbose', verbose_bool]

src = ScriptRunConfig(
    source_directory='.',
    script='./train_exp/train_rec.py',
    arguments=train_args,
    compute_target=compute_target,
    environment=subreddit_env,
)

# Submit the run to the experiment and wait for it to finish, streaming its output.
exp_run = experiment.submit(config=src)
exp_run.wait_for_completion(show_output=True)

RunId: redditrecsys_1640040863_7001b78d
Web View: https://ml.azure.com/runs/redditrecsys_1640040863_7001b78d?wsid=/subscriptions/65881521-f775-4359-a9d9-a122ba465711/resourcegroups/redditrecrg/workspaces/redditrecws&tid=f26668e5-2944-4a48-abc0-4060b08570e2

Execution Summary
RunId: redditrecsys_1640040863_7001b78d
Web View: https://ml.azure.com/runs/redditrecsys_1640040863_7001b78d?wsid=/subscriptions/65881521-f775-4359-a9d9-a122ba465711/resourcegroups/redditrecrg/workspaces/redditrecws&tid=f26668e5-2944-4a48-abc0-4060b08570e2

This run might be using a new job runtime with improved performance and error reporting. The logs from your script are in user_logs/std_log.txt. Please let us know if you run into any issues, and if you would like to opt-out, please add the environment variable AZUREML_COMPUTE_USE_COMMON_RUNTIME to the environment variables section of the job and set its value to the string "false"




{'runId': 'redditrecsys_1640040863_7001b78d',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-12-20T22:54:41.316764Z',
 'endTimeUtc': '2021-12-20T22:54:58.509775Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '78ac5550-c3ef-4890-bc12-c25d2377d9ab',
  'azureml.git.repository_uri': 'git@github.com:bcheng004/reddit-recommender.git',
  'mlflow.source.git.repoURL': 'git@github.com:bcheng004/reddit-recommender.git',
  'azureml.git.branch': 'main',
  'mlflow.source.git.branch': 'main',
  'azureml.git.commit': 'da6ea7403a08f0b64ae0522fcf2246da7c2e0755',
  'mlflow.source.git.commit': 'da6ea7403a08f0b64ae0522fcf2246da7c2e0755',
  'azureml.git.dirty': 'True',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '0d50a630-e809-4c92-83be-65d6cac08cca'}, 'consumptionDetails': {'type': 'Reference'}}],
 'outputDatasets': []

In [7]:
# NOTE(review): the run id is pinned to one specific past run, so this cell keeps
# pointing at that run even after a new run has been submitted.
exp_run = Run(experiment,run_id='redditrecsys_1640040863_7001b78d')
model_output_folder = config['azureml']['model_output_folder'].get()
# First file of the run whose path contains the configured output folder name.
model_path = next(
    (item for item in exp_run.get_file_names() if model_output_folder in item),
    None
)
if model_path is None:
    # Fail with an actionable message instead of a bare IndexError on an empty match list.
    raise FileNotFoundError(
        f"No file containing '{model_output_folder}' was found among the files of run {exp_run.id}"
    )
model_name = config['azureml']['model_name'].get()
# Register that file as a model under the configured name.
model = exp_run.register_model(model_name=model_name,model_path=model_path)

In [8]:
from azureml.core.model import InferenceConfig, Model

# Look the registered model up in the workspace by name and download it into
# the current working directory (exist_ok=True is passed to the download call).
model_check = Model(ws,model_name)
model_check.download(target_dir=os.getcwd(),exist_ok=True)

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/redditreccompute/code/Users/bcheng004/reddit-recommender/azureml/model.joblib'

In [9]:
from azureml.core.webservice import AciWebservice, Webservice

# Inference setup: the configured entry script plus the environment of the training run.
script_file_name = config['azureml']['inference_script_path'].get()
inference_config = InferenceConfig(
    entry_script=script_file_name,
    environment=exp_run.get_environment(),
)

# Container sizing and endpoint name come from the 'aciservice' config section.
aci_settings = config['azureml']['aciservice']
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=aci_settings['cpu_cores'].get(),
    memory_gb=aci_settings['memory_gb'].get(),
)
aci_service_name = aci_settings['endpoint_name'].get()

# Deploy the registered model and wait for the deployment, printing progress.
aci_service = Model.deploy(
    workspace=ws,
    name=aci_service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
aci_service.wait_for_deployment(show_output=True)
print(aci_service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-12-23 01:09:24+00:00 Creating Container Registry if not exists.
2021-12-23 01:09:24+00:00 Registering the environment.
2021-12-23 01:09:27+00:00 Use the existing image.
2021-12-23 01:09:27+00:00 Generating deployment configuration.
2021-12-23 01:09:28+00:00 Submitting deployment to compute.
2021-12-23 01:09:32+00:00 Checking the status of deployment subreddit-api..
2021-12-23 01:12:41+00:00 Checking the status of inference endpoint subreddit-api.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [15]:
# Pick the user that appears in the most rows of the frame.
top_user_count = subreddit_df['user'].value_counts().sort_values(ascending=False).iloc[:1]
top_user = top_user_count.index[0]

# Candidates to score: every subreddit in the data that has no row for that user.
item_user_x = subreddit_df.loc[subreddit_df['user'] == top_user, 'subreddit']
unique_items = subreddit_df['subreddit'].unique()
items_to_pred = np.setdiff1d(unique_items, item_user_x)

In [17]:
# Handle to the deployed web service, looked up by its configured name.
endpoint_service = AciWebservice(ws,name=aci_service_name)
recom_list = []
for iid in items_to_pred:
    # One (user, subreddit) record per request. The payload is built directly,
    # which gives the same JSON as serialising a one-row DataFrame to records
    # without constructing a DataFrame on every iteration.
    x_pred = json.dumps({'data': [{'user': top_user, 'subreddit': iid}]})
    pred_response = endpoint_service.run(input_data=x_pred)
    # The service response is parsed as JSON text and collected.
    recom_list.append(json.loads(pred_response))

In [18]:
# Take the 'prediction' entry of every response and tabulate them.
cleaned_recom_list = [response['prediction'] for response in recom_list]
pred_recom_df = pd.DataFrame(cleaned_recom_list)

# Ten rows with the highest 'est' value.
top_10_recom = pred_recom_df.sort_values(by='est', ascending=False).head(10)
display(top_10_recom)

Unnamed: 0,uid,iid,est,details
1249,-j4ckK-,brighton,40.720899,"{'actual_k': 2, 'was_impossible': False}"
226,-j4ckK-,CallOfDutyMobile,39.978305,"{'actual_k': 1, 'was_impossible': False}"
1430,-j4ckK-,fordranger,39.75249,"{'actual_k': 1, 'was_impossible': False}"
1231,-j4ckK-,boardgames,38.46887,"{'actual_k': 3, 'was_impossible': False}"
889,-j4ckK-,Rochester,32.935398,"{'actual_k': 1, 'was_impossible': False}"
67,-j4ckK-,AnarchoGaming,32.406958,"{'actual_k': 1, 'was_impossible': False}"
345,-j4ckK-,DenverBroncos,30.704654,"{'actual_k': 1, 'was_impossible': False}"
1880,-j4ckK-,soccer,27.316266,"{'actual_k': 4, 'was_impossible': False}"
734,-j4ckK-,NBA2k,26.645153,"{'actual_k': 1, 'was_impossible': False}"
914,-j4ckK-,SchoolIdolFestival,26.527418,"{'actual_k': 1, 'was_impossible': False}"
