In [1]:
# Enable autoreload (mode 2) so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2
import warnings 
# Install an 'ignore' filter: warnings are suppressed for the rest of the session.
warnings.simplefilter(action='ignore')
# Shell escape: print the version of the `python` executable.
!python --version

Python 3.6.9 :: Anaconda, Inc.


In [2]:
%%writefile conda_dependencies.yml
# Conda specification written to disk by this cell; the Azure ML cell of this
# notebook passes the file to Environment.from_conda_specification.
dependencies:
    - python=3.6.9
    - numpy=1.18.*
    - pandas=0.25.*
    # Entries under `pip:` are pinned to exact versions.
    - pip:
        - azureml-defaults==1.34.0
        - confuse==1.6.0
        - icecream==2.1.1
        - plotly==5.3.1
        - scikit-surprise==1.1.1
        - kaggle==1.5.12

Writing conda_dependencies.yml


In [3]:
%%writefile requirements.txt
# Exact-version pins for the packages this notebook installs with pip
# (consumed by the `install -r requirements.txt` cell).
confuse==1.6.0
icecream==2.1.1
plotly==5.3.1
kaleido==0.2.1
scikit-surprise==1.1.1
kaggle==1.5.12

Writing requirements.txt


### Leverage Kaggle API for Subreddit Dataset

In [5]:
!pip3 install -r requirements.txt



In [37]:
import confuse

# Create the 'redditrec' configuration object, then load the notebook settings
# from config-subreddit.yaml (path is relative to the working directory).
config = confuse.Configuration(appname='redditrec', modname=__name__)
config.set_file('config-subreddit.yaml')

In [9]:
import json
from kaggle.api.kaggle_api_extended import KaggleApi

# Parse the JSON content of kaggle.json (looked up in the working directory).
with open('kaggle.json') as k_f:
    k_config_data = json.loads(k_f.read())

# Create the API client and hand it the parsed configuration.
k_api = KaggleApi()
k_api._load_config(config_data=k_config_data)

In [31]:
from icecream import ic
import os

# Folder name for the dataset, read from the notebook configuration.
data_folder_name = config['folder_paths']['data_folder_name'].get()
# makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: it also
# creates missing parent folders (nested configured paths) and does not fail
# when the folder is already there.
os.makedirs(data_folder_name, exist_ok=True)
# Switch into the data folder; the unzip cell below opens and extracts the
# archive relative to the working directory.
os.chdir(data_folder_name)

In [11]:
# Search Kaggle and take the first hit's "owner/slug" reference.
search_results = k_api.datasets_list(search='Subreddit Recommender')
if not search_results:
    # Same exception type as the bare `[0]` lookup would raise, but with a
    # message that names the actual problem.
    raise IndexError("Kaggle search for 'Subreddit Recommender' returned no datasets")
reference = search_results[0]['ref']
ic(reference);
# Download the archive for the selected dataset.
k_api.dataset_download_files(reference)

ic| reference: 'timschaum/subreddit-recommender'


In [12]:
from zipfile import ZipFile

# Archive name = the part of the reference after "owner/", plus ".zip".
zf_name = reference.split('/')[1] + '.zip'
# The context manager closes the archive even when extraction raises, which the
# explicit open / extractall / close sequence did not guarantee.
with ZipFile(zf_name) as zf:
    # Extract every member into the current working directory.
    zf.extractall()

In [36]:
# Move one level up: the parent of the current working directory becomes the
# working directory. NOTE(review): re-running this cell climbs another level.
original_dir = os.path.split(os.getcwd())[0]
os.chdir(original_dir)

In [21]:
import pandas as pd
from IPython.display import display
from icecream import ic

# Read the user/subreddit count table from the data folder, preview its first
# rows, and log the frame's shape.
csv_path = os.path.join(data_folder_name, 'reddit_user_data_count.csv')
subreddit_df = pd.read_csv(csv_path)
display(subreddit_df.head())
ic(subreddit_df.shape);

Unnamed: 0,user,subreddit,count
0,------Username------,AskReddit,20
1,------Username------,Barca,9
2,------Username------,FIFA,4
3,------Username------,MMA,5
4,------Username------,RioGrandeValley,3


ic| subreddit_df.shape: (1738737, 3)


In [32]:
import plotly.graph_objects as go

# Output folder for exported figures, read from the notebook configuration.
plot_output_folder = config['folder_paths']['plot_output_folder_name'].get()
# Creates missing parent folders as well and is a no-op if the folder exists
# (replaces the exists()/mkdir() pair, which breaks on nested paths).
os.makedirs(plot_output_folder, exist_ok=True)

group_col, count_col, top_n = 'user', 'count', 5
# Sum `count_col` per `group_col`, sort descending, keep the first top_n totals.
hist_df = (
    subreddit_df.groupby(by=[group_col])[count_col]
    .sum()
    .sort_values(ascending=False)
    .iloc[:top_n]  # explicit positional slice
)
hist_trace = go.Bar(
    x = hist_df.index,
    y = hist_df.values,
    # Bar labels: each total formatted with one decimal place.
    text = [f'{val:.1f}' for val in hist_df.values],
    textposition = 'auto',
    textfont = dict(color='blue'),
)
hist_layout = dict(
    title = f'Distribution of Top {top_n} Active {group_col}',
    xaxis = dict(title=f'{group_col}'),
    # Derived from count_col so the label follows the aggregated column.
    yaxis = dict(title=count_col),
)
fig = go.Figure(data=[hist_trace],layout=hist_layout)
# Export the figure as a PNG file inside the plot output folder.
fig.write_image(os.path.join(plot_output_folder,f"top_{top_n}_eda.png"))

### Azure ML Workspace and Experiment

In [38]:
import azureml.core
import numpy as np
import logging
from azureml.core import Workspace, Dataset, Experiment, Environment, Run

# Obtain the workspace handle via Workspace.from_config().
ws = Workspace.from_config()

# Experiment and environment names are read from the 'azureml' section of the
# notebook configuration.
azureml_cfg = config['azureml']
experiment_name = azureml_cfg['experiment_name'].get()
experiment = Experiment(workspace=ws, name=experiment_name)

env_name = azureml_cfg['environment_name'].get()
# Build the environment from the conda specification file in the working directory.
subreddit_env = Environment.from_conda_specification(
    name=env_name,
    file_path='./conda_dependencies.yml',
)