In [None]:
# Reload imported modules automatically before each cell runs, so edits to local code are picked up
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp firsttry
# all_slow

# Steroids or Not

Aspirationally, a machine learning classifier that takes an image of a person as input and says whether they're using steroids or not.

This notebook is a script that grabs images from r/nattyorjuice to make a steroid detector

In [None]:
#hide
from nbdev.showdoc import *

## Install/import stuff

In [None]:
#hide
!pip install .[dev]


In [None]:
#export
from fastbook import *
from fastai.vision.widgets import *
# load_dotenv (imported below from dotenv) loads environment variables from a .env file

import os
import praw
import requests
from PIL import Image
from io import BytesIO
from dotenv import load_dotenv

In [None]:
#export
class PrawClient():
    '''Thin wrapper that builds PRAW clients from credentials in the environment.

    Reads REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET (after loading any .env
    file) and stores them, together with a fixed user agent, for later calls.
    '''

    def __init__(self):
        # Pull variables from a .env file into the process environment first
        load_dotenv()
        self.client_id = os.getenv('REDDIT_CLIENT_ID')
        self.client_secret = os.getenv('REDDIT_CLIENT_SECRET')
        self.user_agent = 'User-Agent: Steroid detector bot by /u/thetreecycle'
        # Echo the client id as a quick check that the environment was picked up
        print(f'Starting instance with client_id {self.client_id}')

    def reddit(self):
        '''Return a new praw.Reddit instance built from the stored credentials.'''
        credentials = dict(
            client_id=self.client_id,
            client_secret=self.client_secret,
            user_agent=self.user_agent,
        )
        return praw.Reddit(**credentials)

    def subreddit(self, subreddit):
        '''Return the handle for the named subreddit from a freshly built client.'''
        client = self.reddit()
        return client.subreddit(subreddit)

In [None]:
# Handle to r/nattyorjuice; the search cells below query it
nattyorjuice = PrawClient().subreddit('nattyorjuice')

## Create directory structure

In [None]:
# Root folder for the dataset. Each class gets its own sub-folder, because the
# DataBlock further down labels every image by its parent folder name.
storage_path = Path('/storage/steroidsornot')

natural_path = storage_path / 'natural'

steroids_path = storage_path / 'steroids'

# parents=True also creates storage_path itself on a fresh machine;
# exist_ok=True makes the cell safe to re-run.
for path in [natural_path, steroids_path]:
    path.mkdir(parents=True, exist_ok=True)

## Queries

In [None]:
# Suffix appended to every query so that only i.redd.it links are matched
images_only = ' url:i.redd.it'

# Posts flaired Natty, excluding the FAKE and Juice flairs
natty_query = f'flair:Natty AND NOT flair:FAKE AND NOT flair:Juice{images_only}'

juicy_query = f'flair:Juicy{images_only}'

# NOTE(review): the two-word flair is unquoted; confirm that Reddit search
# treats it as a single flair rather than "flair:FAKE" plus the word "NATTY".
fake_natty_query = f'flair:FAKE NATTY{images_only}'

## Try it with one natty picture

In [None]:
# Take the first result of the natty search (limit=1 asks for a single submission)
natty_submission = next(nattyorjuice.search(query=natty_query,limit=1))

In [None]:
# Index 2 of the preview's resolutions is hard-coded here; the get_thumbnail_index
# helper defined later chooses the index per submission instead.
natty_image_url = natty_submission.preview['images'][0]['resolutions'][2]['url']

# Download the thumbnail
response = requests.get(natty_image_url)

In [None]:
# Last path segment of the response URL, with any query string removed
filename = response.url.split('/')[-1].split('?')[0]

In [None]:
# Display the derived filename as a sanity check
filename

In [None]:
# Open the downloaded bytes in memory to preview the image before saving it
bytesio_image = BytesIO(response.content)
image = Image.open(bytesio_image)
image

In [None]:
# Write the image into the natural folder only when the download returned HTTP 200
if response.status_code == 200:
    (natural_path / filename).write_bytes(response.content)


## Try it with many pictures

In [None]:
def get_thumbnail_index(submission):
    '''
    Choose which entry of the preview's resolutions list to download.

    Index 2 appears to always be the 320-pixel-wide thumbnail, which is a good
    size for training. When the source image is narrower than 320 pixels, fall
    back to the last entry in the list (index -1), presumably the largest
    rendition available.
    '''
    source_width = submission.preview['images'][0]['source']['width']
    return 2 if source_width >= 320 else -1

In [None]:
def get_images(subreddit, query, path, limit=3):
    '''Download thumbnail images matching `query` from `subreddit` into `path`.

    For each search result, the thumbnail chosen by `get_thumbnail_index` is
    fetched and written to `path` under the file name taken from the URL.

    Args:
        subreddit: object exposing `search(query=..., limit=...)`.
        query: search string passed to `subreddit.search`.
        path: directory (path-like supporting `/`) that receives the files.
        limit: maximum number of submissions requested from the search.

    Prints how many images were actually saved. Submissions with no usable
    preview and downloads that do not return HTTP 200 are skipped and are not
    counted.

    Raises:
        requests.RequestException: on network failure or when a download
            exceeds the 30 second timeout.
    '''
    submissions = subreddit.search(query=query, limit=limit)

    images_count = 0
    for submission in submissions:
        # Presumably not every submission carries preview data (e.g. removed
        # images); skip those instead of aborting the whole download.
        preview = getattr(submission, 'preview', None)
        if not preview:
            continue
        resolutions = preview['images'][0]['resolutions']
        if not resolutions:
            continue

        thumbnail_index = get_thumbnail_index(submission)
        image_url = resolutions[thumbnail_index]['url']

        # A timeout keeps one stalled connection from hanging the entire run
        response = requests.get(image_url, timeout=30)
        if response.status_code != 200:
            continue

        # Last path segment of the URL, without the query string
        filename = response.url.split('/')[-1].split('?')[0]
        with open(path / filename, 'wb') as f:
            f.write(response.content)

        # Count only files that were actually written
        images_count += 1

    print(f"I downloaded {images_count} images to {path}")

In [None]:
# get_images(nattyorjuice, natty_query, natural_path, 1000)
# I downloaded 51 images to /storage/steroidsornot/natural

In [None]:
# get_images(nattyorjuice, juicy_query, steroids_path, 1000)
# I downloaded 243 images to /storage/steroidsornot/steroids

In [None]:
# get_images(nattyorjuice, fake_natty_query, steroids_path, 1000)
# I downloaded 162 images to /storage/steroidsornot/steroids

## Load data into dataloaders

In [None]:
# Collect the image files under the dataset root
filenames = get_image_files(storage_path)
filenames

In [None]:
# Check the downloaded files; `failed` holds the ones that did not pass verification
failed = verify_images(filenames)
failed

In [None]:
# Delete every file that failed verification
failed.map(Path.unlink)

## From Data to DataLoaders

In [None]:
# Recipe for turning the image folders into a labelled dataset
fit_people = DataBlock(
    # Inputs are images; targets are categories
    blocks=(ImageBlock, CategoryBlock),
    # Items are the image files found under the path given to .dataloaders()
    get_items=get_image_files,
    # Label each image from its parent folder
    get_y=parent_label,
    # Hold out 20% for validation; the fixed seed keeps the split repeatable
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    # Resize every item to 128 before batching
    item_tfms=Resize(128),
)

In [None]:
# Build the DataLoaders from the dataset root
dataloaders = fit_people.dataloaders(storage_path)

In [None]:
# Show up to four validation samples in a single row
dataloaders.valid.show_batch(max_n=4, nrows=1)

### Data Augmentation

In [None]:
# Same recipe, now with batch-level augmentation applied at twice the default strength (mult=2)
fit_people = fit_people.new(item_tfms=Resize(128), batch_tfms=aug_transforms(mult=2))
dataloaders = fit_people.dataloaders(storage_path)
# unique=True presumably repeats a single image so the augmentation effect is visible (TODO confirm)
dataloaders.train.show_batch(max_n=8, nrows=2, unique=True)

## Training Your Model, and Using It to Clean Your Data

In [None]:
# Rebuild the recipe with random-resized crops and default batch augmentation
fit_people = fit_people.new(
    item_tfms=RandomResizedCrop(224, min_scale=0.5),
    batch_tfms=aug_transforms())
# Load from the dataset root so that both class folders are included. The global
# `path` is only the loop variable left over from the directory-creation cell
# (the steroids folder), which would give the model a single class.
dataloaders = fit_people.dataloaders(storage_path)

In [None]:
# Build a resnet18 learner that reports error_rate, then fine-tune it with fine_tune(4)
learn = cnn_learner(dataloaders, resnet18, metrics=error_rate)
learn.fine_tune(4)

In [None]:
# Plot a confusion matrix from the trained learner
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
# Plot the 5 top-loss images in two rows
interp.plot_top_losses(5, nrows=2)

In [None]:
# Interactive widget for flagging images to delete or relabel; the selections are
# consumed by the cleaner.delete() / cleaner.change() loops in the next cell
cleaner = ImageClassifierCleaner(learn)
cleaner

In [None]:
#hide
# for idx in cleaner.delete(): cleaner.filenames[idx].unlink()
# for idx,cat in cleaner.change(): shutil.move(str(cleaner.filenames[idx]), path/cat)

In [None]:
# Run inference on a local file named natural.jpg.
# NOTE(review): no cell shown here creates this file; it must already exist in the working directory.
learn.predict('natural.jpg')

## From Data to DataLoaders

### Data Augmentation

## Training Your Model, and Using It to Clean Your Data

## Turning Your Model into an Online Application

### Using the Model for Inference

### Creating a Notebook App from the Model

### Turning Your Notebook into a Real App

### Deploying your app

## How to Avoid Disaster

### Unforeseen Consequences and Feedback Loops

## Get Writing!

## Questionnaire

1. Provide an example of where the bear classification model might work poorly in production, due to structural or style differences in the training data.
1. Where do text models currently have a major deficiency?
1. What are possible negative societal implications of text generation models?
1. In situations where a model might make mistakes, and those mistakes could be harmful, what is a good alternative to automating a process?
1. What kind of tabular data is deep learning particularly good at?
1. What's a key downside of directly using a deep learning model for recommendation systems?
1. What are the steps of the Drivetrain Approach?
1. How do the steps of the Drivetrain Approach map to a recommendation system?
1. Create an image recognition model using data you curate, and deploy it on the web.
1. What is `DataLoaders`?
1. What four things do we need to tell fastai to create `DataLoaders`?
1. What does the `splitter` parameter to `DataBlock` do?
1. How do we ensure a random split always gives the same validation set?
1. What letters are often used to signify the independent and dependent variables?
1. What's the difference between the crop, pad, and squish resize approaches? When might you choose one over the others?
1. What is data augmentation? Why is it needed?
1. What is the difference between `item_tfms` and `batch_tfms`?
1. What is a confusion matrix?
1. What does `export` save?
1. What is it called when we use a model for getting predictions, instead of training?
1. What are IPython widgets?
1. When might you want to use CPU for deployment? When might GPU be better?
1. What are the downsides of deploying your app to a server, instead of to a client (or edge) device such as a phone or PC?
1. What are three examples of problems that could occur when rolling out a bear warning system in practice?
1. What is "out-of-domain data"?
1. What is "domain shift"?
1. What are the three steps in the deployment process?

### Further Research

1. Consider how the Drivetrain Approach maps to a project or problem you're interested in.
1. When might it be best to avoid certain types of data augmentation?
1. For a project you're interested in applying deep learning to, consider the thought experiment "What would happen if it went really, really well?"
1. Start a blog, and write your first blog post. For instance, write about what you think deep learning might be useful for in a domain you're interested in.