# Imports

In [1]:
##########
# basics #
##########

import warnings
warnings.filterwarnings('ignore')
import collections
import datetime
import glob
import hashlib
import itertools
import math
import operator
import os
import pickle
import random
import re
import string
import sys
import time

###########
# science #
###########

import scipy as sp
import numpy as np
import pandas as pd
import fbprophet
import statsmodels
# Use one explicit integer seed for both RNGs. random.seed() returns None, so
# storing its return value and handing that to np.random.seed() would reseed
# NumPy from OS entropy and make every run different.
rseed = 42
random.seed(rseed)
np.random.seed(rseed)

######
# ml #
######

import lime
import xgboost as xgb
import theano as thno
import keras as krs
import tensorflow as tf

###################
# sklearn tooling #
###################

from sklearn import decomposition
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn import grid_search
from sklearn import pipeline
from sklearn import feature_selection

#################
# visualization #
#################

# plotly
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
import cufflinks as cf
# Register the Plotly credentials read from the environment; either value is
# None when its variable is not set.
tls.set_credentials_file(
    username=os.getenv('PLOTLY_USERNAME'),
    api_key=os.getenv('PLOTLY_APIKEY'),
)
# cufflinks defaults: online mode, world-readable, 'pearl' theme.
# NOTE(review): world_readable=True with offline=False presumably makes
# uploaded charts public -- confirm this is intended.
cf.set_config_file(theme='pearl', world_readable=True, offline=False)

# matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import set_matplotlib_formats
# Request 'pdf' and 'svg' as the figure output formats.
# NOTE(review): presumably applies to the inline backend enabled below -- confirm.
set_matplotlib_formats('pdf', 'svg')
# Default figure size, given as (width, height).
mpl.rcParams['figure.figsize']=(12.0,4.0)
# IPython magic (not plain Python): select the inline matplotlib backend.
%matplotlib inline

# seaborn
import seaborn as sns
#sns.set_style('darkgrid')
#sns.set_palette('muted', n_colors=15, desat=None)
#sns.set_context("notebook", font_scale=1.5,
#                rc={"lines.linewidth": 2.5})

Using TensorFlow backend.


In [2]:
############
# sys info #
############

# IPython magics (not plain Python): (re)load the watermark extension, then
# print the author, last-updated date, Python/IPython versions, machine and
# host details, the git hash, and the versions of the packages listed after -p.
%reload_ext watermark
%watermark -a "Ken Cavagnolo" -n -u -v -m -h -g -p numpy,scipy,pandas,\
fbprophet,statsmodels,\
sklearn,theano,tensorflow,keras,xgboost,\
matplotlib,seaborn,plotly

Ken Cavagnolo 
last updated: Fri Dec 15 2017 

CPython 3.5.3
IPython 6.1.0

numpy 1.13.3
scipy 1.0.0
pandas 0.21.1
fbprophet 0.2.1
statsmodels 0.8.0
sklearn 0.19.1
theano 1.0.1
tensorflow 1.4.1
keras 2.1.2
xgboost 0.6
matplotlib 2.1.1
seaborn 0.7.1
plotly 2.0.9

compiler   : GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)
system     : Darwin
release    : 17.2.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
host name  : DrGonzo.local
Git hash   : a1ec8abe367505fb8c2353ed53430e84a262ec82


# Define the problem

## What's the problem?
* Describe the problem informally, e.g. "I need a program that will tell me which tweets will get retweets."

* Describe the problem formally, e.g.
    * Task (T): Classify a tweet that has not been published as going to get retweets or not.
    * Experience (E): A corpus of tweets for an account where some have retweets and some do not.
    * Performance (P): Classification accuracy, the number of tweets predicted correctly out of all tweets considered as a percentage.

* List assumptions, e.g.
    * The specific words used in the tweet matter to the model.
    * The specific user that retweets does not matter to the model.
    * The number of retweets may matter to the model.
    * Older tweets are less predictive than more recent tweets.

* List similar problems, e.g. "A related problem would be email spam discrimination, which uses text messages as input data and needs a binary classification decision."

## Why do I care about this problem?
* What is my motivation for studying this problem?
* What are the benefits a solution provides?
* How will a solution be used, e.g. value proposition?

## How should I solve this problem?
* Brainstorm! Literally type everything as a stream of consciousness (coffee helps here)

These are my thoughts. These are my thoughts. These are my thoughts. These are my thoughts. These are my thoughts. These are my thoughts. These are my thoughts. These are my thoughts. These are my thoughts. These are my thoughts.

* Describe how to solve problem manually
* What data do I have?
* What data do I need?
* What data do I **not** need?
* What do I know the least about in the solution process above?
    * knowledge gap 1
    * knowledge gap 2
* What am I concerned is incorrect in the above solution? Find an expert online and ask them about these items
    * concern 1
    * concern 2

# Data Preparation

## Basics and ETL
* What data sources am I using?
* Format, Clean, Sample
* Diagnose missing values and anomalies (MVA's)
* What am I going to do w/ MVA's, e.g. simulate? averages? discard records?
* Replace missing values
* Remove duplicates
* One-Hot encode categorical features
* Find outliers and explain
* Scale
* Standardize

## Summarize and visualize attributes
* Data structure
* Distributions
* Histograms
* Pairwise scatterplots

## Feature Engineering
* Data has complex multi-dimensional structures that ML algos know how to find and exploit to make decisions
* You want to best expose those structures to the algo
* Some structures may be too dense or too complex for the algo to find without help
* Domain expertise comes into play here
* Attribute decomposition into simpler components
* Attribute aggregation, e.g. hyperattributes

# Algorithm Evaluation

## Experiments
* Test harness = test, train, and validation splits
* Load a set of standard machine learning algorithms into the test harness
* Run 5-10 standard algorithms from each [major algo family](http://machinelearningmastery.com/a-tour-of-machine-learning-algorithms/) using transformed and scaled versions of the dataset
* Find types of algorithms and dataset combinations that give structure to the problem

## Measure Performance
* Select standard performance measures for the selected classes of algos
* Report mean and standard deviation of the accuracy
* Run statistical significance tests
* Which results are meaningful and not just noise?
* Create figures for each algo summarizing the distribution of accuracy
* Consider 10-fold cross-validation, if applicable

# Results

## Algorithm Tuning
* Which is the best model?
* What's my method for searching model param space?
* Usually gonna be grid search
* Don't forget, the more tuning, the greater the chance of overfitting

## Ensemble Methods
* Should results of several models be combined?
* What's the justification?
* If so, how?
    * Bagging
    * Boosting
    * Blending

# Communicate

## Context
* Intro and history
* Statement of problem
* Goal of this project

## Methods and Findings
* Data sources
* Interesting things about data
* Outline of methods
* Implementation techniques
* Evaluation of implementation
* Analysis

## Results and Caveats
* What did I learn?
* What is interesting?
* What is useful?
* Where does the model not work?
* What questions does the model not answer?

## Conclusions
* Craft blog post from results
* Post code to Github
* Drop into social media stream

In [None]:
import pandas as pd

# NOTE(review): absolute path into a Python 2.7 site-packages directory;
# presumably only valid on the original author's machine -- confirm, or point
# this at a local copy of iris.csv.
data = '/Library/Python/2.7/site-packages/pandas/tests/data/iris.csv'
# Parse the CSV, treating the literal string 'NA' as a missing value.
iris_data = pd.read_csv(data, na_values=['NA'])
# Last expression of the cell: summary statistics shown as the cell output.
iris_data.describe()

In [None]:
# IPython magic (not plain Python): select the inline matplotlib backend.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb

# Pairwise plots of the iris columns, coloured by the 'Name' column; dropna()
# is applied first so the plot only receives rows without missing values.
sb.pairplot(iris_data.dropna(), hue='Name')

In [None]:
# Show the rows whose SepalLength is below 5.25 and whose PetalWidth is
# above 0.75 (both conditions must hold).
iris_data.loc[iris_data['SepalLength'].lt(5.25) & iris_data['PetalWidth'].gt(0.75)]