## Utils

---

You are currently looking at **version 1.0** of this notebook.  
Project/objective:  
Date: 24-may-2018  
Author: Frank Ebbers  
Contributors:   
License: MIT https://opensource.org/licenses/MIT  

---

## Changelog

- 

Useful resources:

https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/

http://blog.juliusschulz.de/blog/ultimate-ipython-notebook

http://nbviewer.jupyter.org/github/jupyter/nbconvert/blob/master/docs/source/customizing.ipynb


In [None]:
# %pylab inline         # imports numpy, pandas, scipy, matplotlib inline
# %matplotlib inline    # shows plots in notebook
# %matplotlib notebook  # same, interactive but slower

### Markdown on Output

In [None]:
from IPython.display import display, Markdown, Latex
# Raw string: '\p' is not a valid Python escape sequence, so a plain literal
# triggers an invalid-escape warning. r'...' keeps the LaTeX backslash as is.
text = r'**some markdown** `truly nice` $\phi$ {}'.format(' what the hack')
display(Markdown(text))

In [None]:
def printmd(text):
    r"""Render ``text`` as Markdown in the notebook output area.

    Example:
        text = r'**some markdown** `truly nice` $\phi$ {}'.format(' what the hack')
        printmd(text)

    Args:
        text: Markdown source; it is wrapped in ``IPython.display.Markdown``.

    Returns:
        Whatever ``IPython.display.display`` returns for the rendered object.
    """
    # Imported lazily so the helper can be pasted into any notebook cell.
    from IPython.display import display, Markdown
    return display(Markdown(text))
printmd(text)

In [None]:
def printmd(text):
    r"""Render ``text`` as Markdown, colouring every ``color::word`` token.

    A token is a run of word characters (used as the CSS colour), a double
    colon, and a run of non-whitespace characters (the text to colour). Each
    token is replaced by ``<span style="color:COLOR">WORD</span>`` before the
    string is rendered.

    Examples:
        md_text = r'** green::some markdown ** `truly nice` $\phi$ {}'.format(' red::hack')
        colored_text = 'red::green'

    Args:
        text: Markdown source. Non-string values are passed to ``Markdown``
            unchanged.

    Returns:
        Whatever ``IPython.display.display`` returns for the rendered object.
    """
    import re
    from IPython.display import display, Markdown

    def _as_span(match):
        # A callable replacement keeps both parts literal. The token is never
        # re-interpreted as a regex pattern or as a replacement template, so
        # text such as "(warn)", "a+b" or "\phi" is coloured correctly.
        return '<span style="color:{}">{}</span>'.format(match.group(1), match.group(2))

    if isinstance(text, str):
        text = re.sub(r"(\w+)::(\S+)", _as_span, text)
    return display(Markdown(text))

md_text = '** green::some markdown ** `truly nice` $\phi$ {}'.format(' red::hack')
printmd(md_text)
printmd('** {}::{} **'.format('green', 'bold_green_text'))
printmd('<green::red> more text')
printmd('red::red, black:orange::orange : blue: :green::c')


In [None]:
# in flask env
!touch requirements.txt && pip freeze > requirements.txt

In [None]:
# Running servers
!jupyter notebook list

In [None]:
# Automatically reload modules - for working with own modules
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [None]:
!pwd

In [None]:
# Ipython.__file__

In [None]:
%psearch?

In [None]:
%config
# https://ipython.org/ipython-doc/3/interactive/magics.html

### Load aliases 
from my_lib/aliases.py

In [None]:
%alias aliases \%load my_lib/aliases.py

In [None]:
%aliases

In [None]:
# %load my_lib/aliases.py
# Generic aliases
%alias show echo
%alias mop echo Monty Python
%alias tree pwd && tree . %s
%alias find find ~ | grep -i %s
%alias find.. find .. | grep -i %s
%alias find. find . | grep -i %s
%alias findx find ~ | grep -Ei %s
%alias read cat %s
%alias aug ls -al | grep Aug | grep -v 2018

# !brew install cowsay
%alias say cowsay
%alias_magic t timeit

# Create directory with packages for importing
%alias make_lib mkdir my_lib && touch my_lib/__init__.py  
%alias add_lib cp %s my_lib/my_module.py
%alias to_lib mkdir my_lib && touch my_lib/__init__.py && cp %s my_lib/my_module.py

---
## Setup notebook
---

### Import the generic libraries used in this notebook

In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
# import matplotlib
import matplotlib.pyplot as plt

from collections import OrderedDict, Counter
import string
import re
import datetime
import requests
import json
import pprint

### Import interactive widgets library

In [None]:
from ipywidgets import interact, fixed
# `IPython.html` was split out of IPython in 4.0. The widgets it used to
# expose now live in the `ipywidgets` package already imported above.
import ipywidgets as widgets
from IPython.display import HTML, IFrame

### Toggle code display

In [None]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Show/hide code"></form>''')

### Manage warnings

In [None]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# warnings.filterwarnings("once")  # show each warning only once; use "always" to print every occurrence

### Set defaults and constants

In [None]:
# seed the random number generator so we all get the same results
rand_seed = 2018
np.random.seed(rand_seed)
epsy = 1e-10

In [None]:
# Use pyplot's rc(): this notebook imports only `matplotlib.pyplot as plt`
# (the bare `import matplotlib` is commented out), so `matplotlib.rc` raises
# NameError on a fresh kernel.
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)

### Check current working directory and file structure

In [None]:
%%!
# directories with token
ls -FGlAhp | grep 'ml'

In [None]:
import datetime
import os

# `datetime` is imported as a module in this notebook, so the class must be
# reached as datetime.datetime (datetime.now() raises AttributeError).
today = datetime.datetime.now().strftime('%Y-%m-%d')
# Directory for today's download, e.g. ../_data/news-2018-05-24.
# The former second .format() argument ('_name') had no placeholder and was ignored.
dirname = os.path.join('..', '_data', 'news-{}'.format(today))
os.makedirs(dirname, exist_ok=True)

In [None]:
def sys_info():
    """Print a short system report by running shell commands through IPython.

    The ``!`` lines are IPython shell escapes, so this function only works
    when defined in an IPython/Jupyter session. It returns nothing; all
    output is written by the shell commands.

    NOTE(review): ``scutil``, ``macstats`` and ``myip`` are not standard on
    every system (``scutil`` is a macOS tool; ``macstats``/``myip`` look like
    locally installed commands or aliases) - confirm they exist before use.
    NOTE(review): ``$NC`` is presumably a shell variable holding a colour
    reset code; it is not defined in this notebook.
    """
    # Computer name
    !echo "You are logged on:$NC "; scutil --get ComputerName
    # Current date and time
    !echo "The current datetime is:$NC " ; date
    # Uptime/load plus hardware stats
    !echo  "\nMachine stats:$NC " ; uptime; macstats
    # Disk usage of local file systems
    !echo  "\nDisk usage:$NC "; df -l
    # Public IP address
    !echo "\nPublic facing IP address:$NC "; myip
    # Trailing blank line
    !echo

In [None]:
sys_info()

In [None]:
# !brew install cowsay
!cowsay 'Hello Awesome Notebook'

In [None]:
# !brew install tree
!tree . | head -10

In [None]:
!find .. | grep -i edit

In [None]:
!pwd
!ls data | head -5

In [None]:
# Find file
def get_file_path(filename, first=5):
    """Print the working directory and the first matches for ``filename`` below it.

    Args:
        filename: text handed to ``grep``, so it is interpreted as a grep pattern.
        first: maximum number of matching paths to print (passed to ``head``).

    Returns:
        None. The matches are only printed by the shell, not returned.
    """
    !pwd
    # IPython substitutes $filename and $first with the Python values before
    # the command line is handed to the shell.
    !find . | grep '$filename' | head -'$first'
    return None

In [None]:
filename = 'full_set.txt'
get_file_path(filename, 10)

### Run helper notebooks

In [None]:
%run 'organ_helper.py'

---
## Utils
---

In [None]:
# Mute terminal Output / Out[]:

_ = 3
3;

### Debug, test

In [None]:
%timeit

### Save as Excel spreadsheet

In [None]:
df = pd.DataFrame(np.ones(10).reshape(-1, 5))

# The context manager saves and closes the workbook on exit.
# ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter('data/excel_test.xlsx') as writer:
    # startrow/startcol are zero-based: N puts the first value in row/column N+1
    df.to_excel(writer, sheet_name='sheet_name', startrow=5, startcol=5,
                header=False, index=False)

### Print output at same position

In [None]:
print('\b\b\b{}'.format(i), sep='', end='', flush=True)
# or
print('{}{}'.format('\b'*len(str(len(text))), len(text)), end='')

### Download files from list

In [None]:
!cat url-list.txt | xargs wget –c

### Show all outputs from a cell, not only the last one

In [None]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [None]:
# !find ~ | grep .ipython/profile_default

#### Run all nodes interactively for all instances of Jupyter (Notebook and Console)

 - Simply create the file `~/.ipython/profile_default/ipython_config.py` containing the lines below.

In [None]:
%%writefile ~/.ipython/profile_default/ipython_config.py
get_config()

# Run all nodes interactively
c.InteractiveShell.ast_node_interactivity = "all"

In [None]:
!cat ~/.ipython/profile_default/ipython_config.py

### Open TextEdit to edit file

In [None]:
!open -e $ipython_config

### RISE slideshow

In [None]:
!pip install RISE

In [None]:
!jupyter-nbextension install rise --py --sys-prefix

In [None]:
!jupyter-nbextension enable rise --py --sys-prefix

In [None]:
# matplotlib legend without repeat

In [None]:
# Legend without repeat
handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys())

In [None]:
plt.rcParams.find_all

In [None]:
%%writefile ../../_data/standard_import.txt

%matplotlib inline
plt.style.use('seaborn-white')

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# %load ../_data/standard_import.txt

%matplotlib inline
plt.style.use('seaborn-white')

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html
# pd.set_option('display.notebook_repr_html', False)


<font  style="color:blue"> **Code**</font>
```python
# test function 
testy_L2 = NN_L2(trainx, trainy, testx)
print( type( testy_L2) )
print( len(testy_L2) )
print( testy_L2[40:50] )
```

<font  style="color:magenta"> **Output**</font>
```
<class 'numpy.ndarray'>
62
[ 2.  2.  1.  0.  0.  2.  0.  0.  0.  0.]
```


In [None]:
train_digits, train_counts = np.unique(train_labels, return_counts=True)
print("Training set distribution:")
print(dict(zip(train_digits, train_counts)))

In [None]:
def tree_plot(clf, features, labels=None):
    """Plot a decision tree in the notebook as a PNG image.

    Args:
        clf: tree estimator handed to ``sklearn.tree.export_graphviz``.
        features: feature names, passed as ``feature_names``.
        labels: class names, passed as ``class_names`` (``None`` by default).

    Returns:
        ``IPython.display.Image`` built from the PNG bytes produced by pydotplus.
    """
    from sklearn import tree
    from pydotplus import graph_from_dot_data
    # `Image` was used without ever being imported, so the call raised NameError.
    from IPython.display import Image

    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=features, class_names=labels,
                                    filled=True, rounded=True, special_characters=True,
                                    impurity=True, proportion=False)
    return Image(graph_from_dot_data(dot_data).create_png())

In [None]:
# Pairwise difference matrix: K[i, j] = x[i] - y[j], shape (len(x), len(y))
x = np.arange(-5, 5.1, 0.1)
y = np.arange(-4, 4.1, 0.1)

# Broadcasting a column of x against a row of y replaces the element-wise loop.
K = x[:, np.newaxis] - y[np.newaxis, :]
K

In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Create all possible pairs (x, y)
x = np.arange(-1, 1.1, 0.1)
y = np.arange(-2, 2.1, 0.1)
xx, yy = np.meshgrid(x, y, sparse=False)
xx[:3, :3]
yy[:3, :3]
[(x.round(1), y.round(1)) for x, y in zip(xx.ravel(), yy.ravel())][:25]

z = np.sin(xx**2 + yy**2) / (xx**2 + yy**2)

print(x.shape, y.shape, xx.shape, yy.shape, z.shape)
h = plt.contourf(x, y, z)

In [None]:
def xy_grid(x, y, ax_pad=0, density=0.1):
    """Return every (x, y) grid point covering the padded range of the inputs.

    The grid runs from ``min - ax_pad`` up to (but excluding) ``max + ax_pad +
    density`` on each axis, in steps of ``density``. The result has one row per
    grid point and two columns (x, y).
    """
    x_low = min(x) - ax_pad
    x_high = max(x) + ax_pad
    y_low = min(y) - ax_pad
    y_high = max(y) + ax_pad

    x_ticks = np.arange(x_low, x_high + density, density)
    y_ticks = np.arange(y_low, y_high + density, density)
    grid_x, grid_y = np.meshgrid(x_ticks, y_ticks)

    # Flatten both coordinate grids and pair them up column-wise.
    return np.column_stack((grid_x.ravel(), grid_y.ravel()))

x = np.arange(-1, 1.1)
y = np.arange(-2, 2.1)
xy_grid(x, y, ax_pad=0, density=0.05)

In [None]:
def xy_grid(x, y, ax_pad=0, density=0.1):
    """Build a padded (x, y) grid and return it together with its bounds.

    Returns a dict with:
      'array'          - one row per grid point, columns (x, y)
      'xx', 'yy'       - the 2-D coordinate grids from ``np.meshgrid``
      'xmin' .. 'ymax' - the padded bounds used to build the grid
    """
    x_low, x_high = min(x) - ax_pad, max(x) + ax_pad
    y_low, y_high = min(y) - ax_pad, max(y) + ax_pad

    x_ticks = np.arange(x_low, x_high + density, density)
    y_ticks = np.arange(y_low, y_high + density, density)
    grid_x, grid_y = np.meshgrid(x_ticks, y_ticks)

    result = {}
    result['array'] = np.c_[grid_x.ravel(), grid_y.ravel()]
    result['xx'] = grid_x
    result['yy'] = grid_y
    result['xmin'] = x_low
    result['xmax'] = x_high
    result['ymin'] = y_low
    result['ymax'] = y_high
    return result

In [None]:
def confusion_df(y_true, y_pred, labels=None):
    """Return the confusion matrix as a labelled DataFrame.

    Rows are predicted classes and columns are true classes (the sklearn
    matrix is transposed).

    Args:
        y_true: true class labels.
        y_pred: predicted class labels.
        labels: class labels to use for rows/columns, in order. When omitted,
            the sorted unique values found in ``y_true`` and ``y_pred`` are
            used, so the function no longer depends on a global ``svc``.

    Returns:
        pandas.DataFrame with index name 'Predicted' and column name 'True'.
    """
    # Local import: the notebook imports confusion_matrix only in a later cell.
    from sklearn.metrics import confusion_matrix

    if labels is None:
        labels = np.unique(np.concatenate([np.asarray(y_true).ravel(),
                                           np.asarray(y_pred).ravel()]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm.T, index=labels, columns=labels)
    cm_df.index.name = 'Predicted'
    cm_df.columns.name = 'True'
    return cm_df

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm
# Accuracy = correctly classified samples (the diagonal) / all samples.
# np.trace works for any number of classes; np.eye(2) only fits the binary case.
np.trace(cm) / np.sum(cm)

In [None]:
# confusion matrix/pivot table
def pivot(df, ftr1, ftr2):
    """Count co-occurrences of two columns of ``df``.

    Rows of the result are the values of ``ftr2``, columns are the values of
    ``ftr1``; combinations that never occur are NaN.
    """
    pair_counts = df.groupby([ftr1, ftr2]).size()
    wide = pair_counts.unstack(ftr2)
    return wide.transpose()

In [None]:
def pivot(y, y_pred):
    """Confusion-style count table of true vs. predicted labels.

    Rows are 'Predicted label' values, columns are 'True label' values;
    combinations that never occur are NaN.
    """
    true_col, pred_col = 'True label', 'Predicted label'
    frame = pd.DataFrame({true_col: y, pred_col: y_pred})
    pair_counts = frame.groupby([true_col, pred_col]).size()
    return pair_counts.unstack(pred_col).T

In [None]:
def pivot(y, y_pred, labels=['Negative', 'Positive']):
    """Count table of true vs. predicted labels with readable class names.

    ``y`` and ``y_pred`` must expose a ``.name`` (e.g. pandas Series); those
    names become the axis names. The sorted unique values of ``y`` are mapped
    onto ``labels`` in order. Rows are predicted classes, columns true classes.
    """
    value_to_label = dict(zip(np.unique(y), labels))
    true_col = y.name
    pred_col = y_pred.name

    frame = pd.DataFrame({true_col: y, pred_col: y_pred})
    frame.replace(to_replace=value_to_label, inplace=True)

    pair_counts = frame.groupby([true_col, pred_col]).size()
    return pair_counts.unstack(pred_col).T

In [None]:
def pivot(y, y_pred, labels=['Negative', 'Positive']):
    """Count table of true vs. predicted labels with readable class names.

    Args:
        y: true labels (Series, array or list).
        y_pred: predicted labels (Series, array or list).
        labels: display names; the sorted unique values of ``y`` are mapped
            onto them in order.

    Returns:
        DataFrame of counts with true classes as rows and predicted classes
        as columns; combinations that never occur are NaN.

    The axis names are taken from ``y.name`` / ``y_pred.name`` when both
    exist, are not None and differ. Otherwise 'True label' / 'Predicted label'
    are used.
    """
    dict_labels = dict(zip(np.unique(y), labels))

    y_name = getattr(y, 'name', None)
    y_pred_name = getattr(y_pred, 'name', None)
    # Unnamed Series have name None. Two None (or otherwise equal) names would
    # collide as dict keys below and silently drop one of the columns.
    if y_name is None or y_pred_name is None or y_name == y_pred_name:
        y_name, y_pred_name = 'True label', 'Predicted label'

    df = pd.DataFrame({y_name: y, y_pred_name: y_pred})
    df = df.replace(to_replace=dict_labels)
    return df.groupby([y_name, y_pred_name]).size().unstack(y_pred_name)

In [None]:
# Workaround to fix bug in statsmodels .summary()
# https://github.com/statsmodels/statsmodels/issues/3931
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Bug fixes - statsmodels not compatible with current stable version of scipy/pandas

In [None]:
# Statsmodels bug fix:
from pandas.core import datetools

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Workaround to fix bug in statsmodels .summary() - missing stats.chisqprob function
# https://github.com/statsmodels/statsmodels/issues/3931
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [None]:
# parse datetime 
df = pd.read_csv('../../_data/Smarket.csv', usecols=range(1, 10), index_col=0, parse_dates=True) 
# select columns
pd.read_csv('../../_data/Advertising.csv', usecols=[1,2,3,4])
pd.read_csv('../../_data/Credit.csv', usecols=list(range(1,12)))
pd.read_csv('../../_data/Auto.csv', na_values='?').dropna()

In [None]:
# Probability counter with threshold
thres = 0.9
band = [0.4, 0.6]

pred_p = clf.predict_proba(X_test)
pos_p = pred_p[:, 1]  # second column of predict_proba
np.unique(pos_p > thres, return_counts=True)
# Element-wise band test. A chained comparison (a > arr > b) raises ValueError
# on arrays, and the bounds were inverted (nothing is < 0.4 and > 0.6 at once).
np.unique((band[0] < pos_p) & (pos_p < band[-1]), return_counts=True)

In [None]:
margin = 0.03
below_upper = pred_p[:, 1] < 0.5 + margin
above_lower = 0.5 - margin < pred_p[:, 1]
np.unique(below_upper, return_counts=True), 'smaller than .5+margin'
np.unique(above_lower, return_counts=True), 'bigger than .5-margin'
# A prediction lies inside the margin only when BOTH conditions hold. Adding the
# two separate count vectors (as before) does not count those predictions.
# bincount(..., minlength=2) always yields [n_false, n_true].
'Predictions between margin({}) [false true] : {}'.format(
    margin, np.bincount((below_upper & above_lower).astype(int), minlength=2))

In [None]:
np.set_printoptions(suppress=True, precision=3, threshold=10)
# np.set_printoptions()  # formatter gets reset

In [None]:
# Output numpy as laTex
import IPython
ip = IPython.core.getipython.get_ipython()
from IPython.display import display, Math, Latex
import sympy as sp
from IPython.display import Latex, display, display_latex
from sympy.interactive import printing
printing.init_printing(use_latex='png')

from sympy import Matrix as spx
sp.init_printing(use_unicode=True)
# sp.init_printing(use_latex='png')
ip.display_formatter.formatters['text/latex'].enabled = True

spx([[1, -1], [3, 4], [0, 2]])

In [None]:
# http://docs.sympy.org/0.7.2/tutorial.html#printing-tutorial
from sympy.printing import print_latex as plx
from sympy.printing import print_mathml as plm

from sympy.printing import latex as slx
from sympy.printing import mathml as slm

ip.display_formatter.formatters['text/latex'].enabled = True

plx([[1, -1], [3, 4], [0, 2]])
ltx = slx([[1, -1],
           [3, 4], 
           [0, 2]])
display(Latex('${}$'.format(ltx)))

display(Latex(slx([[1, -1], [3, 4], [0, 2]])))


In [None]:
import IPython
ip = IPython.core.getipython.get_ipython()

# Sympy are setting:
ip.display_formatter.formatters['text/latex'].enabled = False
# to disable all latex output. You can revert this with 
ip.display_formatter.formatters['text/latex'].enabled = True

In [None]:
from IPython.display import Latex, display, display_latex
from sympy.interactive import printing
printing.init_printing(use_latex='png')
display(Latex('$\\alpha^2 + \\eta$'))

display(Latex(slx([[1, -1], [3, 4], [0, 2]])))

In [None]:
# set precision
pd.set_option('float_format', '{:.2f}'.format)

In [None]:
# qgrid.version_info
!jupyter notebook --version
!jupyter lab --version
!pip list | grep widgets
!pip list | grep jupyter
!pip list | grep qgrid

In [None]:
N = 5
Z = np.zeros(N*N).reshape(-1, N)
Z = [distance(MNIST.data[i], MNIST.data[j]) for (i,j), v in np.ndenumerate(np.zeros(N*N).reshape(-1, N))]
Z = np.array(Z).reshape(-1, N)
Z

In [None]:
N = 5
Z = np.zeros(N*N).reshape(-1, N)
for (i,j), v in np.ndenumerate(Z):
    Z[i,j] = distance(MNIST.data[i], MNIST.data[j])
Z

In [None]:
def pairwise_map(data, fn, N=0):
    """Apply ``fn`` to every ordered pair of the first ``N`` items of ``data``.

    Args:
        data: indexable collection.
        fn: callable taking two items and returning a number.
        N: number of leading items to use; 0 (default) means ``len(data)``.

    Returns:
        (N, N) float array with ``Z[i, j] = fn(data[i], data[j])``. Empty
        ``data`` yields an empty (0, 0) array.
    """
    if N == 0:
        N = len(data)
    # Allocate the square directly: zeros(N*N).reshape(-1, N) fails for N == 0.
    Z = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            Z[i, j] = fn(data[i], data[j])
    return Z

In [None]:
import numpy as np
# Peak to peak - (max - min)
b1 = np.array([2,1,1])
b2 = np.array([1,3,4])
b3 = np.array([1,4,9])

B = np.stack((b1, b2, b3), axis=0)
np.ptp(B, axis=0)
np.ptp(B, axis=1)

In [None]:
# Piecewise binning and clipping
x = np.arange(50)
x
y = 20
np.piecewise(x, [x>0 , x>10, x>20], [0, 10, y])
np.piecewise(x, [x<10, ((10<=x) & (x<40)), x>=40], [-1, lambda x: x, 1])

In [None]:
def cut_outliers(pd_series, z_value=1.96):
    """Clip values that lie more than ``z_value`` standard deviations from the mean.

    Args:
        pd_series: 1-D numeric data (pandas Series, NumPy array or list).
        z_value: half-width of the accepted band in standard deviations.

    Returns:
        NumPy float array of the same length. Values below
        ``mean - z_value * std`` are set to that bound, values above
        ``mean + z_value * std`` are set to that bound, others are unchanged.

    NOTE(review): ``np.mean`` / ``np.std`` are called on the input as given.
    For a pandas Series NumPy delegates to the Series methods, whose defaults
    (e.g. ddof) may differ from the ndarray ones - confirm.
    """
    lb = np.mean(pd_series) - z_value * np.std(pd_series)
    ub = np.mean(pd_series) + z_value * np.std(pd_series)
    # np.piecewise returns the dtype of its input, so integer data truncated
    # the float bounds, and values matching no condition (NaN) became 0.
    # Clipping a float copy avoids both.
    return np.clip(np.asarray(pd_series, dtype=float), lb, ub)

In [None]:
def cut_min_max(pd_series, mini, maxi):
    """Clip values to the closed interval [``mini``, ``maxi``].

    Args:
        pd_series: 1-D numeric data (pandas Series, NumPy array or list).
        mini: lower bound; smaller values are replaced by it.
        maxi: upper bound; larger values are replaced by it.

    Returns:
        NumPy float array of the same length as the input.
    """
    # np.piecewise returns the dtype of its input, so integer data truncated
    # fractional bounds, and values matching no condition (NaN) became 0.
    # Clipping a float copy avoids both.
    return np.clip(np.asarray(pd_series, dtype=float), mini, maxi)

#### DataFrame to dictionary - key is index

Drops NaN's from index!

In [None]:
cities = pd.read_excel('../_data/global-city-population-estimates.xlsx', 'CITIES-OVER-300K')
cities.sample(5)
cities_dict = cities[['Urban Agglomeration', '2015']].set_index('Urban Agglomeration').to_dict()['2015']
cities_dict

In [None]:
df.to_dict
df.to_dict('index')
df.to_dict('index').items()
airport_dict = airports[['lon', 'lat']].to_dict('index') 

In [None]:
# Reference: https://www.census.gov/popclock/world/nl

Next we can use a lambda to pull out the outcome from the attributes dictionary.

In [None]:
df['outcome'] = df['outcome'].map(lambda x: x['outcome'])
df.head()

In [None]:
def extract_dict_value(x, ftr):
    """Return ``x[ftr]``, or 0 when ``x`` cannot be indexed by ``ftr``.

    Args:
        x: mapping or sequence (or any value found in the column).
        ftr: key or index to look up.

    Returns:
        The looked-up value, or 0 for a missing key/index or a
        non-subscriptable ``x``.
    """
    try:
        return x[ftr]
    except (LookupError, TypeError):
        # LookupError: missing key or index out of range.
        # TypeError: x is not subscriptable (e.g. None or a NaN float).
        return 0

df['score'] = df['score_weight'].map(lambda x: extract_dict_value(x, 'score'))

In [None]:
# [:] copies the slice

In [None]:
X_train.select_dtypes('float')

### open notebook link
[Open Notebook](./nlp_part2_sentiment_topic_similarity_classification_.ipynb)

In [None]:
file = '../_credentials/twitter_credentials.txt'
pickle_file = '../_credentials/twitter_credentials.pkl'

In [None]:
# SECURITY: the four API credentials used to be hard-coded in this cell.
# Anything that was committed or shared must be treated as leaked and revoked
# with the provider. The values are now requested interactively instead.
import os
from getpass import getpass

# Same layout as before: four credentials, one per line, in the same order.
credential_lines = [getpass('Twitter credential {}/4: '.format(k)) for k in range(1, 5)]
with open(file, 'w') as fh:
    fh.write('\n'.join(credential_lines) + '\n')
os.chmod(file, 0o600)  # restrict the file to its owner

In [None]:
def recursive_keys(json, i=2):
    """Recursively print the keys of a nested dict/list structure.

    For every dict entry:
      * string value -> prints ``-- key``
      * list value   -> prints ``list key`` and descends into the list
      * dict value   -> prints ``dict key`` and descends into the dict
      * other values (numbers, None, ...) print nothing.
    For every list item its position is printed (followed by a space, no
    newline) before descending into the item.

    Args:
        json: parsed JSON-like object (dict, list or scalar).
        i: unused; kept so existing calls stay valid.

    Returns:
        None. The structure is only printed.
    """
    # isinstance (not type(...) ==) so dict/list subclasses such as
    # OrderedDict are traversed too.
    if isinstance(json, dict):
        for key, value in json.items():
            if isinstance(value, str):
                print('--', key)
            elif isinstance(value, list):
                print('list', key)
                recursive_keys(value)
            elif isinstance(value, dict):
                print('dict', key)
                recursive_keys(value)
    elif isinstance(json, list):
        for position, item in enumerate(json):
            print(position, end=' ')
            recursive_keys(item)

In [None]:
def xml2dict(xml):
    """Parse an XML property-list (plist) string into Python objects.

    The string is encoded to bytes and handed to ``plistlib.loads`` with the
    XML format, so only plist-flavoured XML is understood. The result is
    whatever object the plist root describes (a dict for a ``<dict>`` root).
    """
    from plistlib import FMT_XML, loads

    payload = str.encode(xml)
    return loads(payload, fmt=FMT_XML)

In [None]:
def wifi_positioning(saved_map=False):
    """Return the current (lat, lon), preferring Wi-Fi positioning over IP lookup.

    ``wifi_hotspots()`` supplies the request payload. When it returns None the
    function prints a notice and falls back to ``ip_location()``; otherwise the
    payload is sent to the HERE positioning endpoint through ``geo_location()``.

    Args:
        saved_map: only referenced by commented-out code below; it currently
            has no effect.

    Returns:
        Tuple ``(lat, lon)``.

    NOTE(review): ``wifi_hotspots``, ``ip_location`` and ``geo_location`` are
    not defined in this notebook - they must be provided elsewhere.
    """
    
    payload = wifi_hotspots()
    
    if payload is None:
        # Fallback: location reported for the public IP address
        print('WIFI positioning failed, returning IP location from provider')
        _, lat, lon, radius = ip_location()
        latlon = '{}, {}'.format(lat, lon)
    else:
        api_url_locate = 'https://pos.cit.api.here.com/positioning/v1/locate'
        json_result = geo_location(api_url_locate, payload)
        lat, lon = json_result['location']['lat'], json_result['location']['lng']
        latlon = '{}, {}'.format(lat, lon)
        radius = json_result['location']['accuracy']

    # `latlon` and `radius` are only consumed by the commented-out reporting code below.
    # Print address and lat, lon, radius and open/show location on map
#     print('\ngeoPy Nominatim location: ', geolocator.reverse(latlon))
#     rev_geocoder(lat, lon, radius)
#     here_maps = 'https://wego.here.com/directions/mix/{},{}/?map={},{},15'.format(lat, lon, lat, lon)
#     google_maps = 'https://www.google.com/maps/search/{},{}/@{},{},17z'.format(lat, lon, lat, lon)
#     !open -a Safari $here_maps; open -a Safari $google_maps
#     if saved_map: mia_saved_map(lon, lat, radius)
        
    return lat, lon

In [None]:
import quilt
# https://quiltdata.com
quilt.install("ResidentMario/geoplot_data")

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices