# Synthetic datasets

---

You are currently looking at **version 1.0** of this notebook. 

---

## Import packages

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.datasets import (make_moons, make_circles, make_friedman1,
                              make_classification, make_blobs, make_regression)

### Constants and defaults

In [None]:
# Sample size passed as n_samples to the dataset generators in this notebook.
N = 500

# Print NumPy floating-point values with a precision of 2.
np.set_printoptions(precision=2)

## Datasets

### Helper functions

#### Sanity check dataset format

In [None]:
def format_check(X, y):
    """Sanity-check a dataset's containers and print a short summary.

    Args:
        X: Feature data as a NumPy array. Callers in this notebook pass both
            2-D matrices and single 1-D columns, so no rank is enforced.
        y: Target values as a NumPy array, expected to have shape ``(n,)``.
            A higher rank only triggers a printed warning.

    Raises:
        AssertionError: If ``X`` or ``y`` is not a ``numpy.ndarray``.

    The summary shows the shape and type of both arrays plus the set of
    distinct target values, or the word ``regression`` when there are more
    than nine distinct values.
    """
    import numpy as np  # local import keeps the helper usable on its own

    assert isinstance(X, np.ndarray), \
        'X must be a numpy.ndarray, got {}'.format(type(X))
    assert isinstance(y, np.ndarray), \
        'y must be a numpy.ndarray, got {}'.format(type(y))

    if y.ndim > 1:
        print('{} must be of shape: (n,)'.format(y.shape))

    # Flatten before building the set: iterating a 2-D array yields row
    # arrays, which are unhashable and would crash the summary.
    distinct = set(np.asarray(y).ravel())
    classes = 'regression' if len(distinct) > 9 else distinct

    print('X:\t {} {}\ny:\t {} {}\nclasses: {}\n'.format(
        X.shape, type(X), y.shape, type(y), classes))
    

#### Plot datasets

def plot_dataset(X_train, X_test, y_train, y_test, ax):
    """Scatter the train and test points of a dataset onto ``ax``.

    Columns 0 and 1 of each feature array are used as the point coordinates
    and the label arrays as the color values. Training points are drawn
    first; test points are drawn afterwards with ``alpha=0.6``.

    NOTE(review): this function reads ``cm_bright``, ``xx_min``, ``xx_max``,
    ``yy_min`` and ``yy_max`` from the enclosing scope, and none of them is
    defined in the visible part of this file -- confirm they exist before
    calling. A later ``plot_dataset(X, y, **kwargs)`` definition in this file
    rebinds the same name.
    """
    # (features, labels, extra scatter options) for each layer, in draw order.
    layers = (
        (X_train, y_train, {}),
        (X_test, y_test, {'alpha': 0.6}),
    )
    for points, labels, extra in layers:
        ax.scatter(points[:, 0], points[:, 1], c=labels, cmap=cm_bright,
                   edgecolors='k', **extra)

    # Fix the visible range and remove the ticks on both axes.
    ax.set_xlim(xx_min, xx_max)
    ax.set_ylim(yy_min, yy_max)
    ax.set_xticks(())
    ax.set_yticks(())
    plt.tight_layout()

In [None]:
def plot_dataset(X, y, **kwargs):
    """Draw a scatter plot of ``y`` against ``X`` and show it.

    Args:
        X: Values for the horizontal axis.
        y: Values for the vertical axis.
        **kwargs: Optional settings.
            title: Plot title (default ``'default Title'``).
            c: Per-point values used to color the markers (default ``None``).
            cmap: Colormap applied to ``c``. Defaults to a five-color
                bright palette.
    """
    from matplotlib.colors import ListedColormap

    title = kwargs.get('title', 'default Title')
    label = kwargs.get('c', None)

    # Honor a caller-supplied colormap; otherwise fall back to the bright
    # palette. ListedColormap accepts the hex strings directly.
    cmap = kwargs.get('cmap')
    if cmap is None:
        cmap = ListedColormap(
            ['#00570d', '#1dea5a', '#0762f1', '#cb00f7', '#ff02a0'])

    scatter_options = {'alpha': 0.6}
    if label is not None:
        # A colormap without color data is ignored by scatter (recent
        # matplotlib versions warn about it), so only pass them together.
        scatter_options.update(c=label, cmap=cmap)

    plt.scatter(X, y, **scatter_options)
    plt.title(title)
    plt.tight_layout()
    plt.show()

## Synthetic datasets

### Simple regression

In [None]:
# Single-feature regression data (bias=150.0, noise=30), seeded with
# random_state=0.
X_R1, y_R1 = make_regression(
    n_samples=N, n_features=1, n_informative=1,
    bias=150.0, noise=30, random_state=0,
)

# Check the array formats, then plot the target against the feature.
format_check(X_R1, y_R1)
plot_dataset(
    X_R1,
    y_R1,
    title='Simple regression distribution',
)

### Complex regression

In [None]:
# make_friedman1 data with seven features, seeded with random_state=0.
X_F1, y_F1 = make_friedman1(n_samples=N, n_features=7, random_state=0)

# Only the feature column at index 2 is checked and plotted against the target.
format_check(X_F1[:, 2], y_F1)
plot_dataset(
    X_F1[:, 2],
    y_F1,
    title='Complex regression problem with one input variable',
)

### Classification

In [None]:
# Classification data with two informative features and no redundant ones.
X_C2, y_C2 = make_classification(n_samples=N,
                                 n_features=2,
                                 n_redundant=0,
                                 n_informative=2,
                                 n_clusters_per_class=1,
                                 flip_y=0.1,
                                 class_sep=0.5,
                                 random_state=0)

# Validate the feature matrix against the label vector. Passing the second
# feature column as the target would leave y_C2 unchecked and make the
# summary report 'regression' instead of the class labels.
format_check(X_C2, y_C2)

# Plot the two feature columns against each other, colored by class label.
plot_dataset(X_C2[:, 0], X_C2[:, 1], c=y_C2, title='Classification distribution')

### Blobs

In [None]:
from sklearn.datasets import make_blobs

# Two-feature blob data with three centers and cluster_std=1.0.
X_blob, y_blob = make_blobs(
    n_samples=N, n_features=2, centers=3,
    cluster_std=1.0, shuffle=True, random_state=0,
)

format_check(X_blob, y_blob)

# Plot the two feature columns against each other, colored by blob label.
plot_dataset(
    X_blob[:, 0],
    X_blob[:, 1],
    c=y_blob,
    title='Blob shaped distribution',
)

### Complex Blobs

In [None]:
# Two-feature blob data with eight centers and cluster_std=1.3.
X_D2, y_D2 = make_blobs(n_samples=N,
                        n_features=2,
                        centers=8,
                        cluster_std=1.3,
                        random_state=4)
# Reduce the blob labels modulo 2 so that only two label values remain.
y_D2 = y_D2 % 2

# Validate the feature matrix against the label vector. Passing the second
# feature column as the target would leave y_D2 unchecked.
format_check(X_D2, y_D2)

# Plot the two feature columns against each other, colored by label.
plot_dataset(X_D2[:, 0], X_D2[:, 1], c=y_D2, title='Complex blob shaped distribution')

### Noisy Linear

1. Create a linearly separable data set.
2. Add random noise to X.

In [None]:
# Classification data with two informative features and no redundant ones.
X_lin, y_lin = make_classification(n_samples=N,
                                   n_features=2,
                                   n_redundant=0,
                                   n_informative=2,
                                   n_clusters_per_class=1,
                                   random_state=1)

# Add seeded uniform noise, scaled by 2, to every feature value in place.
rng = np.random.RandomState(2)
X_lin += 2 * rng.uniform(size=X_lin.shape)

# Validate the feature matrix against the label vector. Passing the second
# feature column as the target would leave y_lin unchecked.
format_check(X_lin, y_lin)

# Plot the two feature columns against each other, colored by class label.
plot_dataset(X_lin[:, 0], X_lin[:, 1], c=y_lin, title='Noisy linear shaped distribution')

### Moons

In [None]:
# make_moons data with noise=0.3, seeded with random_state=0.
X_moon, y_moon = make_moons(n_samples=N, noise=0.3, random_state=0)

format_check(X_moon, y_moon)

# Plot the two feature columns against each other, colored by label.
plot_dataset(
    X_moon[:, 0],
    X_moon[:, 1],
    c=y_moon,
    title='Moon shaped distribution',
)

### Circles

In [None]:
# make_circles data with noise=0.3 and factor=0.5, seeded with random_state=1.
X_circle, y_circle = make_circles(
    n_samples=N, noise=0.3, factor=0.5, random_state=1,
)

format_check(X_circle, y_circle)

# Plot the two feature columns against each other, colored by label.
plot_dataset(
    X_circle[:, 0],
    X_circle[:, 1],
    c=y_circle,
    title='Circle shaped distribution',
)

---

## Sklearn datasets

### Breast cancer

In [None]:
# Breast cancer dataset for classification
from sklearn.datasets import load_breast_cancer

# Load the dataset once and take both arrays from the returned bunch instead
# of calling the loader a second time with return_X_y=True.
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target

format_check(X_cancer, y_cancer)

# Only the first two feature columns are plotted, colored by target value.
plot_dataset(X_cancer[:, 0], X_cancer[:, 1], c=y_cancer, title='Breast Cancer dataset')