In [None]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: Generating data for modeling      ###
### Date: 06/04/2018                           ###
##################################################

#################################################################################################################################

######## sys information ########
### This notebook requires Python 3; fail fast with a clear message on any other major version
import sys
assert sys.version_info.major == 3, "This notebook requires Python 3"

######################################################################
########                     Import Modules                   ########
######################################################################
### py_effo
# Imported as in the original notebook; it is not referenced by the cells visible here
import py_effo

### pandas
# Pandas is for structured data operations and manipulations, extensively used for data preparation
import pandas as pd

### sklearn
# make_blobs is imported from the public sklearn.datasets namespace:
# the old sklearn.datasets.samples_generator module was deprecated in
# scikit-learn 0.22 and removed in 0.24, so importing from it fails on current installs
from sklearn import datasets
from sklearn.datasets import make_blobs

### matplotlib
from matplotlib import pyplot as plt

In [None]:
######################################################################
########            Generate Linear Regression Data           ########
######################################################################

### Generate regression data from scikit-learn
# n_samples: number of data points created
# n_features: number of input features
# noise: standard deviation of the gaussian noise applied to the output
# random_state: fixed seed so that "Restart Kernel -> Run All" reproduces the same data set
REGRESSION_SEED = 42
n_regression_features = 15
x, y = datasets.make_regression(n_samples = 2000, n_features = n_regression_features, noise = 1.5, random_state = REGRESSION_SEED)

### Convert data set into data frames
# Column names f1..fN are derived from the feature count so the two can never drift apart
feature_columns = ['f{}'.format(i) for i in range(1, n_regression_features + 1)]
x = pd.DataFrame(data = x, columns = feature_columns)
y = pd.DataFrame(data = y, columns = ['target'])

### Join them together for a holistic view (target first, then the features)
df = y.join(x)

### View descriptive statistics for data frame
df.describe()

In [None]:
######################################################################
########                Generate Clustering Data              ########
######################################################################

### Generate clustering data from scikit-learn.datasets make_blobs function
# n_samples: number of data points created
# centers: number of clusters
# n_features: number of input features
# random_state: fixed seed so that "Restart Kernel -> Run All" reproduces the same blobs
BLOBS_SEED = 42
colors = ["blue", "red", "green", "purple"]
# One plot color per cluster: deriving the cluster count from the palette
# guarantees that colors[idx] below is always in range
n_centers = len(colors)
x, y = make_blobs(n_samples = 200, centers = n_centers, n_features = 2, random_state = BLOBS_SEED)

### Create data frame from generated data set
x = pd.DataFrame(data = x, columns = ['x1', 'x2'])
y = pd.DataFrame(data = y, columns = ['label'])
df = y.join(x)
groups = df.groupby('label')

### Plot the blobs: one scatter series per cluster label, all on the same axes
fig, ax = plt.subplots()
for idx, cluster_points in groups:
    cluster_points.plot(ax = ax, kind = 'scatter', x = 'x1', y = 'x2', label = idx, color = colors[idx])
ax.set_title('Generated clusters (make_blobs)')
plt.show()
