In [15]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: Multivariate Regression           ###
### Date: 06/01/2018                           ###
##################################################

######################################################################
########                     Import Modules                   ########
######################################################################
import py_effo as py_effo

### pandas
# Pandas is for structured data operations and manipulations, extensively used for data preparation
import pandas as pd

### numpy
# NumPy stands for Numerical Python, a library that contains basic linear algebra functions, Fourier transforms and advanced
# random number capabilities
import numpy as np 

### Scipy
# Scipy performs a host of statistical calculations and is built on top of Numpy. Numpy is still imported explicitly above
# (as np) rather than relying on Numpy functions being reachable through the Scipy namespace
# https://oneau.wordpress.com/2011/02/28/simple-statistics-with-scipy/
import scipy as sp

### sklearn
# Sklearn contains basic statistical models
from sklearn.datasets import load_linnerud
from sklearn.datasets import make_friedman1

# As well as a module to calculate model performance statistics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import ensemble
from sklearn import metrics, model_selection, tree

### Statsmodels
# Statsmodels contains statistical models and data sets
import statsmodels.api as sm

### Matplotlib
# Matplotlib is a Python based plotting library with complete 2D support and limited 3D support
%matplotlib inline
import matplotlib as mlb
import matplotlib.pyplot as plt

### Seaborn
# Seaborn is a Python visualization library based on Matplotlib, providing a high-level interface for statistical graphing
# Seaborn supports numpy and pandas data structures as well as statistical routines from scipy and statsmodels
# Note: https://seaborn.pydata.org/introduction.html
import seaborn as sns

### String
# Allows for more flexible solutions for dealing with string characters
import string as st

In [12]:
######################################################################
########                      Example 1                       ########
######################################################################

######################################################################
########                     Import Data                      ########
######################################################################

### Linnerud Data
# linnerud is a data set comprised of exercise data and physiological data
# The loader is imported from sklearn.datasets at the top of the notebook
df = load_linnerud()

### The loaded object exposes its values as plain arrays (.data and .target),
### so each array is wrapped in a labelled DataFrame before use
exercise_columns = ['chins', 'situps', 'jumps']
physiological_columns = ['weight', 'waist', 'pulse']

# x: exercise measurements, y: physiological measurements
x = pd.DataFrame(df.data, columns=exercise_columns)
y = pd.DataFrame(df.target, columns=physiological_columns)

######################################################################
########                  Data Exploration                    ########
######################################################################

### Each report section is a (heading, table) pair: first rows, summary
### statistics, and per-column null counts for both data frames
report_sections = [
    ("Exercise Data", x.head(3)),
    ("Physiological Data", y.head(3)),
    ("Exercise Data Summary", x.describe()),
    ("Physiological Data Summary", y.describe()),
    ("Exercise Null Values", x.isnull().sum()),
    ("Physiological Null Values", y.isnull().sum()),
]

### Print the sections in order, separated by a single blank line
### (no blank line before the first section or after the last one)
for position, (heading, table) in enumerate(report_sections):
    if position > 0:
        print()
    print(heading)
    print(table)

Exercise Data
   chins  situps  jumps
0    5.0   162.0   60.0
1    2.0   110.0   60.0
2   12.0   101.0  101.0

Physiological Data
   weight  waist  pulse
0   191.0   36.0   50.0
1   189.0   37.0   52.0
2   193.0   38.0   58.0

Exercise Data Summary
           chins      situps      jumps
count  20.000000   20.000000   20.00000
mean    9.450000  145.550000   70.30000
std     5.286278   62.566575   51.27747
min     1.000000   50.000000   25.00000
25%     4.750000  101.000000   39.50000
50%    11.500000  122.500000   54.00000
75%    13.250000  210.000000   85.25000
max    17.000000  251.000000  250.00000

Physiological Data Summary
           weight      waist      pulse
count   20.000000  20.000000  20.000000
mean   178.600000  35.400000  56.100000
std     24.690505   3.201973   7.210373
min    138.000000  31.000000  46.000000
25%    160.750000  33.000000  51.500000
50%    176.000000  35.000000  55.000000
75%    191.500000  37.000000  60.500000
max    247.000000  46.000000  74.000000

Ex

In [16]:
######################################################################
########                 Random Forest Modeling               ########
######################################################################

### Fix the seed used by the forest's random number generator so that the
### fitted model and the score below are the same on every Restart & Run All
RANDOM_SEED = 42

### Initialize Random Forest object (500 trees)
rf_multreg = ensemble.RandomForestRegressor(n_estimators=500, random_state=RANDOM_SEED)

### Fit the model on the data frame
# x holds the exercise columns (predictors) and y the physiological columns
# (responses), so a single forest is fit against all response columns at once
rf_multreg.fit(x, y)

######## Model Scoring ########
# NOTE: the model is scored on the same rows it was fitted on, so this value
# is an in-sample figure and will overstate performance on unseen data.
# Use a held-out split or cross-validation for an out-of-sample estimate.
rf_multreg.score(x, y)

0.82006107885377677