In [None]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: Data Sampling                     ###
### Date: 05/24/2018                           ###
##################################################

######################################################################
########                     Import Modules                   ########
######################################################################
import py_effo as py_effo

### pandas
# Pandas is for structured data operations and manipulations, extensively used for data preparation
import pandas as pd
pd.set_option('compute.use_bottleneck', False)
pd.set_option('compute.use_numexpr', False)

### numpy
# NumPy stands for Numerical Python, a library that contains basic linear algebra functions, Fourier transforms and
# advanced random number capabilities
import numpy as np 

### Scipy
# SciPy performs a host of statistical and scientific calculations and is built on top of NumPy. NumPy is still imported
# separately above, because SciPy's top-level namespace should not be relied on to expose NumPy functions
# https://oneau.wordpress.com/2011/02/28/simple-statistics-with-scipy/
import scipy as sp


### sklearn
# Sklearn contains basic statistical models
from sklearn.linear_model import LinearRegression
from sklearn.model_selection  import train_test_split
# As well as a module to calculate model performance statistics
from sklearn import metrics

### Statsmodels
import statsmodels.formula.api as sm

### Matplotlib
# Matplotlib is a Python based plotting library with complete 2D support and limited 3D support
%matplotlib inline
import matplotlib as mlb
import matplotlib.pyplot as plt

### Seaborn
# Seaborn is a Python visualization library based on Matplotlib, providing a high-level interface for statistical graphing
# Seaborn supports numpy and pandas data structures as well as statistical routines from scipy and statsmodels
# Note: https://seaborn.pydata.org/introduction.html
import seaborn as sns

### String
# Allows for more flexible solutions for dealing with string characters
import string as st


In [None]:
######################################################################
########                     Import Data                      ########
######################################################################

### Advertising Data
## Alternative source, kept for reference:
## data_link = "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"
# NOTE(review): this is an absolute path, so the cell only runs on machines where that path exists
advertising_path = "/nfs/analysis/analysis/kroger/category_management_transformation/mini_hack_days/python/Advertising.csv"
advertising = pd.read_csv(advertising_path)

######################################################################
########                  Data Exploration                    ########
######################################################################

### Let's determine how the data types look for this data frame
print('Data Type Information')
# DataFrame.info() writes its report straight to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after the report
advertising.info()
print()
### View descriptive statistics about data set
print('Descriptive Statistics of Advertising Data')
print(advertising.describe())
print()
### View if any of our data is null (count of null values per column)
print('Summary of Nulls in Advertising Data')
print(advertising.isnull().sum())

In [None]:
######################################################################
########                  Data Preparation                    ########
######################################################################

### Normalize the column names: strip surrounding whitespace and convert them to lower case
### The lambda operator or lambda function is a way to create small anonymous functions, as seen below
### rename() returns a new data frame here (no inplace mutation), so re-running this cell is safe
advertising = advertising.rename(columns = lambda name: name.strip().lower())

### We will want to separate the sales data, since we are trying to predict it based off our features
### Let's create a Python list of feature names
feature_names = ['tv', 'radio', 'newspaper']

### We can use the Python list we just created to create a data frame comprised of our independent variables
x = advertising[feature_names]

### Let's create a vector of our dependent variable (the value we want to predict)
y = advertising['sales']

### View independent and dependent variables
### x holds the features (independent variables), y holds sales (the dependent variable)
print("Independent Variables")
print(x.head(3))
print()
print("Dependent Variables")
print(y.head(3))

In [None]:
######## Data Splitting ########

### We want to split our independent and dependent data sets into training and testing
### train_test_split is a function from the sklearn.model_selection module
### random_state is fixed so that the same split is produced on every run
### No test_size is passed, so the library's default split proportion is used
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1)

### Report how many rows ended up in each partition
### x_* are the features (independent variables), y_* are the sales values (dependent variable)
print('Independent Variables')
print('Training: ' + str(len(x_train)))
print('Testing: ' + str(len(x_test)))
print()
print('Dependent Variables')
print('Training: ' + str(len(y_train)))
print('Testing: ' + str(len(y_test)))

In [None]:
######################################################################
########                     Modeling                         ########
######################################################################

### Create the sklearn LinearRegression estimator and fit it to the training set in one step
### (fit learns the coefficients and hands back the fitted estimator itself)
linreg = LinearRegression().fit(x_train, y_train)

### Use the fitted model to predict sales for the held-out testing set
y_pred = linreg.predict(x_test)

######## Model Evaluations ########

### RMSE = square root of the mean squared error between actual and predicted test values
rsme_1 = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', rsme_1)

In [None]:
######################################################################
########                 Feature Selection                    ########
######################################################################

### Hypothesis: Newspaper does not improve the quality of our predictions
### Next step: drop newspaper from the features and check whether the RMSE goes down
# A lower RMSE means a smaller prediction error, so lower is better

### Features kept for the second model (newspaper is left out)
feature_names_v2 = ['tv', 'radio']

### Independent variables: every row, restricted to the reduced feature list
x2 = advertising.loc[:, feature_names_v2]

### Dependent variable: the sales column we are trying to predict
y2 = advertising['sales']

### Split into training and testing sets, using the same random_state as the first model's split
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, random_state = 1)

In [None]:
######################################################################
########                     Modeling                         ########
######################################################################

### Fit on the reduced-feature training set
### Note: this refits the existing linreg object rather than creating a new estimator
linreg.fit(x2_train, y2_train)

### Predict sales for the reduced-feature testing set
y2_pred = linreg.predict(x2_test)

######## Model Evaluations ########

### RMSE of the second model's predictions
rsme_2 = np.sqrt(metrics.mean_squared_error(y2_test, y2_pred))
print('RMSE_v2:', rsme_2)
print()

### Collect both RMSE values side by side for comparison
rsme_compare = dict(RSME_v1 = rsme_1, RSME_v2 = rsme_2)

print(rsme_compare)

In [None]:
### Identify which RMSE is better using a comparison
### Displays True when the second model (without newspaper) has the lower error
rsme_2 < rsme_1