## Overview
Explore the data to find out how we can help a wine company.

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as sm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

RANDOM_STATE = 42


In [2]:
# Load the wine-quality table from the CSV next to the notebook and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='winequalityN.csv')
df.head(n=5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Summary statistics of the raw data. The `count` row in the output below is
# not the same for every column, so some columns contain missing values.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6487.0,6489.0,6494.0,6495.0,6495.0,6497.0,6497.0,6497.0,6488.0,6493.0,6497.0,6497.0
mean,7.216579,0.339691,0.318722,5.444326,0.056042,30.525319,115.744574,0.994697,3.218395,0.531215,10.491801,5.818378
std,1.29675,0.164649,0.145265,4.758125,0.035036,17.7494,56.521855,0.002999,0.160748,0.148814,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


## Business Problem
The company wants to produce more high-quality wine. What makes a wine high quality? We recast the problem as classification, predict whether a wine is high quality, and use the model's feature importances to learn what drives quality. "Using this info we now know what makes a wine good; now we can do research on how to make those things happen" — i.e., how to raise the pH if that is what makes a wine good, or how to have more (or less) residual sugar. Using this information, the company can produce more high-quality wine and make more money from it.


In [10]:
# Binarise the target: ratings 7-9 become 1 ("high quality"), ratings 3-6
# become 0.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the `df['quality']` selection: that chained
# in-place form is deprecated in pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is enabled. Re-running this cell is harmless because 0 and 1
# are not keys of the mapping.
quality_to_label = {3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1}
df['quality'] = df['quality'].replace(quality_to_label)
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6487.0,6489.0,6494.0,6495.0,6495.0,6497.0,6497.0,6497.0,6488.0,6493.0,6497.0,6497.0
mean,7.216579,0.339691,0.318722,5.444326,0.056042,30.525319,115.744574,0.994697,3.218395,0.531215,10.491801,0.196552
std,1.29675,0.164649,0.145265,4.758125,0.035036,17.7494,56.521855,0.002999,0.160748,0.148814,1.192712,0.397421
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,0.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,0.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,0.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,0.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,1.0


In [None]:
def performance(y_true, y_predict):
    """
    Calculates and returns three classification scores between
    true and predicted values - first F1, then accuracy, then ROC AUC.
    From Regression Trees and Model Optimization - Lab

    Parameters
    ----------
    y_true : true class labels.
    y_predict : predicted values, aligned with `y_true`.

    Returns
    -------
    list
        `[f1, accuracy, roc_auc]`, in that order.
    """
    # The metric functions come from `sklearn.metrics` (imported in the
    # notebook's import cell). `y_predict` is forwarded unchanged to all
    # three, so ROC AUC is computed from whatever the caller passes in.
    return [
        f1_score(y_true, y_predict),
        accuracy_score(y_true, y_predict),
        roc_auc_score(y_true, y_predict),
    ]
def get_score(model, x_train, x_test, y_train, y_test):
    """
    Fits `model` on the training split and scores its test-split predictions.

    The model is fitted in place via `model.fit(x_train, y_train)`; its
    `predict(x_test)` output is then compared with `y_test` by `performance`,
    whose result is returned as-is.
    """
    model.fit(x_train, y_train)
    return performance(y_test, model.predict(x_test))
def plot_score(parameter, parameter_name, mse_results, f1_results, roc_auc_results):
    """
    Draws three line charts of score versus hyperparameter value.
    Original code from Hyperparameter tuning - Lab

    One 12x6 figure is shown per metric, in this order: f1 (blue),
    accuracy (red), ROC AUC (green). `parameter` supplies the x values and
    `parameter_name` the x-axis label.

    Note: despite its name, `mse_results` is plotted and labelled as
    accuracy; the parameter name is kept for compatibility with callers.
    """
    # (y values, matplotlib format string, legend label, y-axis label)
    panels = (
        (f1_results, 'b', 'f1', 'f1-score'),
        (mse_results, 'r', 'accuracy', 'accuracy'),
        (roc_auc_results, 'g', 'ROC AUC', 'ROC AUC'),
    )
    for scores, line_format, legend_label, axis_label in panels:
        plt.figure(figsize=(12, 6))
        plt.plot(parameter, scores, line_format, label=legend_label)
        plt.xlabel(parameter_name)
        plt.ylabel(axis_label)
        plt.legend()
        plt.show()
    
def tuner(classifier_name:str, parameter, parameter_name:str, x_train, x_test, y_train, y_test):
    """
    Sweeps one hyperparameter of a classifier and plots the resulting scores.

    For every value in `parameter` a fresh classifier is built with
    `random_state=RANDOM_STATE`, `class_weight={0: 1, 1: 5}` and
    `<parameter_name>=<value>`. It is fitted and scored through `get_score`,
    and the collected f1 / accuracy / ROC AUC series are passed to
    `plot_score`.

    Parameters
    ----------
    classifier_name : name of a classifier class available in the notebook's
        global namespace (e.g. 'DecisionTreeClassifier'). A class object is
        accepted as well.
    parameter : iterable of values to try for the hyperparameter.
    parameter_name : keyword name of the hyperparameter being varied.
    x_train, x_test, y_train, y_test : train/test split forwarded to
        `get_score`.

    Raises
    ------
    NameError
        If `classifier_name` is a string that is not defined globally.
    """
    # Resolve the class by name instead of eval()-ing a formatted string: the
    # old template contained the literal `{0:1, 1:5}`, which `str.format`
    # parses as a replacement field and rejects with ValueError.
    if isinstance(classifier_name, str):
        try:
            classifier_cls = globals()[classifier_name]
        except KeyError:
            raise NameError("name '{}' is not defined".format(classifier_name)) from None
    else:
        classifier_cls = classifier_name

    accuracy_results = []
    f1_results = []
    roc_auc_results = []
    for para in parameter:
        classifier = classifier_cls(
            random_state=RANDOM_STATE,
            class_weight={0: 1, 1: 5},
            **{parameter_name: para},
        )
        f1, accuracy, roc_auc = get_score(classifier, x_train, x_test, y_train, y_test)
        f1_results.append(f1)
        accuracy_results.append(accuracy)
        roc_auc_results.append(roc_auc)
    # Plot against this function's own arguments; the previous call referenced
    # the undefined names `max_depths` and `p_name`.
    plot_score(parameter, parameter_name, accuracy_results, f1_results, roc_auc_results)

In [24]:
# Frequency table of the distinct values in every column, keyed by column name.
results = {}
for column, column_values in df.items():
    results[column] = column_values.value_counts()
results
# Last expression of the cell: this estimator's repr is what gets displayed.
DecisionTreeClassifier(class_weight={0: 1, 1: 5})

DecisionTreeClassifier(class_weight={0: 1, 1: 5})

In [25]:
# Candidate values for each decision-tree hyperparameter to sweep.
parameters = {
    'min_samples_split' : np.arange(2, 100),
    # Integer depths 1..30. `np.linspace(1, 30, 30)` produced the same values
    # as floats, but scikit-learn requires `max_depth` to be an int.
    'max_depth' : np.arange(1, 31),
    'min_samples_leaf' : np.arange(2,40),
    'min_weight_fraction_leaf' : np.linspace(0,.5,30),
    'max_features' : np.linspace(.01,1,30),
    'max_leaf_nodes' : np.arange(2, 100)
}

In [31]:
# Preview the feature columns only; this returns a new frame and leaves `df` untouched.
df.drop(columns='quality')


Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...
6491,red,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [None]:
# Separate the label column from the features, then hold out 20% of the rows
# for testing. `target` is a one-column DataFrame.
target = df[['quality']]
# `inplace=Tr` was a NameError; the flag must be `True`.
# This removes `quality` from `df` itself, so the cell can only be run once
# per kernel session without reloading the data.
df.drop('quality', axis=1, inplace=True)
x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=.2, random_state=RANDOM_STATE)