# Import software libraries and load the dataset #

In [None]:
import sys                             # Read system parameters.
import numpy as np                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                    # Manipulate and analyze data.
import matplotlib as mpl               # Create 2D charts.
import matplotlib.pyplot as plt
import sklearn                         # Perform data mining and analysis.
from sklearn import datasets
from sklearn.utils import shuffle
from time import time                  # Calculate training time.
import math

# Report the interpreter and library versions this notebook is running against.
print('Libraries used in this project:')
print(f'- Python {sys.version}')
print(f'- NumPy {np.__version__}')
print(f'- pandas {pd.__version__}')
print(f'- Matplotlib {mpl.__version__}')
print(f'- scikit-learn {sklearn.__version__}\n')

# Fetch scikit-learn's bundled wine dataset and report how many records it holds.
wine = datasets.load_wine()
print(f'Loaded {len(wine.data)} records.')

# Get acquainted with the dataset #

In [None]:
# Build a DataFrame from the feature matrix and append the class label as 'target'.
data_raw = pd.DataFrame(wine['data'], columns=wine['feature_names'])
data_raw['target'] = wine['target']

# Shuffle the rows with a fixed seed so the order is repeatable, then
# renumber the index from 0 (dropping the pre-shuffle index).
data_raw = shuffle(data_raw.copy(), random_state = 765)
data_raw.reset_index(inplace = True, drop = True)

# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
data_raw.info()             # View data types and see if there are missing entries.
data_raw.head(10)           # View first 10 records.

# Examine a general summary of statistics

In [None]:
# Compute the per-column summary statistics, then print them with floats
# shown to two decimal places (the format option only applies inside the block).
summary_stats = data_raw.describe()
two_decimals = '{:.2f}'.format
with pd.option_context('float_format', two_decimals):
    print(summary_stats)

# Examine the distribution of various features

In [None]:
# Use Matplotlib to plot figures.
%matplotlib inline
mpl.rc('axes', labelsize = 14)
mpl.rc('xtick', labelsize = 12)
mpl.rc('ytick', labelsize = 12)

data_raw.hist(figsize = (20, 15));
plt.figure();

# Split the label from the dataset

In [None]:
# 'target' is the value to be predicted, so it is kept out of the training
# features and placed in its own single-column DataFrame of labels.
label_columns = ['target']

# Every remaining column is used as a training feature.
training_columns = [
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280/od315_of_diluted_wines',
    'proline',
]

X = data_raw[training_columns]
y = data_raw[label_columns]

# Show the shapes side by side to confirm the columns were split as intended.
print(f'Original set:    {data_raw.shape}')
print('------------------------------')
print(f'Training data:   {X.shape}')
print(f'Training labels: {y.shape}')

# Transform `magnesium` and `proline`

In [None]:
# Work on a copy so the frame X was sliced from is not modified, then replace
# 'proline' and 'magnesium' with their natural logarithms in a single step.
X = X.copy()
X[['proline', 'magnesium']] = np.log(X[['proline', 'magnesium']])

# Print summary statistics of both transformed columns to two decimal places.
two_decimals = '{:.2f}'.format
with pd.option_context('float_format', two_decimals):
    print(X['magnesium'].describe())
    print('\n-----------------------')
    print(X['proline'].describe())

# Preview the first rows of the transformed training data.
X.head()

# Create a multinomial logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression trained with the stochastic 'sag' solver.
# - `multi_class` is intentionally not passed: the parameter is deprecated in
#   recent scikit-learn releases, and the default already fits a multinomial
#   model when the target has more than two classes and the solver is 'sag'.
# - `random_state` fixes the solver's data shuffling, so the separate
#   cross-validation runs later in the notebook (predictions, scores and
#   probabilities) are reproducible and consistent with one another.
log_reg = LogisticRegression(solver='sag', max_iter=10000, random_state=765)

print('Multinomial logistic regression model created.')

# Train the model using stratified *k*-fold cross-validation to split the dataset

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score

# Flatten the single-column label frame once and pass it as a 1-D array below.
y_flat = np.ravel(y)

# Time the 5-fold cross-validated prediction run (wall-clock, in milliseconds).
# Note that the measured span covers the whole cross_val_predict call.
start = time()
predict = cross_val_predict(log_reg, X, y_flat, cv=5)
end = time()
train_time = 1000 * (end - start)

# Score the model on each of the 5 test folds and average the results.
fold_scores = cross_val_score(log_reg, X, y_flat, cv=5)
score = fold_scores.mean()

print(f'Multinomial logistic regression model took {train_time:.2f} milliseconds to fit.')
print(f'Mean score on test sets: {np.round(score * 100):.0f}%')

In [None]:
# Retrieve the cross-validated class probabilities (one column per class).
proba = cross_val_predict(log_reg, X, np.ravel(y), cv = 5, method = 'predict_proba')

# Build the comparison table from the untransformed feature values in
# data_raw. Undoing the log transform with np.exp(np.log(x)) is subject to
# floating-point round-off and may not return the original values exactly;
# reading them from data_raw gives the exact values and the same row order.
results_comparison = data_raw[X.columns].copy()
results_comparison['PredictedWine'] = predict
results_comparison['ActualWine'] = np.ravel(y)

# Add one percentage column per class: ProbWine0, ProbWine1, ...
for wine_class in range(proba.shape[1]):
    results_comparison[f'ProbWine{wine_class}'] = np.round(proba[:, wine_class] * 100, 2)

# View examples of the predictions compared to actual wine.
results_comparison.head(20)