## Wine quality prediction

In this notebook we will build a ML model to predict the quality of white wine and save the results.

In [1]:
#Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

### Helper Functions

In [2]:
#Function to read data
def read_data(file_path):
    data = pd.read_csv(file_path, delimiter=';', header=0)
    data.columns = ['fixed_acidity','volatile_acidity','citric_acid','residual_sugar','chlorides',\
                    'free_sulfur_dioxide','total_sulfur_dioxide','density','pH','sulphates','alcohol','quality']
    return data

#Function to remove duplicates
def remove_duplicates(data_frame):
    if data_frame.duplicated().sum()>0:
        data_frame.drop_duplicates(inplace=True)
        return data_frame
    else:
        return data_frame    

#Remove outliers
def remove_outliers(data_frame):
    stat_values = data_frame.quality.describe()
    IQR = stat_values['75%'] - stat_values['25%']
    higher = stat_values['75%'] + 1.5 * IQR
    lower = stat_values['25%'] - 1.5 * IQR
    data_frame = data_frame[data_frame['quality']> lower]
    data_frame = data_frame[data_frame['quality']< higher]
    return data_frame
 
# Encoding the data
def data_encoding(data_frame_column_to_encode, bins, labels):
    data_frame_column_to_encode = pd.cut(data_frame_column_to_encode, bins = bins, labels = labels)
    label_quality = preprocessing.LabelEncoder()
    data_frame_column_to_encode = label_quality.fit_transform(data_frame_column_to_encode)
    return data_frame_column_to_encode

# Splitting the data for train and test
def splitting_data(data_frame):
    X = data_frame.drop(data_frame.columns[-1], inplace = False, axis = 1)
    y = data_frame[data_frame.columns[-1]]
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

#Standardizing the data
def std_data(X_train, X_test):
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    return X_train, X_test
    
#Learning the best model
def wine_quality_predictor(X_train, X_test, y_train, y_test):
    knn = KNeighborsClassifier(algorithm = 'ball_tree', metric = 'manhattan', n_neighbors  = 3)
    knn.fit(X_train, y_train)
    quality_pred = knn.predict(X_test)
    f1score = f1_score(y_test, quality_pred, average='weighted')
    print('The f1_score of the KNN classifiers prediction is: '+str(f1score))
    return knn

def save_results(model,data_frame,file_path):
    X = data_frame.drop(data_frame.columns[-1], inplace = False, axis = 1)
    model_predictions = model.predict(X)
    model_predictions = list(model_predictions)
    for i in range(len(model_predictions)):
        if model_predictions[i] == 1:
            model_predictions[i] = 'good'
        else:
            model_predictions[i] = 'bad'
    data_frame['predicted_quality'] = model_predictions
    data_frame.to_csv(file_path)
    print("\nThe file is saved!")
    

In [3]:
#Path where the dataset is saved
file_path = r"C:\data\winequality-white.csv"

#Load the dataset as a pandas dataframe
wine_data = read_data(file_path)

# Eliminate duplicate records
wine_data = remove_duplicates(wine_data)

# Remove the outliers
wine_data = remove_outliers(wine_data)

# Encoding the variable 'quality'
wine_data['quality'] = data_encoding(wine_data['quality'],(2,6,8), ['bad', 'good'] )

# Train-Test Split
X_train, X_test, y_train, y_test = splitting_data(wine_data)

#Scaling the data
X_train, X_test = std_data(X_train, X_test)

# Getting the predictions
svc = wine_quality_predictor(X_train, X_test, y_train, y_test)

#path to save the results
save_path = r"C:\data\whitewinequality-predictions.csv"

#Saving the predictions
save_results(svc,wine_data,save_path)

The f1_score of the KNN classifiers prediction is: 0.813683320323927

The file is saved!


### In this project we performed an EDA on the White wine dataset, performed data cleaning, outlier detection, variable encoding, data standardization, model testing, parameter tuning and model building for predicting the quality of wine alongwith lucid visualizations to better understand the data and results.