### Data Processing

In [3]:
from models import *
from dataclean import *
# from define_classes import *
# # from data_processing import *
# from pca import *
# from csv_clean import *
# from polynomial_regression import *
# from pipe import run_models
%load_ext autoreload
%autoreload 2

In [4]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

### Setting the number of classes and the wine-quality thresholds

In [5]:
# Load the raw wine table; the first CSV column becomes the index.
data = pd.read_csv('data.csv', index_col=0)

# The distinct quality scores, sorted ascending, are the candidate cut points.
quality_scores = np.sort(data['quality'].unique())
print('Possible thresdholds for classification:', quality_scores)

Possible thresdholds for classification: [3 4 5 6 7 8 9]


In [10]:
# Parameters for deriving a categorical target from the numeric quality score.
target = 'cat'  # name of the class column to create
n_c = 3 # bad, average, good wine
thres = [4,6,7]  # cut points on `quality`; how get_classes treats the boundaries is not visible here — TODO confirm
value = 'quality'  # column holding the numeric score

# import define_classes

# CleanData is not defined in this notebook; presumably it comes from the star imports at the top.
cd = CleanData()
dt = cd.DefineTarget(data)

# NOTE(review): `data` is rebound to the result, so re-running this cell feeds the
# already-processed frame back into DefineTarget on the next run.
data = dt.get_classes(target, n_c, thres, value)

In [11]:
# Number of rows in each derived class of the 'cat' column.
data['cat'].value_counts()

1    4974
2    1277
0     246
Name: cat, dtype: int64

In [12]:
# Columns available after the target has been added.
data.columns

Index(['fixedacidity', 'volatileacidity', 'citricacid', 'residualsugar',
       'chlorides', 'freesulfurdioxide', 'totalsulfurdioxide', 'density', 'pH',
       'sulphates', 'alcohol', 'quality', 'type', 'cat'],
      dtype='object')

# Client Features Selection

Since our client doesn't gather all of the data contained in our initial sample, we train the model only on the features that are available to the company.

In [13]:
# Column names kept for the client model: seven measurements, the wine type, and the 'cat' target.
client_data = ['alcohol', 'fixedacidity', 'pH', 'volatileacidity',
       'freesulfurdioxide', 'totalsulfurdioxide', 'residualsugar', 'type','cat']

# rewrite csv with the target variable
# NOTE(review): `data` is narrowed in place, so the dropped columns are no longer
# available to later cells unless the raw CSV is reloaded.
data = data[client_data]
data.to_csv('data_client.csv')

#### We can see a large imbalance: the good (class 2) and bad (class 0) wines are far rarer than the average wines (class 1)

In [17]:
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import recall_score, classification_report, accuracy_score, precision_score, precision_recall_curve
from sklearn.preprocessing import label_binarize
from ordinal import *
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split as tts
from yellowbrick.classifier import PrecisionRecallCurve
from sklearn import metrics
from sklearn.pipeline import Pipeline

In [19]:
# Splitting Data

# Reload the client-feature table written to disk earlier in the notebook.
df = pd.read_csv('data_client.csv', index_col=0)
y = 'cat'  # name of the target column passed to the splitter
# NOTE(review): scaling is applied to the whole frame before the split, so test rows
# may influence the scaler — confirm what cd.scale fits on.
df_mm = cd.scale(df)
X_train, X_test, y_train, y_test = cd.split(df_mm, y)

In [20]:
# Feature columns present in the training matrix.
X_train.columns

Index(['alcohol', 'fixedacidity', 'pH', 'volatileacidity', 'freesulfurdioxide',
       'totalsulfurdioxide', 'residualsugar', 'type'],
      dtype='object')

In [21]:
# Shape of the training label vector.
y_train.shape

(5197,)

### Balancing the Classes : SMOTE

In [22]:
# Oversample every class except the majority one with SMOTE.
# random_state is pinned so the synthetic rows are reproducible across runs.
smt = SMOTE(sampling_strategy='not majority', k_neighbors=3, random_state=1)

# fit_resample is the supported API; fit_sample was deprecated and later removed
# from imbalanced-learn.
X_train_s, y_train_s = smt.fit_resample(np.asarray(X_train), np.asarray(y_train))

# Plain arrays were passed in, so re-attach the feature names for later cells.
col = list(X_train.columns)

X_train_smote = pd.DataFrame(X_train_s, columns=col)
y_train_smote = pd.DataFrame(y_train_s, columns=['cat'])

### Baseline Model : Ordinal Classification 

Ordinal variables are those whose values exist on an arbitrary scale where only the relative ordering between values is significant. In our case, the classes 0 to 2 define increasing wine quality, so the estimation should take into account that our classes are order-dependent.    
One way to handle ordered classes is the 'OrdinalClassifier' class available for Python on GitHub. It creates a binary target vector for each class threshold and can be applied to any classifier that produces class probabilities, i.e. one that implements 'predict_proba'.

In [23]:
# Indicator matrix of the resampled labels over classes 0, 1 and 2.
# NOTE(review): y_train_bin is not referenced again in the visible notebook.
y_train_bin = label_binarize(y_train_smote, classes=[0, 1, 2])

In [29]:
# Wrap a multinomial logistic regression in the project's OrdinalClassifier.
# NOTE(review): the recorded output shows a NameError for RunModels — it is neither
# defined nor explicitly imported in the visible notebook; confirm which module provides it.
mdl = RunModels()
# C is the inverse regularisation strength, so C=1e9 makes the penalty negligible.
clf = mdl.OrdinalClassifier(LogisticRegression(C =1e9 ,solver='newton-cg',fit_intercept=True, multi_class='multinomial'))

NameError: name 'RunModels' is not defined

In [25]:
# Train the ordinal model on the SMOTE-resampled training set.
clf.fit(X_train_smote, y_train_smote)

In [26]:
# Column 1 of the predicted probability matrix.
# NOTE(review): lr_prob is not used again in the visible notebook.
lr_prob = clf.predict_proba(X_test)[:,1]
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split (which was not resampled).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.14      0.69      0.24        55
           1       0.88      0.48      0.62       992
           2       0.39      0.77      0.52       253

    accuracy                           0.54      1300
   macro avg       0.47      0.65      0.46      1300
weighted avg       0.75      0.54      0.58      1300



### Results: 
 - High precision only for the average wine class. The classes with low precision (bad and good) show high recall.

## Voting Classifier

In [None]:
# Candidate base learners for the voting ensemble. Where the estimator supports it,
# class_weight='balanced' is used to counter the class imbalance.
clf1 = LogisticRegression(multi_class='multinomial', random_state=1, class_weight='balanced')
clf2 = RandomForestClassifier(n_estimators=50, random_state=1, class_weight='balanced' )
clf3 = GaussianNB()
clf4 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
clf5 = SVC(kernel="linear", class_weight="balanced", C=1.0, decision_function_shape = 'ovo')
# NOTE(review): clf6 is defined but is not part of the ensemble built in the visible notebook.
clf6 = DecisionTreeClassifier()

In [None]:
# Majority-vote ('hard') ensemble over clf1..clf5.
# NOTE(review): this is fitted on X_train / y_train, not on the SMOTE-resampled
# frames used for the ordinal model — confirm that is intentional.
eclf1 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3), ('knn', clf4), ('svc', clf5)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
y_vt = eclf1.predict(X_test)

In [None]:
# Score of the ensemble on the test split (mean accuracy for scikit-learn classifiers).
# NOTE(review): despite its name, y_vot holds a scalar score, not predicted labels.
y_vot = eclf1.score(X_test, y_test)

In [None]:
# Per-class precision / recall / F1 for the voting ensemble's test predictions.
print(classification_report(y_test, y_vt))

In [None]:
# Confusion matrix of the voting ensemble on the test split.
# The target has three classes (0 = bad, 1 = average, 2 = good), so the label list
# must have three entries; `labels=` fixes the row/column order explicitly.
class_labels = [0, 1, 2]
class_names = ['bad', 'average', 'good']
cnf_matrix = metrics.confusion_matrix(y_test, y_vt, labels=class_labels)

fig, ax = plt.subplots()
# Label rows/columns through the DataFrame so the heatmap's own ticks carry the
# class names (ticks set on the axes beforehand would be overwritten by seaborn).
sns.heatmap(pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
            annot=True, cmap='YlGnBu', fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()

### Pipeline and GridSearchCV

In [27]:
# Run the project's pipeline / grid-search helper on the SMOTE-resampled training data.
# NOTE(review): the recorded output shows a NameError for run_models; its only visible
# import (`from pipe import run_models`) is commented out at the top of the notebook.
# Later cells index the results with [1] (b_e[1], y_pred_p[1]) — presumably one entry
# per candidate model; confirm against run_models.
g_p, b_e, y_pred_p = run_models(X_train_smote, y_train_smote, X_test, y_test)

NameError: name 'run_models' is not defined

 ### Results:
 
  - RandomForest seems to perform the best for the three classes
 

In [None]:
# Per-class precision-recall curves (yellowbrick) for the estimator stored at b_e[1],
# with iso-F1 reference curves and without the micro-averaged curve.
viz = PrecisionRecallCurve(b_e[1], 
                    per_class=True, iso_f1_curves=True,
                    fill_area=False, micro=False, classes=['bad','average','good']
                )

# Fit on the resampled training data, score on the test split, then render.
viz.fit(X_train_smote, y_train_smote)
viz.score(X_test, y_test)
viz.show()

In [None]:
# Confusion matrix for the predictions stored at y_pred_p[1] on the test split.
# The target has three classes (0 = bad, 1 = average, 2 = good), so the label list
# must have three entries; `labels=` fixes the row/column order explicitly.
class_labels = [0, 1, 2]
class_names = ['bad', 'average', 'good']
cnf_matrix = metrics.confusion_matrix(y_test, y_pred_p[1], labels=class_labels)

fig, ax = plt.subplots()
# Label rows/columns through the DataFrame so the heatmap's own ticks carry the
# class names (ticks set on the axes beforehand would be overwritten by seaborn).
sns.heatmap(pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
            annot=True, cmap='YlGnBu', fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()

### Results: Voting vs Gridsearch
- The Pipeline with GridSearchCV shows better predictive results than the voting classifier

### Feature Importance

Model

In [None]:
# Take the first element of b_e[1].
# NOTE(review): presumably b_e[1] is a fitted Pipeline and this is its first step
# (the estimator itself) — confirm against run_models.
m = list(b_e[1])[0]

In [None]:
# Refit the extracted estimator on the SMOTE-resampled training data.
# `modelrf` is the same object as `m`, not a copy.
modelrf = m
modelrf.fit(X_train_smote, y_train_smote)

In [None]:
# Predictions of the refitted model on the test split.
y_pred = modelrf.predict(X_test)

# Rank the features by the fitted model's feature_importances_ and plot the top five.
importances = modelrf.feature_importances_

# Pair each column name with its importance directly; the result is a list of
# (feature_name, importance) tuples sorted from most to least important.
dt_feature_importance = sorted(
    zip(X_train_smote.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)[:5]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar([name for name, _ in dt_feature_importance],
       [score for _, score in dt_feature_importance],
       color='skyblue')
ax.set_title('Top 5 feature importances')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance');

In [None]:
# Per-class precision / recall / F1 of the refitted model on the test split.
y_fit = modelrf.predict(X_test)
print(classification_report(y_test, y_fit))

## Predicting Client's Data

In [None]:
# Random forest used to score the client's wines.
# Only the non-default settings are spelled out. The pasted estimator repr also passed
# `min_impurity_split` and `max_features='auto'`, which newer scikit-learn releases
# reject; for classifiers 'auto' meant 'sqrt', so the model configuration is unchanged.
# random_state is pinned (same seed as the voting-ensemble members) so the forest is
# reproducible across runs.
modelrf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=1)

In [None]:
# Train the client-facing forest on the SMOTE-resampled training data.
modelrf.fit(X_train_smote,y_train_smote)

In [None]:
# Score the client's wine reports with the fitted forest.
wr = pd.read_csv('wine_reports.csv')

# Keep the first eight columns and replace missing values with 0.
# NOTE(review): assumes these eight columns match the training features in name and
# order — confirm against X_train_smote.columns.
lea = wr.iloc[:,:8].fillna(0)

# Use the CleanData instance created near the top of the notebook. `d` is only
# assigned in a later cell, so calling d.scale here fails on Restart & Run All.
# NOTE(review): confirm that scale() applies the scaling used for training rather
# than refitting on the client rows.
client_df = cd.scale(lea)

modelrf.predict(client_df)

# Model Performance: PCA - 3 classes

In [None]:
# Reload the client-feature table and derive PCA features from it.
df=pd.read_csv('data_client.csv', index_col=0)
target = 'cat'  # target column name handed to pca_features
n_com = 5  # value passed as the third argument — presumably the number of components; confirm
# NOTE(review): PcaAnalysis is not defined in the visible notebook and
# `from pca import *` is commented out at the top — confirm where it comes from.
p = PcaAnalysis()
pca_df = p.pca_features(df, target, n_com)

In [None]:
# Replace df's index with a fresh 0..n-1 range, then attach the target column to the
# PCA features by index.
# NOTE(review): assumes pca_df also carries a 0..n-1 index in the same row order — confirm.
df = df.reset_index(drop=True)
pca = pca_df.join(df['cat'])

In [None]:
# Splitting Data
d = DataClean()
pca_scale = d.scale(pca)
X_train_p, X_test_p, y_train_p, y_test_p = d.split(pca_scale,target)

In [None]:
# Non-null row count per class, taken from the first column of the grouped frame.
pca_scale.groupby('cat').count().iloc[:,0]

### Rebalancing Classes

In [None]:
# Oversample every class except the majority one with SMOTE.
# random_state is pinned so the synthetic rows are reproducible across runs.
smt = SMOTE(sampling_strategy='not majority', k_neighbors=3, random_state=1)
# fit_resample is the supported API; fit_sample was deprecated and later removed
# from imbalanced-learn.
X_train_s, y_train_s = smt.fit_resample(np.asarray(X_train_p), np.asarray(y_train_p))

# Plain arrays were passed in, so re-attach the feature names for later cells.
col = list(X_train_p.columns)

# NOTE(review): the training frames are overwritten with their resampled versions,
# so re-running this cell resamples already-resampled data.
X_train_p = pd.DataFrame(X_train_s, columns=col)
y_train_p= pd.DataFrame(y_train_s, columns=['cat'])

In [None]:
# Same pipeline / grid-search helper as before, now on the PCA features.
# NOTE(review): run_models is not imported in the visible notebook (its import is
# commented out at the top) — this cell fails on a fresh kernel until that is resolved.
g_p_p, b_e_p, y_pred_p_p = run_models(X_train_p, y_train_p, X_test_p, y_test_p)

### Results:
- Compared to the previous estimation, using PCA components does not significantly improve performance.
- Random forest continues to perform best, with only a small improvement over the non-PCA model.
    

### PRECISION-RECALL Curve

In [None]:
# Per-class precision-recall curves (yellowbrick) for the estimator stored at b_e_p[1],
# with iso-F1 reference curves and without the micro-averaged curve.
pr = PrecisionRecallCurve(b_e_p[1], 
                    per_class=True, iso_f1_curves=True,
                    fill_area=False, micro=False, classes=['bad','average','good']
                )

# Fit on the resampled PCA training data, score on the PCA test split, then render.
pr.fit(X_train_p, y_train_p)
pr.score(X_test_p, y_test_p)
pr.show()

## Feature Importance

In [None]:
# Random forest fitted on the resampled PCA features, used for the importance plot.
# Only the non-default settings are spelled out. The pasted estimator repr also passed
# `min_impurity_split` and `max_features='auto'`, which newer scikit-learn releases
# reject; for classifiers 'auto' meant 'sqrt', so the model configuration is unchanged.
# random_state is pinned (same seed as the voting-ensemble members) so the forest is
# reproducible across runs.
model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=1)
model.fit(X_train_p, y_train_p)

In [None]:
# Predictions of the PCA-based forest on the PCA test split.
y_pred = model.predict(X_test_p)

# Rank the PCA features by the fitted forest's feature_importances_ and plot the top
# five. Each quantity is computed once; the cell previously repeated both statements.
importances = model.feature_importances_

# Pair each column name with its importance directly; the result is a list of
# (feature_name, importance) tuples sorted from most to least important.
dt_feature_importance = sorted(
    zip(X_train_p.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)[:5]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar([name for name, _ in dt_feature_importance],
       [score for _, score in dt_feature_importance],
       color='skyblue')
ax.set_title('Top 5 feature importances (PCA features)')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance');

#### Results:

- Free and total SO2 (sulfur dioxide), as we could expect, show the highest VIF. We will remove total sulfur dioxide and test our models again in a separate notebook.
- After removing the feature in a separate notebook, we could see that there is no improvement in our accuracy.