In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
DATASET = '/dsa/data/all_datasets/wine-quality/winequality-red.csv'
os.path.exists(DATASET)

True

# Load and shuffle

In [3]:
dataset=pd.read_csv(DATASET, sep= ';').sample(frac=1).reset_index(drop=True)


In [4]:
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
dataset.shape, dataset.columns

((1599, 12),
 Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol', 'quality'],
       dtype='object'))

In [6]:
X=np.array(dataset.iloc[:, :10])
y=np.array(dataset.quality)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [7]:
X_train.shape

(1279, 10)

In [8]:
X_test.shape

(320, 10)

In [9]:
X.shape

(1599, 10)

# Dumb way

In [10]:
model = GaussianNB()
model.fit(X[:3], y[:3])
model.score(X[:3], y[:3])


1.0

In [11]:
y[:4]

array([6, 6, 5, 6])

In [12]:
model.score(X[:-3], y[:-3])

0.39974937343358397

# Wrong way

In [13]:
model.fit(X, y)
model.score(X, y)


0.51907442151344585

# Train/validate

In [14]:
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.5

# Confusion matirx

In [15]:
{i:np.sum(y==i) for i in  np.unique(y)}

{3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18}

In [16]:
y_pred =model.predict(X_test)>=6

In [17]:
confusion_matrix(y_test>=6, y_pred)

array([[ 68,  80],
       [ 30, 142]])

In [18]:
y_true1=[0 , 1, 1,   0,0,0,  1,1,1,1]
y_pred1=[0 , 0, 0,   1,1,1,  1,1,1,1]

In [19]:
confusion_matrix(y_true1, y_pred1)

array([[1, 3],
       [2, 4]])

# Take-aways

* Tools: python, pandas, numpy, sklearn, jupyter notebook
* Randomization
* Train/validate
* Overfitting
* Scores, confusion matrix