In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score

dataset = pd.read_csv('datasets/wine.data', header=None)
array = dataset.values
X = array[:, 1:]
Y = array[:, 0]

#Split data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25)
#Model
model = LogisticRegression()

In [6]:
"""
class sklearn.model_selection.KFold(n_splits=3, shuffle=False, random_state=None)
    K-Folds cross-validator

    Provides train/test indices to split data in train/test sets. Split dataset into k consecutive 
    folds (without shuffling by default).

    Each fold is then used once as a validation while the k - 1 remaining folds form the training set.
    
    Read more in the User Guide.

Parameters:
    - n_splits : int, default=3
        Number of folds. Must be at least 2.
    - shuffle : boolean, optional
        Whether to shuffle the data before splitting into batches.
    - random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator; If RandomState instance, 
        random_state is the random number generator; If None, the random number generator is the 
        RandomState instance used by np.random. Used when shuffle == True.
"""
from sklearn.model_selection import KFold
num_folds = 10
num_instances = len(X)
seed = 7
k_fold = KFold(num_folds, random_state=seed)
scoring = 'accuracy'
result = cross_val_score(model, X, Y, cv = k_fold.split(X), scoring=scoring)
print('Accuracy: {:0.3f} ({:0.3f})'.format(result.mean(), result.std()))

Accuracy: 0.944 (0.061)


In [8]:
"""
CONFUSION MATRIX 

sklearn.metrics.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)
    
    Compute confusion matrix to evaluate the accuracy of a classification

    By definition a confusion matrix C is such that C_{i, j} is equal to the number of observations known 
    to be in group i but predicted to be in group j.

    Thus in binary classification, the count of true negatives is C_{0,0}, false negatives is C_{1,0}, 
    true positives is C_{1,1} and false positives is C_{0,1}.

    Read more in the User Guide.
Parameters:
    - y_true : array, shape = [n_samples]
        Ground truth (correct) target values.
    - y_pred : array, shape = [n_samples]
        Estimated targets as returned by a classifier.
    - labels : array, shape = [n_classes], optional
        List of labels to index the matrix. This may be used to reorder or select a subset of labels.
        If none is given, those that appear at least once in y_true or y_pred are used in sorted order.
    - sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
Returns:
    - C : array, shape = [n_classes, n_classes]
        Confusion matrix
"""
from sklearn.metrics import confusion_matrix
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[15  1  0]
 [ 0 18  0]
 [ 0  0 11]]


In [10]:
"""
CLASSIFICATION REPORT

sklearn.metrics.classification_report(y_true, y_pred, labels=None, target_names=None, 
                                    sample_weight=None, digits=2)
    
    Build a text report showing the main classification metrics
    
    Read more in the User Guide.

Parameters:
    - y_true : 1d array-like, or label indicator array / sparse matrix
       Ground truth (correct) target values.
    - y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    - labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.
    - target_names : list of strings
        Optional display names matching the labels (same order).
    - sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
    - digits : int
        Number of digits for formatting output floating point values
Returns:
    - report : string
        Text summary of the precision, recall, F1 score for each class.
    
        The reported averages are a prevalence-weighted macro-average across classes (equivalent to 
        precision_recall_fscore_support with average='weighted').
    
        Note that in binary classification, recall of the positive class is also known as “sensitivity”; 
        recall of the negative class is “specificity”.
"""
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

        1.0       1.00      0.94      0.97        16
        2.0       0.95      1.00      0.97        18
        3.0       1.00      1.00      1.00        11

avg / total       0.98      0.98      0.98        45

