In [1]:
"""
TRAIN AND TEST
"""
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Read the CSV with header=None so every row is treated as data and the
# columns get integer labels.
dataset = pd.read_csv(
    'datasets/wine.data',
    header=None,
)

# Work on the underlying ndarray: column 0 is taken as the target vector,
# every remaining column as the feature matrix.
array = dataset.values
Y, X = array[:, 0], array[:, 1:]

# Estimator constructed with no arguments, i.e. the library's defaults.
model = LogisticRegression()

In [2]:

"""
sklearn.model_selection.train_test_split(*arrays, **options)
    
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps input validation and next(ShuffleSplit().split(X, y)) and application to
    input data into a single call for splitting (and optionally subsampling) data in a one-liner.
    
    Read more in the User Guide.
Parameters:
    - *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes.
    - test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include 
        in the test split. If int, represents the absolute number of test samples. If None, the value is set 
        to the complement of the train size. By default, the value is set to 0.25. The default will change 
        in version 0.21. It will remain 0.25 only if train_size is unspecified; otherwise it will
        complement the specified train_size.
    - train_size : float, int, or None, default None
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include 
        in the train split. If int, represents the absolute number of train samples. If None, the value 
        is automatically set to the complement of the test size.
    - random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator; If RandomState instance, 
        random_state is the random number generator; If None, the random number generator is the RandomState 
        instance used by np.random.
    - shuffle : boolean, optional (default=True)
        Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None.
    - stratify : array-like or None (default is None)
        If not None, data is split in a stratified fashion, using this as the class labels.
Returns:
    - splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.
        
        New in version 0.16: If the input is sparse, the output will be a scipy.sparse.csr_matrix. 
        Else, output type is the same as the input type.
"""
from sklearn.model_selection import train_test_split
# Hold-out evaluation: keep 33% of the rows aside for testing and pin the
# random seed so the same split is produced on every run.
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed
)

# fit() hands back the fitted estimator, so the score on the held-out rows
# can be computed in the same expression.
result = model.fit(X_train, y_train).score(X_test, y_test)
print(result)

0.932203389831


In [3]:
"""
sklearn.model_selection.cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, 
                                        verbose=0, fit_params=None, pre_dispatch=‘2*n_jobs’)

    Evaluate a score by cross-validation

    Read more in the User Guide.

Parameters:
    - estimator : estimator object implementing ‘fit’
        The object to use to fit the data.
    - X : array-like
        The data to fit. Can be for example a list, or an array.
    - y : array-like, optional, default: None
        The target variable to try to predict in the case of supervised learning.
    - groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into train/test sets.
    - scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y).
    - cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy. Possible inputs for cv are:
        
            * None, to use the default 3-fold cross validation, 
            * integer, to specify the number of folds in a (Stratified)KFold,
            * An object to be used as a cross-validation generator.
            * An iterable yielding train, test splits.
    
        For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, 
        StratifiedKFold is used. In all other cases, KFold is used.
    
        Refer to the User Guide for the various cross-validation strategies that can be used here.
    - n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means ‘all CPUs’.
    - verbose : integer, optional
        The verbosity level.
    - fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    - pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel execution. Reducing this 
        number can be useful to avoid an explosion of memory consumption when more jobs get dispatched 
        than CPUs can process. This parameter can be:
    
            * None, in which case all the jobs are immediately created and spawned. Use this for
              lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs.
            * An int, giving the exact number of total jobs that are spawned.
            * A string, giving an expression as a function of n_jobs, as in ‘2*n_jobs’.

Returns:
    - scores : array of float, shape=(len(list(cv)),)
        Array of scores of the estimator for each run of the cross validation.
"""
from sklearn.model_selection import cross_val_score
# k-fold cross-validation: cross_val_score returns one score per fold, which
# is summarised below as mean and standard deviation, both scaled to percent.
num_folds = 10
result = cross_val_score(estimator=model, X=X, y=Y, cv=num_folds)

mean_pct = result.mean() * 100.0
std_pct = result.std() * 100.0
print('Accuracy: {:.03f} ({:.03f})'.format(mean_pct, std_pct))

Accuracy: 95.643 (5.249)


In [4]:
"""
class sklearn.model_selection.LeaveOneOut

    Leave-One-Out cross-validator
    
    Provides train/test indices to split data in train/test sets. Each sample is used once as a test set (singleton) while the remaining samples form the training set.

    Note: LeaveOneOut() is equivalent to KFold(n_splits=n) and LeavePOut(p=1) where n is the number of samples.

    Due to the high number of test sets (which is the same as the number of samples) this cross-validation 
    method can be very costly. For large datasets one should favor KFold, ShuffleSplit or StratifiedKFold.

    Read more in the User Guide.
"""
from sklearn.model_selection import LeaveOneOut
# Leave-one-out cross-validation: the splitter is passed as the cv argument,
# and the resulting score array is summarised as mean and standard deviation
# in percent.
loocv = LeaveOneOut()
result = cross_val_score(estimator=model, X=X, y=Y, cv=loocv)

mean_pct = result.mean() * 100.0
std_pct = result.std() * 100.0
print('Accuracy: {:.03f} ({:.03f})'.format(mean_pct, std_pct))

Accuracy: 95.506 (20.718)
