Merge pull request #187 from RNAer/master
fix future warnings from scikit-learn
amnona committed Jun 29, 2020
2 parents fb703e3 + 8f42e10 commit 95cb121
Showing 8 changed files with 91 additions and 42 deletions.
39 changes: 20 additions & 19 deletions calour/experiment.py
@@ -342,27 +342,28 @@ def reorder(self, new_order, axis=0, inplace=False):
return exp

def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
'''Get a pandas dataframe of the abundances
Samples are rows, features are columns. Can specify the metadata fields
'''Convert Experiment object to a pandas DataFrame.
Samples are rows and features are columns. You can specify the metadata fields
for the index (default is sample_metadata index) and column labels
(default is feature_metadata index)
(default is feature_metadata index).
Parameters
----------
sample_field : str or None, optional
Name of the sample_metadata column to use for index.
Column name of the sample_metadata to use as the index for the resulting pandas DataFrame.
None (default) is the sample_metadata index
feature_field : str or None, optional
Name of the feature_metadata column to use for column names.
Column name of the feature_metadata to use for column labels for the resulting pandas DataFrame.
None (default) is the feature_metadata index
sparse: bool or None, optional
None (default) to get sparsity based on the underlying Experiment sparsity
True to force to sparse pandas.Dataframe
False to force to standard pandas.Dataframe
None (default) to get sparsity based on the underlying Experiment sparsity.
True to force to sparse pandas.DataFrame;
False to force to standard pandas.DataFrame
Returns
-------
pandas.DataFrame or pandas.SparseDataFrame
pandas.DataFrame
'''
if sample_field is None:
ind = self.sample_metadata.index
@@ -373,12 +374,12 @@ def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
else:
cols = self.feature_metadata[feature_field]

if sparse is not None:
self.sparse = sparse

if self.sparse:
# create list of sparse rows
if self.sparse and sparse:
df = pd.DataFrame.sparse.from_spmatrix(self.data, index=ind, columns=cols)
elif self.sparse:
df = pd.DataFrame(self.data.todense(), index=ind, columns=cols)
elif sparse:
df = pd.DataFrame(scipy.sparse.csr_matrix(self.data), index=ind, columns=cols)
else:
df = pd.DataFrame(self.data, index=ind, columns=cols, copy=True)
return df
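
The sparse branch above replaces the long-deprecated pandas.SparseDataFrame (still named in the old Returns section) with the sparse accessor constructor. A minimal sketch of the two conversion paths, using an invented toy matrix rather than Calour data:

    import numpy as np
    import pandas as pd
    from scipy import sparse

    # toy 3x2 abundance matrix stored as CSR, standing in for Experiment.data
    data = sparse.csr_matrix(np.array([[0.0, 3.0], [1.0, 0.0], [0.0, 2.0]]))
    ind = ['S1', 'S2', 'S3']
    cols = ['F1', 'F2']

    # sparse path: pandas removed pd.SparseDataFrame; this is the current constructor
    df_sparse = pd.DataFrame.sparse.from_spmatrix(data, index=ind, columns=cols)

    # dense path: materialize the matrix first
    df_dense = pd.DataFrame(data.todense(), index=ind, columns=cols)

    print(df_sparse.dtypes)   # Sparse[float64, 0] columns
    print(df_dense.sum())     # plain float64 columns
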
@@ -387,14 +388,15 @@ def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
def from_pandas(cls, df, exp=None):
'''Convert a Pandas DataFrame into an experiment.
Can use an existing calour Experiment (exp) (if supplied) to
obtain feature and sample metadata. Note currently only works
with non-sparse DataFrame
It takes an existing Calour Experiment object (if supplied) to
obtain its feature and sample metadata while replacing the
data with the values from the pandas dataframe. Note that this
currently only works with a non-sparse DataFrame.
Parameters
----------
df : Pandas.DataFrame
The dataframe to use. should contain samples in rows, features in columns.
The dataframe to use. Should contain samples in rows and features in columns.
Index values will be used for the sample_metadata index and column names will be used for feature_metadata index
exp : Experiment, optional
If not None, use sample and feature metadata from the experiment
@@ -419,7 +421,6 @@ def from_pandas(cls, df, exp=None):
feature_metadata = exp.feature_metadata.loc[df.columns.values, ]
cls = exp.__class__

# print(sample_metadata)
newexp = cls(df.values, sample_metadata, feature_metadata,
exp_metadata=exp_metadata, description=description, sparse=False)
return newexp
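
As a usage sketch only (not part of the commit), converting to pandas and back with a hand-built Experiment could look roughly like this; the toy data, metadata, and labels are invented for illustration:

    import numpy as np
    import pandas as pd
    import calour as ca

    # hand-built dense toy Experiment (data and labels invented for the example)
    data = np.array([[1., 2.], [3., 4.]])
    smd = pd.DataFrame({'group': ['a', 'b']}, index=['S1', 'S2'])
    fmd = pd.DataFrame(index=['F1', 'F2'])
    exp = ca.Experiment(data, smd, fmd, sparse=False)

    df = exp.to_pandas(sparse=False)              # samples in rows, features in columns
    new_exp = ca.Experiment.from_pandas(df, exp)  # reuse exp's sample/feature metadata
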
37 changes: 37 additions & 0 deletions calour/tests/data/test_classify.txt
@@ -0,0 +1,37 @@
setosa versicolor virginica Y_TRUE SAMPLE CV
0 0.0 1.0 0.0 versicolor 2 0
1 0.0 0.2 0.8 virginica 10 0
2 0.0 1.0 0.0 versicolor 11 0
3 0.0 1.0 0.0 versicolor 15 0
4 1.0 0.0 0.0 setosa 16 0
5 0.0 0.2 0.8 virginica 20 0
6 0.0 1.0 0.0 versicolor 22 0
7 0.0 0.4 0.6 virginica 27 0
8 1.0 0.0 0.0 setosa 29 0
9 1.0 0.0 0.0 setosa 30 0
10 0.0 0.6 0.4 virginica 31 0
11 1.0 0.0 0.0 setosa 35 0
12 0.0 0.0 1.0 virginica 1 1
13 1.0 0.0 0.0 setosa 5 1
14 0.0 1.0 0.0 versicolor 7 1
15 0.0 0.0 1.0 versicolor 8 1
16 1.0 0.0 0.0 setosa 12 1
17 0.0 1.0 0.0 versicolor 13 1
18 0.0 1.0 0.0 versicolor 14 1
19 0.0 0.4 0.6 versicolor 17 1
20 0.0 0.0 1.0 virginica 26 1
21 0.0 0.2 0.8 virginica 28 1
22 1.0 0.0 0.0 setosa 32 1
23 1.0 0.0 0.0 setosa 33 1
24 1.0 0.0 0.0 setosa 0 2
25 0.0 0.2 0.8 virginica 3 2
26 1.0 0.0 0.0 setosa 4 2
27 1.0 0.0 0.0 setosa 6 2
28 0.0 1.0 0.0 versicolor 9 2
29 0.0 0.8 0.2 versicolor 18 2
30 1.0 0.0 0.0 setosa 19 2
31 0.0 0.0 1.0 virginica 21 2
32 0.0 1.0 0.0 versicolor 23 2
33 0.0 1.0 0.0 versicolor 24 2
34 1.0 0.0 0.0 setosa 25 2
35 0.0 0.2 0.8 virginica 34 2
10 changes: 10 additions & 0 deletions calour/tests/data/test_regress.txt
@@ -0,0 +1,10 @@
Y_PRED Y_TRUE SAMPLE CV
0 137.2 75.0 1 0
1 148.0 141.0 2 0
2 148.0 63.0 7 0
3 134.0 135.0 4 1
4 134.0 138.0 6 1
5 127.2 110.0 8 1
6 117.4 151.0 0 2
7 119.8 206.0 3 2
8 119.8 97.0 5 2
2 changes: 1 addition & 1 deletion calour/tests/test_amplicon_experiment.py
@@ -9,7 +9,7 @@
from unittest import main
from copy import deepcopy

import pandas.util.testing as pdt
import pandas.testing as pdt
import numpy as np
import numpy.testing as npt

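
This import swap reflects pandas deprecating the private pandas.util.testing module in favour of the public pandas.testing, which exposes the same assertion helpers. A small example of the replacement import:

    import pandas as pd
    import pandas.testing as pdt  # replaces the deprecated pandas.util.testing

    left = pd.DataFrame({'a': [1, 2]})
    right = pd.DataFrame({'a': [1, 2]})

    # same helper as before, imported from the public location; raises on mismatch
    pdt.assert_frame_equal(left, right)
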
2 changes: 1 addition & 1 deletion calour/tests/test_dendrogram.py
@@ -11,7 +11,7 @@
import pandas as pd
from skbio import DistanceMatrix, TreeNode
from scipy.cluster.hierarchy import ward
import pandas.util.testing as pdt
import pandas.testing as pdt

from calour._dendrogram import (Dendrogram, UnrootedDendrogram,
SquareDendrogram)
2 changes: 1 addition & 1 deletion calour/tests/test_sorting.py
@@ -9,7 +9,7 @@
from unittest import main
from os.path import join

import pandas.util.testing as pdt
import pandas.testing as pdt
import numpy as np

import calour as ca
27 changes: 14 additions & 13 deletions calour/tests/test_training.py
@@ -13,7 +13,7 @@
from numpy.testing import assert_array_equal, assert_almost_equal
import numpy as np
import pandas as pd
import pandas.util.testing as pdt
import pandas.testing as pdt
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import KFold
@@ -49,7 +49,7 @@ def test_add_sample_metadata_as_features_dense(self):

def test_split_train_test(self):
train, test = self.test2_dense.split_train_test(
test_size=3, stratify='categorical', random_state=7)
test_size=3, stratify='categorical', shuffle=True, random_state=7)

assert_experiment_equal(
test, self.test2_dense.filter_ids(['S3', 'S8', 'S1'], axis='s'))
@@ -62,11 +62,12 @@ def test_regress(self):
y = diabetes.target[:9]
smd = pd.DataFrame({'diabetes': y})
exp = ca.Experiment(X, smd, sparse=False)
run = exp.regress('diabetes', KNeighborsRegressor(), KFold(3, random_state=0))
res = next(run)
obs = pd.read_table(join(self.test_data_dir, 'diabetes_pred.txt'), index_col=0)
run = exp.regress('diabetes', KNeighborsRegressor(), KFold(3, shuffle=True, random_state=0))
observed = next(run)
expected = pd.read_table(join(self.test_data_dir, 'test_regress.txt'), index_col=0)

# make sure the column order are the same for comparison
pdt.assert_frame_equal(res.sort_index(axis=1), obs.sort_index(axis=1))
pdt.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(axis=1))

def test_plot_scatter(self):
res = pd.read_table(join(self.test_data_dir, 'diabetes_pred.txt'), index_col=0)
@@ -93,11 +94,11 @@ def test_classify(self):
exp = ca.Experiment(X, smd, sparse=False)
run = exp.classify('plant', KNeighborsClassifier(),
predict='predict_proba',
cv=KFold(3, random_state=0))
res = next(run)
obs = pd.read_table(join(self.test_data_dir, 'iris_pred.txt'), index_col=0)
pdt.assert_frame_equal(res, obs)
# plot_roc(res)
cv=KFold(3, shuffle=True, random_state=0))
observed = next(run)
expected = pd.read_table(join(self.test_data_dir, 'test_classify.txt'), index_col=0)
pdt.assert_frame_equal(expected, observed)
# plot_roc(observed)
# from matplotlib import pyplot as plt
# plt.show()
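
The added shuffle=True arguments address scikit-learn's FutureWarning (introduced around 0.23) that random_state has no effect while shuffle is False. A minimal illustration of the updated KFold call on the bundled iris data:

    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    X, y = load_iris(return_X_y=True)

    # random_state only takes effect when shuffle=True; passing it with the default
    # shuffle=False is what triggered the FutureWarning this commit silences
    cv = KFold(n_splits=3, shuffle=True, random_state=0)
    for train_idx, test_idx in cv.split(X, y):
        print(len(train_idx), len(test_idx))
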

@@ -145,7 +146,7 @@ def test_plot_roc_warning(self):
'neg': prob,
'Y_TRUE': ['pos'] * 9 + ['neg'],
'CV': [0, 1] * 5})
# re-enable logging because it is disabled in parent setUp
# re-enable logging because it is disabled in the parent setUp
logging.disable(logging.NOTSET)
with self.assertLogs(level='WARNING') as cm:
plot_roc(result)
@@ -253,7 +254,7 @@ def test_sorted_stratified(self):
def test_rep_sorted_strtified(self):
n = self.y.shape[0]
for k in (3, 2):
ssk = RepeatedSortedStratifiedKFold(k, 2)
ssk = RepeatedSortedStratifiedKFold(n_splits=k, n_repeats=2)
for train, test in ssk.split(self.X, self.y):
# check the size of the test fold
ni = int(n / k)
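
Passing n_splits and n_repeats by name, as the test above now does, matches scikit-learn 0.23's deprecation of positional constructor arguments. A short example with the public RepeatedStratifiedKFold, which the custom sorted splitter mirrors:

    from sklearn.model_selection import RepeatedStratifiedKFold

    # scikit-learn 0.23 deprecates positional constructor arguments,
    # so every parameter is passed by keyword
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
    print(cv.get_n_splits())  # 3 splits x 2 repeats = 6
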
14 changes: 7 additions & 7 deletions calour/training.py
@@ -39,7 +39,7 @@
from sklearn.model_selection._split import check_cv, _RepeatedSplits
from sklearn.base import is_classifier, clone
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc, confusion_matrix
from scipy import interp, stats
from scipy import stats
from scipy.sparse import hstack
import pandas as pd
import numpy as np
@@ -166,7 +166,7 @@ class SortedStratifiedKFold(StratifiedKFold):
'''Stratified K-Fold cross validator.
Please see :class:`sklearn.model_selection.StratifiedKFold` for
documentation for parameters, etc. It is very similar to that
documentation for parameters, etc. It is very similar to that class
except this is for regression of numeric values.
This implementation basically assigns a unique label (int here) to
@@ -179,7 +179,7 @@ class SortedStratifiedKFold(StratifiedKFold):
RepeatedSortedStratifiedKFold
'''
def __init__(self, n_splits=3, shuffle=False, random_state=None):
super().__init__(n_splits, shuffle, random_state)
super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

def _sort_partition(self, y):
n = len(y)
@@ -207,10 +207,10 @@ class RepeatedSortedStratifiedKFold(_RepeatedSplits):
SortedStratifiedKFold
'''
def __init__(self, n_splits=5, n_repeats=10, random_state=None):
super().__init__(SortedStratifiedKFold, n_repeats, random_state, n_splits=n_splits)
super().__init__(SortedStratifiedKFold, n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)


def regress(exp: Experiment, field, estimator, cv=RepeatedSortedStratifiedKFold(3, 1), params=None):
def regress(exp: Experiment, field, estimator, cv=RepeatedSortedStratifiedKFold(n_splits=3, n_repeats=1), params=None):
'''Evaluate regression during cross validation.
Parameters
@@ -327,7 +327,7 @@ def plot_scatter(result, title='', cmap=None, cor=stats.pearsonr, cv=False, ax=N
return ax


def classify(exp: Experiment, fields, estimator, cv=RepeatedStratifiedKFold(3, 1),
def classify(exp: Experiment, fields, estimator, cv=RepeatedStratifiedKFold(n_splits=3, n_repeats=1),
predict='predict_proba', params=None):
'''Evaluate classification during cross validation.
@@ -656,7 +656,7 @@ def plot_roc(result, classes=None, title='ROC', cv=True, cmap=None, ax=None):
'have either no true positive or no negative samples in this '
'cross validation for the class %r' % (grp, cls))
continue
mean_tpr = interp(mean_fpr, fpr, tpr)
mean_tpr = np.interp(mean_fpr, fpr, tpr)
tprs.append(mean_tpr)
tprs[-1][0] = 0.0
roc_auc = auc(mean_fpr, mean_tpr)
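
In the hunk above, scipy.interp, which is only an alias for numpy.interp that SciPy has been deprecating, is replaced by the NumPy call. A stand-alone sketch of that interpolation step with invented fold values:

    import numpy as np

    # per-fold ROC interpolation onto a common FPR grid, as plot_roc does;
    # np.interp is a drop-in replacement for the scipy.interp alias
    fpr = np.array([0.0, 0.5, 1.0])
    tpr = np.array([0.0, 0.8, 1.0])
    mean_fpr = np.linspace(0, 1, 11)

    mean_tpr = np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    print(mean_tpr.round(2))
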
