Merge pull request #187 from RNAer/master
fix future warnings from scikit-learn
amnona committed Jun 29, 2020
2 parents fb703e3 + 8f42e10 commit 95cb121
Showing 8 changed files with 91 additions and 42 deletions.
39 changes: 20 additions & 19 deletions calour/experiment.py
@@ -342,27 +342,28 @@ def reorder(self, new_order, axis=0, inplace=False):
return exp

def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
'''Get a pandas dataframe of the abundances
Samples are rows, features are columns. Can specify the metadata fields
'''Convert Experiment object to a pandas DataFrame.
Samples are rows and features are columns. You can specify the metadata fields
for the index (default is sample_metadata index) and column labels
(default is feature_metadata index)
(default is feature_metadata index).
Parameters
----------
sample_field : str or None, optional
Name of the sample_metadata column to use for index.
Column name of the sample_metadata to use as the index for the resulting pandas DataFrame.
None (default) is the sample_metadata index
feature_field : str or None, optional
Name of the feature_metadata column to use for column names.
Column name of the feature_metadata to use for column labels for the resulting pandas DataFrame.
None (default) is the feature_metadata index
sparse: bool or None, optional
None (default) to get sparsity based on the underlying Experiment sparsity
True to force to sparse pandas.Dataframe
False to force to standard pandas.Dataframe
None (default) to get sparsity based on the underlying Experiment sparsity.
True to force to sparse pandas.DataFrame;
False to force to standard pandas.DataFrame
Returns
-------
pandas.DataFrame or pandas.SparseDataFrame
pandas.DataFrame
'''
if sample_field is None:
ind = self.sample_metadata.index
@@ -373,12 +374,12 @@ def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
else:
cols = self.feature_metadata[feature_field]

if sparse is not None:
self.sparse = sparse

if self.sparse:
# create list of sparse rows
if self.sparse and sparse:
df = pd.DataFrame.sparse.from_spmatrix(self.data, index=ind, columns=cols)
elif self.sparse:
df = pd.DataFrame(self.data.todense(), index=ind, columns=cols)
elif sparse:
df = pd.DataFrame(scipy.sparse.csr_matrix(self.data), index=ind, columns=cols)
else:
df = pd.DataFrame(self.data, index=ind, columns=cols, copy=True)
return df
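
The sparse branch above replaces the long-deprecated pandas.SparseDataFrame (still named in the old Returns section) with the sparse accessor constructor. A minimal sketch of the two conversion paths, using an invented toy matrix rather than Calour data:

    import numpy as np
    import pandas as pd
    from scipy import sparse

    # toy 3x2 abundance matrix stored as CSR, standing in for Experiment.data
    data = sparse.csr_matrix(np.array([[0.0, 3.0], [1.0, 0.0], [0.0, 2.0]]))
    ind = ['S1', 'S2', 'S3']
    cols = ['F1', 'F2']

    # sparse path: pandas removed pd.SparseDataFrame; this is the current constructor
    df_sparse = pd.DataFrame.sparse.from_spmatrix(data, index=ind, columns=cols)

    # dense path: materialize the matrix first
    df_dense = pd.DataFrame(data.todense(), index=ind, columns=cols)

    print(df_sparse.dtypes)   # Sparse[float64, 0] columns
    print(df_dense.sum())     # plain float64 columns
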
@@ -387,14 +388,15 @@ def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
def from_pandas(cls, df, exp=None):
'''Convert a Pandas DataFrame into an experiment.
Can use an existing calour Experiment (exp) (if supplied) to
obtain feature and sample metadata. Note currently only works
with non-sparse DataFrame
It takes an existing Calour Experiment object (if supplied) to
obtain its feature and sample metadata while replacing the
data with the values from the pandas dataframe. Note that this
currently only works with a non-sparse DataFrame.
Parameters
----------
df : Pandas.DataFrame
The dataframe to use. should contain samples in rows, features in columns.
The dataframe to use. Should contain samples in rows and features in columns.
Index values will be used for the sample_metadata index and column names will be used for feature_metadata index
exp : Experiment, optional
If not None, use sample and feature metadata from the experiment
@@ -419,7 +421,6 @@ def from_pandas(cls, df, exp=None):
feature_metadata = exp.feature_metadata.loc[df.columns.values, ]
cls = exp.__class__

# print(sample_metadata)
newexp = cls(df.values, sample_metadata, feature_metadata,
exp_metadata=exp_metadata, description=description, sparse=False)
return newexp
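
As a usage sketch only (not part of the commit), converting to pandas and back with a hand-built Experiment could look roughly like this; the toy data, metadata, and labels are invented for illustration:

    import numpy as np
    import pandas as pd
    import calour as ca

    # hand-built dense toy Experiment (data and labels invented for the example)
    data = np.array([[1., 2.], [3., 4.]])
    smd = pd.DataFrame({'group': ['a', 'b']}, index=['S1', 'S2'])
    fmd = pd.DataFrame(index=['F1', 'F2'])
    exp = ca.Experiment(data, smd, fmd, sparse=False)

    df = exp.to_pandas(sparse=False)              # samples in rows, features in columns
    new_exp = ca.Experiment.from_pandas(df, exp)  # reuse exp's sample/feature metadata
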
37 changes: 37 additions & 0 deletions calour/tests/data/test_classify.txt
@@ -0,0 +1,37 @@
setosa versicolor virginica Y_TRUE SAMPLE CV
0 0.0 1.0 0.0 versicolor 2 0
1 0.0 0.2 0.8 virginica 10 0
2 0.0 1.0 0.0 versicolor 11 0
3 0.0 1.0 0.0 versicolor 15 0
4 1.0 0.0 0.0 setosa 16 0
5 0.0 0.2 0.8 virginica 20 0
6 0.0 1.0 0.0 versicolor 22 0
7 0.0 0.4 0.6 virginica 27 0
8 1.0 0.0 0.0 setosa 29 0
9 1.0 0.0 0.0 setosa 30 0
10 0.0 0.6 0.4 virginica 31 0
11 1.0 0.0 0.0 setosa 35 0
12 0.0 0.0 1.0 virginica 1 1
13 1.0 0.0 0.0 setosa 5 1
14 0.0 1.0 0.0 versicolor 7 1
15 0.0 0.0 1.0 versicolor 8 1
16 1.0 0.0 0.0 setosa 12 1
17 0.0 1.0 0.0 versicolor 13 1
18 0.0 1.0 0.0 versicolor 14 1
19 0.0 0.4 0.6 versicolor 17 1
20 0.0 0.0 1.0 virginica 26 1
21 0.0 0.2 0.8 virginica 28 1
22 1.0 0.0 0.0 setosa 32 1
23 1.0 0.0 0.0 setosa 33 1
24 1.0 0.0 0.0 setosa 0 2
25 0.0 0.2 0.8 virginica 3 2
26 1.0 0.0 0.0 setosa 4 2
27 1.0 0.0 0.0 setosa 6 2
28 0.0 1.0 0.0 versicolor 9 2
29 0.0 0.8 0.2 versicolor 18 2
30 1.0 0.0 0.0 setosa 19 2
31 0.0 0.0 1.0 virginica 21 2
32 0.0 1.0 0.0 versicolor 23 2
33 0.0 1.0 0.0 versicolor 24 2
34 1.0 0.0 0.0 setosa 25 2
35 0.0 0.2 0.8 virginica 34 2
10 changes: 10 additions & 0 deletions calour/tests/data/test_regress.txt
@@ -0,0 +1,10 @@
Y_PRED Y_TRUE SAMPLE CV
0 137.2 75.0 1 0
1 148.0 141.0 2 0
2 148.0 63.0 7 0
3 134.0 135.0 4 1
4 134.0 138.0 6 1
5 127.2 110.0 8 1
6 117.4 151.0 0 2
7 119.8 206.0 3 2
8 119.8 97.0 5 2
2 changes: 1 addition & 1 deletion calour/tests/test_amplicon_experiment.py
@@ -9,7 +9,7 @@
from unittest import main
from copy import deepcopy

import pandas.util.testing as pdt
import pandas.testing as pdt
import numpy as np
import numpy.testing as npt

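
This import swap reflects pandas deprecating the private pandas.util.testing module in favour of the public pandas.testing, which exposes the same assertion helpers. A small example of the replacement import:

    import pandas as pd
    import pandas.testing as pdt  # replaces the deprecated pandas.util.testing

    left = pd.DataFrame({'a': [1, 2]})
    right = pd.DataFrame({'a': [1, 2]})

    # same helper as before, imported from the public location; raises on mismatch
    pdt.assert_frame_equal(left, right)
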
2 changes: 1 addition & 1 deletion calour/tests/test_dendrogram.py
@@ -11,7 +11,7 @@
import pandas as pd
from skbio import DistanceMatrix, TreeNode
from scipy.cluster.hierarchy import ward
import pandas.util.testing as pdt
import pandas.testing as pdt

from calour._dendrogram import (Dendrogram, UnrootedDendrogram,
SquareDendrogram)
2 changes: 1 addition & 1 deletion calour/tests/test_sorting.py
@@ -9,7 +9,7 @@
from unittest import main
from os.path import join

import pandas.util.testing as pdt
import pandas.testing as pdt
import numpy as np

import calour as ca
27 changes: 14 additions & 13 deletions calour/tests/test_training.py
@@ -13,7 +13,7 @@
from numpy.testing import assert_array_equal, assert_almost_equal
import numpy as np
import pandas as pd
import pandas.util.testing as pdt
import pandas.testing as pdt
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import KFold
@@ -49,7 +49,7 @@ def test_add_sample_metadata_as_features_dense(self):

def test_split_train_test(self):
train, test = self.test2_dense.split_train_test(
test_size=3, stratify='categorical', random_state=7)
test_size=3, stratify='categorical', shuffle=True, random_state=7)

assert_experiment_equal(
test, self.test2_dense.filter_ids(['S3', 'S8', 'S1'], axis='s'))
@@ -62,11 +62,12 @@ def test_regress(self):
y = diabetes.target[:9]
smd = pd.DataFrame({'diabetes': y})
exp = ca.Experiment(X, smd, sparse=False)
run = exp.regress('diabetes', KNeighborsRegressor(), KFold(3, random_state=0))
res = next(run)
obs = pd.read_table(join(self.test_data_dir, 'diabetes_pred.txt'), index_col=0)
run = exp.regress('diabetes', KNeighborsRegressor(), KFold(3, shuffle=True, random_state=0))
observed = next(run)
expected = pd.read_table(join(self.test_data_dir, 'test_regress.txt'), index_col=0)

# make sure the column order are the same for comparison
pdt.assert_frame_equal(res.sort_index(axis=1), obs.sort_index(axis=1))
pdt.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(axis=1))

def test_plot_scatter(self):
res = pd.read_table(join(self.test_data_dir, 'diabetes_pred.txt'), index_col=0)
@@ -93,11 +94,11 @@ def test_classify(self):
exp = ca.Experiment(X, smd, sparse=False)
run = exp.classify('plant', KNeighborsClassifier(),
predict='predict_proba',
cv=KFold(3, random_state=0))
res = next(run)
obs = pd.read_table(join(self.test_data_dir, 'iris_pred.txt'), index_col=0)
pdt.assert_frame_equal(res, obs)
# plot_roc(res)
cv=KFold(3, shuffle=True, random_state=0))
observed = next(run)
expected = pd.read_table(join(self.test_data_dir, 'test_classify.txt'), index_col=0)
pdt.assert_frame_equal(expected, observed)
# plot_roc(observed)
# from matplotlib import pyplot as plt
# plt.show()
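
The added shuffle=True arguments address scikit-learn's FutureWarning (introduced around 0.23) that random_state has no effect while shuffle is False. A minimal illustration of the updated KFold call on the bundled iris data:

    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    X, y = load_iris(return_X_y=True)

    # random_state only takes effect when shuffle=True; passing it with the default
    # shuffle=False is what triggered the FutureWarning this commit silences
    cv = KFold(n_splits=3, shuffle=True, random_state=0)
    for train_idx, test_idx in cv.split(X, y):
        print(len(train_idx), len(test_idx))
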

@@ -145,7 +146,7 @@ def test_plot_roc_warning(self):
'neg': prob,
'Y_TRUE': ['pos'] * 9 + ['neg'],
'CV': [0, 1] * 5})
# re-enable logging because it is disabled in parent setUp
# re-enable logging because it is disabled in the parent setUp
logging.disable(logging.NOTSET)
with self.assertLogs(level='WARNING') as cm:
plot_roc(result)
@@ -253,7 +254,7 @@ def test_sorted_stratified(self):
def test_rep_sorted_strtified(self):
n = self.y.shape[0]
for k in (3, 2):
ssk = RepeatedSortedStratifiedKFold(k, 2)
ssk = RepeatedSortedStratifiedKFold(n_splits=k, n_repeats=2)
for train, test in ssk.split(self.X, self.y):
# check the size of the test fold
ni = int(n / k)
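
Passing n_splits and n_repeats by name, as the test above now does, matches scikit-learn 0.23's deprecation of positional constructor arguments. A short example with the public RepeatedStratifiedKFold, which the custom sorted splitter mirrors:

    from sklearn.model_selection import RepeatedStratifiedKFold

    # scikit-learn 0.23 deprecates positional constructor arguments,
    # so every parameter is passed by keyword
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
    print(cv.get_n_splits())  # 3 splits x 2 repeats = 6
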
14 changes: 7 additions & 7 deletions calour/training.py
@@ -39,7 +39,7 @@
from sklearn.model_selection._split import check_cv, _RepeatedSplits
from sklearn.base import is_classifier, clone
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc, confusion_matrix
from scipy import interp, stats
from scipy import stats
from scipy.sparse import hstack
import pandas as pd
import numpy as np
@@ -166,7 +166,7 @@ class SortedStratifiedKFold(StratifiedKFold):
'''Stratified K-Fold cross validator.
Please see :class:`sklearn.model_selection.StratifiedKFold` for
documentation for parameters, etc. It is very similar to that
documentation for parameters, etc. It is very similar to that class
except this is for regression of numeric values.
This implementation basically assigns a unique label (int here) to
@@ -179,7 +179,7 @@ class SortedStratifiedKFold(StratifiedKFold):
RepeatedSortedStratifiedKFold
'''
def __init__(self, n_splits=3, shuffle=False, random_state=None):
super().__init__(n_splits, shuffle, random_state)
super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

def _sort_partition(self, y):
n = len(y)
@@ -207,10 +207,10 @@ class RepeatedSortedStratifiedKFold(_RepeatedSplits):
SortedStratifiedKFold
'''
def __init__(self, n_splits=5, n_repeats=10, random_state=None):
super().__init__(SortedStratifiedKFold, n_repeats, random_state, n_splits=n_splits)
super().__init__(SortedStratifiedKFold, n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)


def regress(exp: Experiment, field, estimator, cv=RepeatedSortedStratifiedKFold(3, 1), params=None):
def regress(exp: Experiment, field, estimator, cv=RepeatedSortedStratifiedKFold(n_splits=3, n_repeats=1), params=None):
'''Evaluate regression during cross validation.
Parameters
@@ -327,7 +327,7 @@ def plot_scatter(result, title='', cmap=None, cor=stats.pearsonr, cv=False, ax=N
return ax


def classify(exp: Experiment, fields, estimator, cv=RepeatedStratifiedKFold(3, 1),
def classify(exp: Experiment, fields, estimator, cv=RepeatedStratifiedKFold(n_splits=3, n_repeats=1),
predict='predict_proba', params=None):
'''Evaluate classification during cross validation.
@@ -656,7 +656,7 @@ def plot_roc(result, classes=None, title='ROC', cv=True, cmap=None, ax=None):
'have either no true positive or no negative samples in this '
'cross validation for the class %r' % (grp, cls))
continue
mean_tpr = interp(mean_fpr, fpr, tpr)
mean_tpr = np.interp(mean_fpr, fpr, tpr)
tprs.append(mean_tpr)
tprs[-1][0] = 0.0
roc_auc = auc(mean_fpr, mean_tpr)
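
In the hunk above, scipy.interp, which is only an alias for numpy.interp that SciPy has been deprecating, is replaced by the NumPy call. A stand-alone sketch of that interpolation step with invented fold values:

    import numpy as np

    # per-fold ROC interpolation onto a common FPR grid, as plot_roc does;
    # np.interp is a drop-in replacement for the scipy.interp alias
    fpr = np.array([0.0, 0.5, 1.0])
    tpr = np.array([0.0, 0.8, 1.0])
    mean_fpr = np.linspace(0, 1, 11)

    mean_tpr = np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    print(mean_tpr.round(2))
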
