Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flake8 & PEP257, Part II #4

Merged
merged 1 commit into from Dec 3, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
80 changes: 43 additions & 37 deletions enwiki_data/read.py
@@ -1,27 +1,38 @@
from sys import argv
import numpy as np

from editquality.feature_lists import enwiki
from sigclust.sigclust import sigclust


def read_data(f, features, rids=False):
"""
Expects f to have tsv format.
Expect f to have tsv format.

Reads a set of features and a label from a file one row at a time.
rids says to expect the first column to be id numbers.
"""
for line in f: # Implicitly splits rows on \n
parts = line.strip().split("\t") # Splits columns on \t
# Implicitly splits rows on \n
for line in f:
# Splits columns on \t
parts = line.strip().split("\t")

if rids:
rev_id = parts[0]
parts = parts[1:]

values = parts[:-1] # All but the last column are feature values.
label = parts[-1] # Last column is a label

# All but the last column are feature values.
values = parts[:-1]

# Last column is a label
label = parts[-1]

feature_values = []
for feature, value in zip(features, values):
for feature, value in zip(features, values):
# Each feature knows its type and will perform the right conversion

if feature.returns == bool:
# Booleans are weird. bool("False") == True, so you need to string match "True"
# Booleans are weird. bool("False") == True.
# so you need to string match "True"
feature_values.append(value == "True")
else:
feature_values.append(feature.returns(value))
Expand All @@ -30,20 +41,19 @@ def read_data(f, features, rids=False):
row.append(label == "True")
if rids:
row.insert(0, int(rev_id))

yield row


yield row

from editquality.feature_lists import enwiki
import numpy as np
from sigclust.sigclust import sigclust

def get_mat(file_name, rids = True):
def get_mat(file_name, rids=True):
"""
Reads data in file_name into a np.array.
When rids == False, assumes all columns of the file from file_name are feature data except for the last column, which is assumed to be labels.
When rids==True assumes in addition that the first column is rev_ids and returns a *tuple* of that column of ids together with the usual output np.array
Read data in file_name into a np.array.

When rids == False, assumes all columns of the file from file_name are
feature data except for the last column, which is assumed to be labels.
When rids==True assumes in addition that the first column is rev_ids
and returns a *tuple* of that column of ids together with the usual
output np.array
"""

f = open(file_name)
Expand All @@ -52,51 +62,47 @@ def get_mat(file_name, rids = True):

mat = np.array(rows).astype(float)

#Last column is the label
# Last column is the label
labels = mat[:, -1]
result = mat[:, :-1]

#if rids then expect first column to be rev_ids
# If rids then expect first column to be rev_ids
if rids:
rid_col = result[:, 0]
return rid_col, result[:, 1:], labels
else:
return result, labels


def sig_test1(shape, iters = 20):
def sig_test1(shape, iters=20):
result = np.zeros(iters)
for i in np.arange(iters):
X = np.random.rand(shape[0], shape[1])
p = sigclust(X, verbose = False)[0]
p = sigclust(X, verbose=False)[0]
result[i] = p
return result




def RSC(file, rids=True, verbose = True, scale = False):
rid_col, X = get_mat(file, rids = rids)
def RSC(file, rids=True, verbose=True, scale=False):
rid_col, X = get_mat(file, rids=rids)
while(True):
p, clust = sigclust(X, verbose = verbose, scale = scale)
p, clust = sigclust(X, verbose=verbose, scale=scale)
print("p-value: %f" % p)

s = sum(clust)
n_samps = X.shape[0]
print("The clusters have sizes %d, %d" %
(n_samps - s, s))
in0 = input("Remove all points in smallest cluster and re-run sigclust? (Enter 'n' to terminate.):")

in0 = input("Remove all points in smallest cluster and re-run "
"sigclust? (Enter 'n' to terminate.):")

if in0 is 'n':
break


sec_small = s < (n_samps / 2)
print("Removing %s cluster (of size %d)." %
("SECOND" if sec_small else "FIRST",
s if sec_small else n_samps - s))



f_clust = clust.astype(bool)
if sec_small:
Expand All @@ -107,7 +113,7 @@ def RSC(file, rids=True, verbose = True, scale = False):
print(to_remove)
print("These samples correspond to the following rev ids:")
rem_rids = rid_col[to_remove]

print(rem_rids)
X = np.delete(X, to_remove, axis = 0)

X = np.delete(X, to_remove, axis=0)
91 changes: 40 additions & 51 deletions enwiki_data/similarity.py
Expand Up @@ -3,53 +3,51 @@
from sigclust.sigclust import sigclust


"""Data File Name"""
DFILE = "enwiki_data/data2.tsv"
"""Data File Name"""

"""A predicate to define a default
subset of the set of row indices"""

def pos_labels(i):
"""A predicate to define a default subset of the set of row indices."""
return labels[i] == 1


SUB_PRED = pos_labels
"""
Set a one place predicate (boolean valued)
defining the subpopulation of interest.
"""

SUB_PRED = pos_labels

"""Number of iterations for the monte carlo
step of the sigclust algorithm to be run on
the subset (_S) and whole population (_B)"""

ITERS_S = 50
ITERS_B = 100
"""Number of iterations for the monte carlo
step of the sigclust algorithm to be run on
the subset (_S) and whole population (_B)"""


ids, features, labels = get_mat(DFILE)

#Sets of row indices for the whole set
# Sets of row indices for the whole set
# and for the subset of interest.

popul_ind = (np.arange(ids.shape[0]))
subpop_bool = SUB_PRED(popul_ind)
subpop_ind = popul_ind[subpop_bool]

#Sizes of two populations
# Sizes of two populations
BIG = ids.shape[0]
SMALL = subpop_ind.shape[0]

# The incoming data restricted
# to the subpopulation
ids_sub = ids[subpop_bool]
features_sub = features[subpop_bool,:]
features_sub = features[subpop_bool, :]
labels_sub = labels[subpop_bool]

#First we cluster the whole population
# First we cluster the whole population
print("Computing p-value for population.")

p, clust = sigclust(features,
mc_iters = ITERS_B)
p, clust = sigclust(features, mc_iters=ITERS_B)

print("p-value for population: %f" % p)
clust0_bool = clust == 0
Expand All @@ -59,20 +57,16 @@ def pos_labels(i):
clust1_ind = popul_ind[clust1_bool]



ss_of_clust0_bool = clust0_bool & subpop_bool
ss_of_clust1_bool = clust1_bool & subpop_bool
"""
The clusters of the whole population
determine two subsets (their respective
determine two subsets (their respective
intersections with the subpopulation)"""
ss_of_clust0_bool = clust0_bool & subpop_bool
ss_of_clust1_bool = clust1_bool & subpop_bool



#Now to cluster the smaller set directly
# Now to cluster the smaller set directly
print("Computing p-value for sub-population.")
p_sub, clust_sub = sigclust(features_sub,
mc_iters = ITERS_S)
p_sub, clust_sub = sigclust(features_sub, mc_iters=ITERS_S)
print("p-value for subpopulation: %f" % p_sub)

# ndarrays to hold boolean representation of
Expand All @@ -85,11 +79,11 @@ def pos_labels(i):
subclust0_bool[i] = True
else:
subclust1_bool[i] = True

subclust0_ind = popul_ind[subclust0_bool]
subclust1_ind = popul_ind[subclust1_bool]

# Now want to test the extent to which
# Now want to test the extent to which
# subclust{0,1}_ind contained in clust{0,1}_ind
inter_0_0 = subclust0_bool & ss_of_clust0_bool
inter_0_1 = subclust0_bool & ss_of_clust1_bool
Expand All @@ -104,50 +98,45 @@ def pos_labels(i):
jac_1_0 = inter_1_0.sum() / union_1_0.sum()
jac_1_1 = inter_1_1.sum() / union_1_1.sum()

print("Clustering and then taking the \
respective intersections of the clusters \
with the subset in question gives a \
partition of size (%d, %d)." %
(ss_of_clust0_bool.astype(int).sum(),
ss_of_clust1_bool.astype(int).sum()))
print("Clustering and then taking the respective intersections "
"of the clusters with the subset in question gives a "
"partition of size (%d, %d)." %
(ss_of_clust0_bool.astype(int).sum(),
ss_of_clust1_bool.astype(int).sum()))

print("On the other hand, \
clustering the subset in question \
directly gives a partition of size (%d, %d)." %
print("On the other hand, clustering the subset in question "
"directly gives a partition of size (%d, %d)." %
(subclust0_bool.astype(int).sum(),
subclust1_bool.astype(int).sum()))
subclust1_bool.astype(int).sum()))

print("""
These two partitions determine
a four member partition refinement.
Specifically, letting
These two partitions determine
a four member partition refinement.
Specifically, letting
S = the defined subset of indices
A = The intersection of S with
the first outer cluster
B = The intersection of S with
the second outer cluster
C = First (inner) cluster of S
D = Second (inner) cluster of S
The four possible intersections have sizes
The four possible intersections have sizes
|A&C|: %d, |A&D|: %d,
|B&C|: %d, |B&D|: %d.""" %
(inter_0_0.astype(int).sum(),
inter_0_1.astype(int).sum(),
inter_1_0.astype(int).sum(),
(inter_0_0.astype(int).sum(),
inter_0_1.astype(int).sum(),
inter_1_0.astype(int).sum(),
inter_1_1.astype(int).sum()))




print("""The four possible unions have sizes
print("""The four possible unions have sizes
|A|C|: %d, |A|D|: %d,
|B|C|: %d, |B|D|: %d.""" %
(union_0_0.astype(int).sum(),
union_0_1.astype(int).sum(),
union_1_0.astype(int).sum(),
(union_0_0.astype(int).sum(),
union_0_1.astype(int).sum(),
union_1_0.astype(int).sum(),
union_1_1.astype(int).sum()))

print("""And so the four corresponding
print("""And so the four corresponding
Jaccard indices are
J(A,C): %f, J(A,D): %f,
J(B,C): %f, J(B,D): %f""" %
Expand Down