Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flake8 & PEP257, Part II #4

Merged
merged 1 commit into from Dec 3, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
80 changes: 43 additions & 37 deletions enwiki_data/read.py
@@ -1,27 +1,38 @@
from sys import argv
import numpy as np

from editquality.feature_lists import enwiki
from sigclust.sigclust import sigclust


def read_data(f, features, rids=False):
"""
Expects f to have tsv format.
Expect f to have tsv format.

Reads a set of features and a label from a file one row at a time.
rids says to expect the first column to be id numbers.
"""
for line in f: # Implicitly splits rows on \n
parts = line.strip().split("\t") # Splits columns on \t
# Implicitly splits rows on \n
for line in f:
# Splits columns on \t
parts = line.strip().split("\t")

if rids:
rev_id = parts[0]
parts = parts[1:]

values = parts[:-1] # All but the last column are feature values.
label = parts[-1] # Last column is a label

# All but the last column are feature values.
values = parts[:-1]

# Last column is a label
label = parts[-1]

feature_values = []
for feature, value in zip(features, values):
for feature, value in zip(features, values):
# Each feature knows its type and will perform the right conversion

if feature.returns == bool:
# Booleans are weird. bool("False") == True, so you need to string match "True"
# Booleans are weird. bool("False") == True.
# so you need to string match "True"
feature_values.append(value == "True")
else:
feature_values.append(feature.returns(value))
Expand All @@ -30,20 +41,19 @@ def read_data(f, features, rids=False):
row.append(label == "True")
if rids:
row.insert(0, int(rev_id))

yield row


yield row

from editquality.feature_lists import enwiki
import numpy as np
from sigclust.sigclust import sigclust

def get_mat(file_name, rids = True):
def get_mat(file_name, rids=True):
"""
Reads data in file_name into a np.array.
When rids == False, assumes all columns of the file from file_name are feature data except for the last column, which is assumed to be labels.
When rids==True assumes in addition that the first column is rev_ids and returns a *tuple* of that column of ids together with the usual output np.array
Read data in file_name into a np.array.

When rids == False, assumes all columns of the file from file_name are
feature data except for the last column, which is assumed to be labels.
When rids==True assumes in addition that the first column is rev_ids
and returns a *tuple* of that column of ids together with the usual
output np.array
"""

f = open(file_name)
Expand All @@ -52,51 +62,47 @@ def get_mat(file_name, rids = True):

mat = np.array(rows).astype(float)

#Last column is the label
# Last column is the label
labels = mat[:, -1]
result = mat[:, :-1]

#if rids then expect first column to be rev_ids
# If rids then expect first column to be rev_ids
if rids:
rid_col = result[:, 0]
return rid_col, result[:, 1:], labels
else:
return result, labels


def sig_test1(shape, iters = 20):
def sig_test1(shape, iters=20):
result = np.zeros(iters)
for i in np.arange(iters):
X = np.random.rand(shape[0], shape[1])
p = sigclust(X, verbose = False)[0]
p = sigclust(X, verbose=False)[0]
result[i] = p
return result




def RSC(file, rids=True, verbose = True, scale = False):
rid_col, X = get_mat(file, rids = rids)
def RSC(file, rids=True, verbose=True, scale=False):
rid_col, X = get_mat(file, rids=rids)
while(True):
p, clust = sigclust(X, verbose = verbose, scale = scale)
p, clust = sigclust(X, verbose=verbose, scale=scale)
print("p-value: %f" % p)

s = sum(clust)
n_samps = X.shape[0]
print("The clusters have sizes %d, %d" %
(n_samps - s, s))
in0 = input("Remove all points in smallest cluster and re-run sigclust? (Enter 'n' to terminate.):")

in0 = input("Remove all points in smallest cluster and re-run "
"sigclust? (Enter 'n' to terminate.):")

if in0 is 'n':
break


sec_small = s < (n_samps / 2)
print("Removing %s cluster (of size %d)." %
("SECOND" if sec_small else "FIRST",
s if sec_small else n_samps - s))



f_clust = clust.astype(bool)
if sec_small:
Expand All @@ -107,7 +113,7 @@ def RSC(file, rids=True, verbose = True, scale = False):
print(to_remove)
print("These samples correspond to the following rev ids:")
rem_rids = rid_col[to_remove]

print(rem_rids)
X = np.delete(X, to_remove, axis = 0)

X = np.delete(X, to_remove, axis=0)
91 changes: 40 additions & 51 deletions enwiki_data/similarity.py
Expand Up @@ -3,53 +3,51 @@
from sigclust.sigclust import sigclust


"""Data File Name"""
DFILE = "enwiki_data/data2.tsv"
"""Data File Name"""

"""A predicate to define a default
subset of the set of row indices"""

def pos_labels(i):
"""A predicate to define a default subset of the set of row indices."""
return labels[i] == 1


SUB_PRED = pos_labels
"""
Set a one place predicate (boolean valued)
defining the subpopulation of interest.
"""

SUB_PRED = pos_labels

"""Number of iterations for the monte carlo
step of the sigclust algorithm to be run on
the subset (_S) and whole population (_B)"""

ITERS_S = 50
ITERS_B = 100
"""Number of iterations for the monte carlo
step of the sigclust algorithm to be run on
the subset (_S) and whole population (_B)"""


ids, features, labels = get_mat(DFILE)

#Sets of row indices for the whole set
# Sets of row indices for the whole set
# and for the subset of interest.

popul_ind = (np.arange(ids.shape[0]))
subpop_bool = SUB_PRED(popul_ind)
subpop_ind = popul_ind[subpop_bool]

#Sizes of two populations
# Sizes of two populations
BIG = ids.shape[0]
SMALL = subpop_ind.shape[0]

# The incoming data restricted
# to the subpopulation
ids_sub = ids[subpop_bool]
features_sub = features[subpop_bool,:]
features_sub = features[subpop_bool, :]
labels_sub = labels[subpop_bool]

#First we cluster the whole population
# First we cluster the whole population
print("Computing p-value for population.")

p, clust = sigclust(features,
mc_iters = ITERS_B)
p, clust = sigclust(features, mc_iters=ITERS_B)

print("p-value for population: %f" % p)
clust0_bool = clust == 0
Expand All @@ -59,20 +57,16 @@ def pos_labels(i):
clust1_ind = popul_ind[clust1_bool]



ss_of_clust0_bool = clust0_bool & subpop_bool
ss_of_clust1_bool = clust1_bool & subpop_bool
"""
The clusters of the whole population
determine two subsets (their respective
determine two subsets (their respective
intersections with the subpopulation)"""
ss_of_clust0_bool = clust0_bool & subpop_bool
ss_of_clust1_bool = clust1_bool & subpop_bool



#Now to cluster the smaller set directly
# Now to cluster the smaller set directly
print("Computing p-value for sub-population.")
p_sub, clust_sub = sigclust(features_sub,
mc_iters = ITERS_S)
p_sub, clust_sub = sigclust(features_sub, mc_iters=ITERS_S)
print("p-value for subpopulation: %f" % p_sub)

# ndarrays to hold boolean representation of
Expand All @@ -85,11 +79,11 @@ def pos_labels(i):
subclust0_bool[i] = True
else:
subclust1_bool[i] = True

subclust0_ind = popul_ind[subclust0_bool]
subclust1_ind = popul_ind[subclust1_bool]

# Now want to test the extent to which
# Now want to test the extent to which
# subclust{0,1}_ind contained in clust{0,1}_ind
inter_0_0 = subclust0_bool & ss_of_clust0_bool
inter_0_1 = subclust0_bool & ss_of_clust1_bool
Expand All @@ -104,50 +98,45 @@ def pos_labels(i):
jac_1_0 = inter_1_0.sum() / union_1_0.sum()
jac_1_1 = inter_1_1.sum() / union_1_1.sum()

print("Clustering and then taking the \
respective intersections of the clusters \
with the subset in question gives a \
partition of size (%d, %d)." %
(ss_of_clust0_bool.astype(int).sum(),
ss_of_clust1_bool.astype(int).sum()))
print("Clustering and then taking the respective intersections "
"of the clusters with the subset in question gives a "
"partition of size (%d, %d)." %
(ss_of_clust0_bool.astype(int).sum(),
ss_of_clust1_bool.astype(int).sum()))

print("On the other hand, \
clustering the subset in question \
directly gives a partition of size (%d, %d)." %
print("On the other hand, clustering the subset in question "
"directly gives a partition of size (%d, %d)." %
(subclust0_bool.astype(int).sum(),
subclust1_bool.astype(int).sum()))
subclust1_bool.astype(int).sum()))

print("""
These two partitions determine
a four member partition refinement.
Specifically, letting
These two partitions determine
a four member partition refinement.
Specifically, letting
S = the defined subset of indices
A = The intersection of S with
the first outer cluster
B = The intersection of S with
the second outer cluster
C = First (inner) cluster of S
D = Second (inner) cluster of S
The four possible intersections have sizes
The four possible intersections have sizes
|A&C|: %d, |A&D|: %d,
|B&C|: %d, |B&D|: %d.""" %
(inter_0_0.astype(int).sum(),
inter_0_1.astype(int).sum(),
inter_1_0.astype(int).sum(),
(inter_0_0.astype(int).sum(),
inter_0_1.astype(int).sum(),
inter_1_0.astype(int).sum(),
inter_1_1.astype(int).sum()))




print("""The four possible unions have sizes
print("""The four possible unions have sizes
|A|C|: %d, |A|D|: %d,
|B|C|: %d, |B|D|: %d.""" %
(union_0_0.astype(int).sum(),
union_0_1.astype(int).sum(),
union_1_0.astype(int).sum(),
(union_0_0.astype(int).sum(),
union_0_1.astype(int).sum(),
union_1_0.astype(int).sum(),
union_1_1.astype(int).sum()))

print("""And so the four corresponding
print("""And so the four corresponding
Jaccard indices are
J(A,C): %f, J(A,D): %f,
J(B,C): %f, J(B,D): %f""" %
Expand Down