Skip to content
This repository has been archived by the owner on Mar 24, 2021. It is now read-only.

Commit

Permalink
Working page-statistics update script
Browse files Browse the repository at this point in the history
  • Loading branch information
jcbashdown committed Nov 21, 2014
1 parent d3c91ca commit 91ec26d
Showing 1 changed file with 67 additions and 70 deletions.
137 changes: 67 additions & 70 deletions remove_duplicate_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,24 +29,6 @@
)


def had_default_id(config):
has = False
data = [i for i in storage._db[config['name']]
.find({
'_updated_at': {
'$lt': datetime.datetime(2014, 11, 13, 0, 0),
'$gte': datetime.datetime(2014, 11, 10, 0, 0)}
})]
if len(data) == 0:
return True
for data_point in data:
if 'humanId' in data_point and 'timeSpan' in data_point:
if(data_point['timeSpan'] in data_point['humanId']):
has = True
break
return has


def generate_id(data_point):
def value_id(value):
value_bytes = value.encode('utf-8')
Expand All @@ -68,23 +50,25 @@ def value_id(value):
{'data_group': u'govuk-info', 'data_type': u'page-statistics'}
]

if __name__ == '__main__':
total_changed = 0
total_okay_records = 0
no_timespan = 0
no_humanid = 0
i_dont_know_what_this_is = 0
count_of_data_sets_that_may_intentionally_miss_proper_ids = 0
data_sets_that_may_intentionally_miss_proper_ids = []
data_set_changes_dicts = []
may_never_have_had_id_with_timespan = []
incorrectly_formatted_ids = []
capped_collection_error = []
already_fixed = []
timespans_of_okay_records = Set([])
total_changed = 0
total_okay_records = 0
no_timespan = 0
no_humanid = 0
i_dont_know_what_this_is = 0
count_of_data_sets_that_may_intentionally_miss_proper_ids = 0
data_sets_that_may_intentionally_miss_proper_ids = []
data_set_changes_dicts = []
may_never_have_had_id_with_timespan = []
incorrectly_formatted_ids = []
capped_collection_error = []
already_fixed = []
timespans_of_okay_records = Set([])


def main_function():

for group_and_type in groups_and_types:
print "DATA SET {}".format(group_and_type)

def get_config_from_admin_app():
return admin_api.get_data_set(
group_and_type['data_group'],
Expand All @@ -93,31 +77,30 @@ def get_config_from_admin_app():
def current_mongo_collection():
return storage._db[data_set_config['name']]

def data_query():
def data_query(skip=0):
return current_mongo_collection().find({
'_updated_at': {
'$gte': datetime.datetime(2014, 11, 13, 0, 0)}
}, limit=100)

def get_data_set_data_since_change():
return [i for i in data_query()]

print "getting config"
data_set_config = get_config_from_admin_app()

print "getting data"
import pdb; pdb.set_trace()
data = get_data_set_data_since_change()
print "data got"

changes_in_collection = 0
okay_records_in_collections = 0

print "what we doing?"
if had_default_id(data_set_config):
print "we doing"
'$gte': datetime.datetime(2014, 11, 13, 0, 0),
'$lt': datetime.datetime(2014, 11, 21, 0, 0)}
}, limit=100, skip=skip, sort=[("_updated_at", pymongo.ASCENDING)])

def change_data(data):
#print "top2==================="
global total_changed
global total_okay_records
global no_timespan
global no_humanid
global i_dont_know_what_this_is
global count_of_data_sets_that_may_intentionally_miss_proper_ids
global data_sets_that_may_intentionally_miss_proper_ids
global data_set_changes_dicts
global may_never_have_had_id_with_timespan
global incorrectly_formatted_ids
global capped_collection_error
global already_fixed
global timespans_of_okay_records
for data_point in data:
print "DATA POINT {}".format(group_and_type)

def no_timespan_in_id():
return data_point['timeSpan'] not in data_point['humanId']

Expand All @@ -127,32 +110,42 @@ def update_with_ids(the_id, humanId):
try:
current_mongo_collection().remove(
{'_id': data_point['_id']})
print "rem"
except pymongo.errors.OperationFailure:
capped_collection_error.append(data_set_config['name'])
update = False
if update:
print "okat"
new_data = data_point
new_data['_id'] = the_id
new_data['humanId'] = humanId
try:
current_mongo_collection().insert(new_data)
print "ins"
except pymongo.errors.DuplicateKeyError:
update = False
already_fixed.append(data_set_config['name'])
return update

if 'humanId' in data_point and 'timeSpan' in data_point:
if no_timespan_in_id():
print "==================="
print total_changed
print total_okay_records
print "==================="
(the_id, humanId) = generate_id(data_point)
if "_{}".format(
data_point['timeSpan']) not in humanId:
incorrectly_formatted_ids.append(humanId)
if update_with_ids(the_id, humanId):
total_changed += 1
changes_in_collection += 1
print humanId.encode('utf-8')
else:
#print "==================="
#print total_changed
#print total_okay_records
#print "==================="
total_okay_records += 1
okay_records_in_collections += 1
timespans_of_okay_records.add(data_point['timeSpan'])
else:
if 'humanId' not in data_point:
Expand All @@ -161,20 +154,24 @@ def update_with_ids(the_id, humanId):
no_timespan += 1
else:
i_dont_know_what_this_is += 1
else:
may_never_have_had_id_with_timespan.append(data_set_config['name'])

data_set_changes_dicts.append({data_set_config['name']: changes_in_collection})
def get_and_update_data_set_data_since_change():
skip = 0
data = [i for i in data_query(skip)]
while(data):
#print "==========================="
#print skip
#print "==========================="
change_data(data)
skip += 100
data = [i for i in data_query(skip)]

def may_intentionally_miss_proper_ids():
return okay_records_in_collections == 0 and not len(data) == 0
print "getting config"
data_set_config = get_config_from_admin_app()
print "got"

if may_intentionally_miss_proper_ids():
data_sets_that_may_intentionally_miss_proper_ids.append(data_set_config['name'])
count_of_data_sets_that_may_intentionally_miss_proper_ids += 1
get_and_update_data_set_data_since_change()

# save some memory?
data = None
print "COUNTS"
print {
'total_changed': total_changed,
Expand All @@ -190,10 +187,7 @@ def may_intentionally_miss_proper_ids():
"they may correctly have no proper id but it seems unlikely, they get overwritten during a run, check the config but we can be pretty sure it's okay")
print data_sets_that_may_intentionally_miss_proper_ids
print "data_set_changes_dicts=================="
#print json.dumps(data_set_changes_dicts, indent=2)
print data_set_changes_dicts
print "may_never_have_had_id_with_timespan=================="
print may_never_have_had_id_with_timespan
print "incorrectly_formatted_ids=================="
print incorrectly_formatted_ids
print len(incorrectly_formatted_ids)
Expand All @@ -203,3 +197,6 @@ def may_intentionally_miss_proper_ids():
print "already_fixed=================="
print already_fixed
print len(already_fixed)

if __name__ == '__main__':
main_function()

0 comments on commit 91ec26d

Please sign in to comment.