Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 223 lines (211 sloc) 7.93 KB
#!/usr/bin/python
###
### cleandirtyf - data cleaning tools
### Copyright (C) <2010>
### <Blanca A. Vargas Govea> <blanca.vg@gmail.com>
###
### This program is free software: you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with this program. If not, see <http://www.gnu.org/licenses/>.
###
import sys
import re
import argparse
import logging
import logging.config
import time
# Configure the logging system from the "loggingf.config" file. The path is
# relative, so it is resolved against the current working directory, and the
# call runs as soon as this module is loaded.
logging.config.fileConfig("loggingf.config")
# Module-wide logger (named "clean") shared by all functions in this file.
logger = logging.getLogger("clean")
def parse():
    """Parse the command line into the argument list handed to clean().

    Command line:
        ifilename   name of the file to clean (positional)
        -c COLS     comma-separated integer column indices, e.g. ``-c 0,2,5``;
                    when the option is repeated only the last occurrence is used
        -s SEP      separator keyword: ``comma`` (default) or ``semic``

    Returns:
        A list laid out as ``[cols, separator, ifilename]`` when -c was given,
        or ``[separator, ifilename]`` otherwise, where ``cols`` is a list of
        ints and ``separator`` is ``","`` or ``";"``.

    Exits the program (via sys.exit) when the column list is not made of
    integers or the separator keyword is unknown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('ifilename', action='store', help='File name')
    parser.add_argument('-c', action='append', dest='columns',
                        default=[],
                        help='Add repeated values to a list',
                        )
    parser.add_argument('-s', action='store', dest='separator',
                        help='Store a separator')
    results = parser.parse_args()

    arglist = []
    if not results.columns:
        print("The whole file will be cleaned (default)")
        logger.info("The whole file will be cleaned (default)")
    else:
        # Only the last -c occurrence is honoured.
        strcol = results.columns.pop()
        try:
            # int() instead of eval(): the text comes straight from the
            # command line and must never be executed as code.
            cols = [int(field) for field in strcol.split(',')]
        except ValueError:
            sys.exit("Not a valid column list. "
                     "Expected comma-separated integers, e.g. -c 0,2,5")
        arglist.append(cols)
        logger.info("Cleaning only the listed columns: " + str(cols))

    print("Separator:  %s" % (results.separator,))
    if not results.separator or results.separator == "comma":
        print("Using the default separator (,)")
        arglist.append(",")
    elif results.separator == "semic":
        arglist.append(";")
    else:
        sys.exit("Not a valid separator. Possible values: comma | semic")

    arglist.append(results.ifilename)
    print("Arglist %s" % (arglist,))
    return arglist
def cleanav(pattern, attval, cleanrow):
    """Clean one attribute value and append the result to *cleanrow*.

    Whitespace runs in *attval* are collapsed to single spaces (leading and
    trailing whitespace is dropped), then every match of the compiled regex
    *pattern* is deleted. The cleaned string is appended to *cleanrow*, which
    is modified in place and also returned.
    """
    # Normalise whitespace first so the pattern only ever sees single spaces.
    collapsed = ' '.join(attval.split())
    cleaned = pattern.sub('', collapsed)
    cleanrow.append(cleaned)
    logger.debug("dirty: " + attval + "\tclean: " + cleaned)
    logger.debug("cleanrow = " + str(cleanrow))
    return cleanrow
def out(INFILE, allrows, empty, longer, longerow, shorter, shorterow, dirty):
    """Report the statistics of a cleaning run.

    The summary is printed to stdout, written as readable text to
    ``../reports/rep_<INFILE>`` and as a one-record CSV (with header line) to
    ``../data/rep_<INFILE>``.

    Parameters:
        INFILE    -- name of the cleaned input file (used in paths and output)
        allrows   -- number of lines read, header included
        empty     -- number of rows discarded as empty
        longer    -- number of rows discarded for having too many fields
        longerow  -- positions of the too-long rows (only logged)
        shorter   -- number of rows discarded for having too few fields
        shorterow -- positions of the too-short rows (only logged)
        dirty     -- number of attribute values passed through the cleaner

    Exits the program (via sys.exit) when either report file cannot be
    created.
    """
    REPORT = "../reports/rep_" + INFILE
    CSVREPORT = "../data/rep_" + INFILE

    hreport = _create_report_file(REPORT,
                                  "Error: can't create the report file",
                                  "OK: Report file created")
    with hreport:
        csvreport = _create_report_file(CSVREPORT,
                                        "Error: can't create the csv file",
                                        "OK: csv file created")
        with csvreport:
            # Drop the header from the count; never go below zero for an
            # input that had no lines at all.
            allrows = max(allrows - 1, 0)
            removed = empty + longer + shorter
            usefulrows = allrows - removed
            # A header-only (or empty) input has no data rows: report 0%
            # rather than dividing by zero. Floor division keeps the integer
            # percentage on both Python 2 and Python 3.
            if allrows > 0:
                pcgusefulrows = (usefulrows * 100) // allrows
            else:
                pcgusefulrows = 0

            before = [
                ("===== Input file (before cleaning) = ", INFILE),
                ("Examples = ", allrows),
                ("Empty rows (removed) = ", empty),
                ("Longer rows (removed) = ", longer),
                ("Shorter rows (removed) = ", shorter),
                ("Removed rows = ", removed),
                ("Dirty values (cleaned) = ", dirty),
            ]

            ########## write to stdout
            for label, value in before:
                print("%s %s" % (label, value))
            print("===== Output file (after cleaning)")
            print("Useful rows =  %s" % (usefulrows + 1,))  # last eol
            print("Useful file =  %s %%" % (pcgusefulrows,))

            ########## write to report file
            for label, value in before:
                hreport.write(label + str(value) + "\n")
            hreport.write("===== Output file (after cleaning)" + "\n")
            hreport.write("Useful rows = " + str(usefulrows + 1) + "\n")
            hreport.write("Useful file = " + str(pcgusefulrows) + "%" + "\n")

            ########## write to csv
            csvreport.write(
                "file,raw,emptyl,long,short,removed,dirtych,usfl,pcgusfl\n")
            record = [INFILE, allrows, empty, longer, shorter, removed,
                      dirty, usefulrows + 1, pcgusefulrows]
            csvreport.write(",".join(str(field) for field in record) + "\n")

            ########## write to logger
            logger.debug("Longer rows (removed) = " + str(longer)
                         + " Rows = " + str(longerow) + "\n")
            logger.debug("Shorter rows (removed) = " + str(shorter)
                         + " Rows = " + str(shorterow) + "\n")
    return


def _create_report_file(path, error_msg, ok_msg):
    """Open *path* for writing; announce *ok_msg* or exit with *error_msg*."""
    try:
        handle = open(path, "w")
    except IOError:
        logger.error(error_msg)
        sys.exit(error_msg)
    print(ok_msg)
    logger.info(ok_msg)
    return handle
def clean(arglist):
    """Clean the file described by *arglist* and report the statistics.

    *arglist* is consumed from the end (it is emptied in place): file name,
    then separator, then an optional list of column indices. The input is
    read from ``../data/<name>`` and the cleaned copy is written to
    ``../data/clean_<name>``.

    The first line fixes the expected number of fields. Any later line with a
    different field count is skipped and counted as empty (exactly one
    field), longer or shorter. For every kept line, each selected value is
    passed through cleanav(), which collapses whitespace and removes every
    character other than ASCII letters, digits, colon and space. Cleaned
    rows are always joined with a comma, whatever the input separator was.

    The ``dirty`` counter counts every value sent through the cleaner,
    whether or not cleaning changed it. The totals are finally handed to
    out().

    Exits the program (via sys.exit) when the input cannot be opened or the
    output cannot be created.
    """
    pattern = re.compile('[^a-zA-Z0-9: ]')
    dirtyf = arglist.pop()
    separator = arglist.pop()
    colist = []
    print("colist =  %s" % (colist,))
    if arglist:
        colist = arglist.pop()
        print("colist =  %s" % (colist,))

    INFILE = "../data/" + dirtyf
    OUTFILE = "../data/clean_" + dirtyf

    try:
        ifile = open(INFILE, "r")
    except IOError:
        logger.error("Error: can't find file or read data")
        sys.exit("Error: can't find file or read data")
    logger.info("OK: opening file")

    lcount = 0        # lines read so far (also the current line position)
    attnumber = 0     # field count of the header line
    empty = 0         # empty rows counter
    longer = 0        # longer rows counter
    shorter = 0       # shorter rows counter
    dirty = 0         # attribute values sent through the cleaner
    longerow = []
    shorterow = []

    # The with-blocks close both handles on every exit path, including
    # sys.exit and unexpected errors while processing a line.
    with ifile:
        try:
            ofile = open(OUTFILE, "w")
        except IOError:
            logger.error("Error: can't create the cleaned file")
            sys.exit("Error: can't create the cleaned file")
        print("OK: Cleaned file created")
        logger.info("OK: Cleaned file created")

        with ofile:
            for line in ifile:
                print(lcount)
                # Strip trailing whitespace (including the newline), then
                # split into fields: row[0], row[1], ...
                row = line.rstrip().split(separator)
                rowlen = len(row)
                if lcount == 0:
                    attnumber = rowlen  # attribute number in header
                logger.debug("Attribute number: " + str(rowlen))

                if rowlen != attnumber:
                    logger.debug("Wrong attribute number, skipping example")
                    if rowlen == 1:
                        empty += 1
                    elif rowlen > attnumber:
                        longer += 1
                        longerow.append(lcount)
                    else:
                        shorter += 1
                        shorterow.append(lcount)
                else:
                    # Whole row by default, otherwise only the chosen columns.
                    if colist:
                        selected = [row[index] for index in colist]
                    else:
                        selected = row
                    newrow = []
                    for attval in selected:
                        newrow = cleanav(pattern, attval, newrow)
                        dirty += 1
                    ofile.write(",".join(newrow) + '\n')

                lcount += 1

    out(dirtyf, lcount, empty, longer, longerow, shorter, shorterow, dirty)
    return
def main():
    """Run the cleaner from the command line and report the elapsed time.

    Parses the arguments with parse(), cleans the file with clean(), then
    prints and logs the wall-clock duration of the run in seconds.
    """
    # time.time() measures wall-clock time on every platform; time.clock()
    # meant CPU time on Unix and no longer exists in Python 3.8+.
    start = time.time()
    arglist = parse()
    clean(arglist)
    elapsed = time.time() - start
    print("Elapsed time:  %s" % (elapsed,))
    logger.info("Elapsed time: " + str(elapsed))


# Run the cleaner only when this file is executed as a script.
if __name__ == "__main__":
    main()