# GFF phage file validation program

### README:

This program will check your GFF file. It checks for proper format and for use of proper characters (i.e. no illegal characters are used).

**Before you run the program:** copy your GFF file into the "input" folder on the Desktop. Make sure you know the EXACT name of the file. It is best to avoid spaces in filenames (rename your files if necessary).

**To run the program:** click anywhere in the code box below, near the line "#STEP 1: Click inside this box Here". You should see the blinking cursor appear near where you clicked. Once you see the cursor, hit shift-return (i.e. hold shift while hitting the return key).

**What will happen:** the first thing to appear will be a small text entry box at the bottom of the page asking for the name of your GFF file. Type the name of the GFF file into the box. Once the box has the correct name, hit the return key (no shift).

Your results should appear directly below the question box.

In [22]:
#STEP 1: Click inside this box Here

gffFileName = raw_input('Enter the name of the gff file (BE EXACT): ')

fullGFFFileName = "/Users/chris/Desktop/input/" + gffFileName

try:
    with open(fullGFFFileName) as gffFile:
        gffFileContents = gffFile.readlines()
except:
    gffFileContents = []
    print '''Well that didn't work! Could not get GFF file. Are you sure it is in the input folder?
    Double check spelling, no spaces in file name, try again. If that fails get help.'''
    sys.exit()
print "\n\n================================== Results ==================================\n"
        
firstline = gffFileContents[0]

print len(firstline)
if not validHeader(firstline):
    printFailureMessage("Check for unix formated file, Header line")
    print firstline
    sys.exit()

    
print "checking line: ", 
for lineNum,wholeLine in enumerate(gffFileContents[1:]):
    lineNumber = str(lineNum+1)
    print " " + lineNumber,
    
    try:
        if not validTabStructure(wholeLine):                          #check whole line for proper 8 tab structure
            failureMessage = "Line number: " + lineNumber + " tab structure (i.e. 9 entries and 8 tabs)"
            printFailureMessage(failureMessage)
            sys.exit()
    except:
        print "double check that all functions are defined!"
    
    
    line = wholeLine.strip().split("\t")
    
    if not validSeqname(line[0]):
        failMessage = "Line number: " + lineNumber + " seq name of " + line[0]
        printFailureMessage(failMessage)
        sys.exit()
    
    if not validSource(line[1]):
        failMessage = "Line number: " + lineNumber + " source definition of " + line[1] 
        printFailureMessage(failMessage)
        sys.exit()

    if not validType(line[2]):
        failMessage = "Line number: " + lineNumber + " type definition of " + line[2]
        printFailureMessage(failMessage)
        sys.exit()

    if not validCoordinates(line[3],line[4]):
        failMessage = "Line number: " + lineNumber + " coordinates left: " + line[3] + " right: " + line[4] + " have"
        printFailureMessage(failMessage)
        sys.exit()

    if not validScore(line[5]):
        failMessage = "Line number: " + lineNumber + " score of " + line[5]
        printFailureMessage(failMessage)
        sys.exit()

    if not validStrand(line[6]):
        failMessage = "Line number: " + lineNumber + " strand entry of " + line[6]
        printFailureMessage(failMessage)
        sys.exit()

    if not validPhase(line[7]):
        failMessage = "Line number: " + lineNumber + " phase of " + line[7]
        printFailureMessage(failMessage)
        sys.exit()
    
    
    if not validAttributes(line[8]):
        failMessage = "Line number: " + lineNumber + " attribute entry of: " + line[8]
        printFailureMessage(failMessage)
        sys.exit()

print "\n\nCongratulations! all tests passed\nEverything below can be used in DNA Master"
print "=========================== DNA Master documentation ========================"
text = createDNAMasterFile(gffFileContents)
print text

Enter the name of the gff file (BE EXACT): Joy99_v2.txt



16
checking line:   1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94 

Congratulations! all tests passed
Everything below can be used in DNA Master
CDS 79 - 273
  /gene=Joy99.1
CDS 270 - 455
  /gene=Joy99.2
CDS 452 - 706
  /gene=Joy99.3
CDS 703 - 930
  /gene=Joy99.4
CDS 1156 - 1359
  /gene=Joy99.5
CDS 1352 - 1579
  /gene=Joy99.6
CDS 1563 - 2987
  /gene=Joy99.7
CDS 2999 - 4555
  /gene=Joy99.8
CDS 4560 - 7070
  /gene=Joy99.9
CDS 7067 - 7252
  /gene=Joy99.10
CDS 7289 - 7825
  /gene=Joy99.11
CDS 7900 - 8835
  /gene=Joy99.12
CDS 8989 - 9333
  /gene=Joy99.13
CDS 9330 - 9686
  /gene=Joy99.14
CDS 9667 - 9948
  /gene=Joy99.15

# Definitions of Functions

In [15]:
#%%writefile gffTester_nose.py
import re,sys

def validHeader(line):
    """
    Report whether 'line' is the gff3 version header.

    The line must be exactly 16 characters long (the 15-character directive
    plus one more character, such as a unix newline) and must read
    "##gff-version 3" once surrounding whitespace is stripped.

    Parameters:
    - 'line' - line to check

    Returns True for a valid header, False otherwise.
    """
    expectedDirective = "##gff-version 3"
    return len(line) == 16 and line.strip() == expectedDirective
    
def validTabStructure(line):
    """
    Checks a line for valid <tab> structure: a valid GFF3 line has exactly
    9 entries separated by exactly 8 <Tab> characters.

    When the check fails, the number of entries found is printed as
    'Length = N' before False is returned.

    Parameters:
    - 'line': line to check

    Returns True when the structure is valid, False otherwise.
    """
    # Count the real separators first. strip() below also removes leading and
    # trailing tabs, so counting the split result alone would accept a line
    # carrying an extra (empty) column at either end.
    tabCount = line.count("\t")
    if tabCount != 8:
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print('Length = ' + str(tabCount + 1))
        return False

    # Strip surrounding whitespace and split on tab; an empty first or last
    # column is stripped away here and therefore shows up as a short line.
    entries = line.strip().split("\t")
    if len(entries) != 9:
        print('Length = ' + str(len(entries)))
        return False

    return True

def validSeqname(text):
    """
    Checks a column 1 (sequence name) entry of a gff file.

    Only the characters are checked, using charCheck with its default
    pattern; the name is not compared against any database.

    Parameters:
    - 'text': string value of entry in column 1 to check

    Returns True when no invalid character is found, False otherwise.
    """
    # need name check, for now just check for valid characters
    if charCheck(text):
        return False
    return True

def validSource(text):
    """
    Checks a column 2 (source) entry of a gff file; only the validity of
    its characters is checked, using charCheck with its default pattern.

    Parameters:
    - 'text': string value of the entry in column 2 to check

    Returns True when no invalid character is found, False otherwise.
    """
    return False if charCheck(text) else True

def validType(text, validTypes=None):
    """
    Checks a column 3 (feature type) entry of a gff file against a
    collection of accepted type names.

    Parameters:
    - 'text': string value of the entry in column 3 to check
    - 'validTypes': collection of accepted types; when omitted the defaults
      'gene', 'mRNA', 'exon' and 'contig' are used

    Returns True when 'text' is one of the accepted types, False otherwise.
    """
    defaultTypes = {'gene', 'mRNA', 'exon', 'contig'}
    acceptedTypes = defaultTypes if validTypes is None else validTypes
    return text in acceptedTypes


def validScore(score):
    """
    Checks a score (entries in column 6), should be a number or "."

    Parameters:
    - 'score': score from column 6

    Returns True when 'score' is "." or can be converted with float(),
    False otherwise.
    """
    if score == ".":
        return True

    # Catch only the conversion failures float() raises for bad input; a bare
    # "except:" would also hide unrelated errors and interrupts.
    try:
        float(score)
    except (TypeError, ValueError):
        return False
    return True

def validStrand(strand):
    """
    Checks a strand (entries in column 7), should be one of ("+", "-", ".", "?")

    Parameters:
    - 'strand': strand entry from column 7

    Returns True when 'strand' is exactly one of the accepted values.
    """
    acceptedStrands = frozenset(("+", "-", ".", "?"))
    isAccepted = strand in acceptedStrands
    return isAccepted

def validPhase(phase):
    """
    Checks a phase (entries in column 8), should be one of (".", 0, 1, 2);
    the digits are accepted either as integers or as one-character strings.

    Parameters:
    - 'phase': phase entry from column 8

    Returns True when 'phase' is one of the accepted values.
    """
    acceptedPhases = frozenset([".", 0, "0", 1, "1", 2, "2"])
    isAccepted = phase in acceptedPhases
    return isAccepted

def validAttributes(attributes):
    '''
    Checks a column 9 entry, should be series of key=value entries separated by ;
    It is OK to have spaces (and double quotes) in the value but not in the key.
    Empty pairs, as produced by a trailing ";" or by ";;", are ignored.

    Parameters:
    - 'attributes': entire entry from column 9

    Returns True when the entry is "." or when every non-empty pair is a
    valid key=value entry (an entry with no pairs at all is valid),
    False otherwise.
    '''
    # "." stands for "no attributes" and is accepted as is
    if attributes == '.':
        return True

    # Screen the whole entry first; here space and double quote are allowed
    # in addition to the characters charCheck accepts by default.
    if charCheck(attributes, search=re.compile(r'[^a-zA-Z0-9.=;_ "]').search):
        return False

    # Split into the underlying key=value items and check each one
    for attrPair in attributes.split(";"):
        if attrPair == '':                  # this will happen if file had two ;; in a row
            continue

        if attrPair.count('=') != 1:        # for each key=value there must be only one "="
            return False

        attrKey, attrValue = attrPair.split('=')

        if len(attrKey) < 1:                # must have an entry for a key
            print(len(attrKey))             # same output as the print statement
            return False

        if charCheck(attrKey):              # check key for valid characters
            return False

        # Spaces and double quotes are OK in a value (the whole-entry screen
        # above lets both through), so remove them before the strict check.
        if charCheck(attrValue.replace(' ', '').replace('"', '')):
            return False

    # Reached only after EVERY pair passed. Returning from inside the loop
    # would validate just the first pair and would return None for an entry
    # made only of empty pairs.
    return True


def charCheck(str, search=re.compile(r'[^a-zA-Z0-9.=;_]').search):
    """
    Reports whether a string contains an invalid character. With the default
    'search' an invalid character is anything other than a-z, A-Z, 0-9,
    ".", "=", ";" and "_".

    Parameters:
    -'str': string being checked.
    -'search': search function to apply to 'str'; a truthy result means an
      invalid character was found. Defaults to a search for any character
      outside the valid set.

    Returns True if an invalid character is found, False otherwise.
    """
    found = search(str)
    return True if found else False

def validCoordinate(coord):
    """
    Checks a coordinate (entries in column 4 or 5), should be a positive integer

    Parameters:
    - 'coord': coordinate from column 4 or 5 (a string)

    Returns True when 'coord' consists only of the digits 0-9, has at least
    one digit, and its value is greater than zero; False otherwise.
    """
    # One or more digits and nothing else. Requiring at least one digit rejects
    # an empty column, which a "no non-digit characters" test alone lets through.
    # \Z (rather than $) so a trailing newline is not tolerated.
    if re.match(r'[0-9]+\Z', coord) is None:
        return False

    # "Positive" excludes zero.
    return int(coord) > 0

def validCoordinates(leftCoord, rightCoord):
    """
    Checks a pair of coordinates (entries in column 4 and 5): each must pass
    validCoordinate, and the left coordinate must not be larger than the
    right coordinate (equal coordinates are accepted).

    Parameters:
    - 'leftCoord':  coordinate from column 4
    - 'rightCoord': coordinate from column 5

    Returns True when both coordinates are valid and left <= right,
    False otherwise.
    """
    if not (validCoordinate(leftCoord) and validCoordinate(rightCoord)):
        return False

    try:
        return int(leftCoord) <= int(rightCoord)
    except ValueError:
        # Defensive: a string with no offending characters can still be
        # unparseable by int() (for example an empty column). Treat that as
        # an invalid coordinate instead of letting the error escape.
        return False


def printFailureMessage(failType):
    """
    Prints the fatal-error report for a failed check.

    Parameters:
    - 'failType': text naming what failed; " failed. No other tests run."
      is appended to it

    Nothing is returned; the report is written with three print calls.
    """
    endRule = "================================ End Results ================================"
    closingText = ("\n" + endRule + "        \n"
                   + "Computer messages follow below OK to ignore, you should fix file and try again.\n"
                   + "    ")

    # Single-argument print(...) produces the same output as the print statement.
    print("\n##### Fatal Error #####")
    print(failType + " failed. No other tests run.")
    print(closingText)
    
def createDNAMasterFile(gffFileContents):
    """
    Takes in a list of GFF lines and builds the text of the DNA Master file.

    Lines with fewer than 9 tab-separated entries (such as the header) are
    skipped. For each remaining line, a CDS entry is produced by gene2CDS
    when column 3 is 'gene', followed by whatever parseAttributes produces
    for column 9. The text ends with "ORIGIN".

    Parameters:
    - 'gffFileContents':  List of text entries, each element a line from the GFF file.

    Returns the complete text of the DNA Master file as one string.
    """
    pieces = []
    for wholeLine in gffFileContents:
        line = wholeLine.strip().split("\t")

        # Column 9 (index 8) is read below, so a line needs at least 9
        # entries; a "< 8" guard would let 8-entry lines through to an
        # IndexError.
        if len(line) < 9:
            continue

        if line[2] == 'gene':
            pieces.append(gene2CDS(line))

        # NOTE(review): attributes are emitted for every feature line, not
        # only for 'gene' lines -- confirm this is intended.
        pieces.append(parseAttributes(line[8]))

    pieces.append("ORIGIN")

    # Join once at the end instead of growing a string inside the loop.
    return "".join(pieces)
            
        
def gene2CDS(line):
    """
    Builds the DNA Master CDS line for a single GFF entry.

    Columns 4 and 5 (indexes 3 and 4) supply the two coordinates. A "+" in
    column 7 (index 6) gives a plain CDS line; any other strand value gives
    a "complement (...)" line.

    Parameters:
    - 'line':  List of entries from a single GFF entry.

    Returns the CDS line, terminated by a newline.
    """
    span = line[3] + " - " + line[4]
    if line[6] != "+":
        span = "complement (" + span + ")"
    return "CDS " + span + "\n"
    
def parseAttributes(attributes):
    """
    Takes in the Attributes entry of the GFF and returns the matching
    DNA Master qualifier lines.

    The entry is split on ";" and each piece with exactly one "=" is read as
    key=value (keys are compared in lower case). An "id" or "name" key gives
    a /gene line; a "note" key gives a /note line, with one pair of
    surrounding double quotes removed from the value before it is re-quoted.
    All other pieces are ignored.

    Parameters:
    - 'attributes': entire entry from column 9

    Returns the qualifier lines joined into one string ("" when there are none).
    """
    qualifierLines = []

    for pair in attributes.split(";"):
        parts = pair.split("=")
        if len(parts) != 2:
            continue                      # not a single key=value item, nothing to emit

        key = parts[0].lower()
        value = parts[1]

        if key in ("id", "name"):
            qualifierLines.append('  /gene=' + value + '\n')
        elif key == "note":
            # drop one pair of enclosing double quotes, the /note line adds its own
            if value[0:1] == '"' and value[-1:] == '"':
                value = value[1:-1]
            qualifierLines.append('    /note="' + value + '"\n')

    return "".join(qualifierLines)

## Test code

In [None]:
%%writefile test_code.py

from gffTester_nose import *

'''
need to check the following functions:

def validHeader (line): TEST valid, extra space, extra tab, garbled
def validTabStructure(line): test: 7 tabs, 8 tabs, 9 tabs
def validSeqname(text): SKIP, just a "not charCheck" for now
def validSource(text): SKIP, just a "not charCheck" for now
def validType(text, validTypes=None): TEST: pass with any 3 defaults 'gene','mRNA','exon'; fails on none item, passes with list
def validCoordinate(coord): TEST: fails with text, float, negative int; passes with positive int
def validScore(score):
def validStrand(strand):
def validPhase(phase):
def validAttributes(attributes):
def charCheck(str, search=re.compile(r'[^a-zA-Z0-9.=;_]').search):
def validGene(seq, line):       
def fastaRead(fasta_File):
def countStopCodons(seq, strand):
'''

def test_validHeader_1():
    'valid header tests as valid'
    assert validHeader('##gff-version 3\n') == True

def test_validHeader_2():
    'header with extra space fails'
    assert validHeader('##gff-version 3 \n') == False

def test_validHeader_3():
    'header with extra tab fails'
    assert validHeader('##gff-version 3\t\n') == False

def test_validHeader_4():
    'garbled header fails'
    assert validHeader('##gff-Version 3\n') == False
    

def test_validTabStructure_1():
    'fails 7 tab structure'
    assert validTabStructure("1\t2\t3\t4\t5\t6\t7\t8\n") == False

def test_validTabStructure_2():
    'passes 8 tab structure'
    assert validTabStructure("1\t2\t3\t4\t5\t6\t7\t8\t9\n") == True

def test_validTabStructure_3():
    'fails 9 tab structure'
    assert validTabStructure("1\t2\t3\t4\t5\t6\t7\t8\t9\t10\n") == False
    
def test_validType_1():
    'Type passes with "gene"'
    assert validType('gene') == True

def test_validType_2():
    'Type passes with "mRNA"'
    assert validType('mRNA') == True

def test_validType_3():
    'Type passes with "exon"'
    assert validType('exon') == True

def test_validType_4():
    'Type fails with mrna'
    assert validType('mrna') == False

def test_validType_5():
    'Type passes with specified list'
    customTypes = ['mRNA', 'mrna']
    assert validType('mrna', customTypes) == True
    

def test_validCoordinate_1():
    'Coord fails with text as coordinate'
    assert validCoordinate('a') == False

def test_validCoordinate_2():
    'Coord fails with float'
    assert validCoordinate('2.0') == False

def test_validCoordinate_3():
    'Coord fails with negative int'
    assert validCoordinate('-3') == False

def test_validCoordinate_4():
    'Coord passes with positive integer'
    assert validCoordinate('100') == True
    
def test_validCoordinates_1():
    'Coordinates fails with left larger than right'
    assert validCoordinates('5', '4') == False

def test_validCoordinates_2():
    'Coordinates passes with left equal to right'
    assert validCoordinates('5', '5') == True

def test_validCoordinates_3():
    'Coordinates passes with left smaller than right'
    assert validCoordinates('4', '5') == True

    
def test_validAttributes_1():
    'Attributes passes with empty string'
    assert validAttributes("") == True

def test_validAttributes_2():
    'Attributes passes with "a=b"'
    assert validAttributes("a=b") == True

def test_validAttributes_3():
    'Attributes passes with "a=b;"'
    assert validAttributes("a=b;") == True

def test_validAttributes_4():
    'Attributes fails with "a=bc=d"'
    assert validAttributes("a=bc=d") == False

def test_validAttributes_5():
    'Attributes passes with "a=b;c=d"'
    assert validAttributes("a=b;c=d") == True

def test_validAttributes_6():
    'Attributes passes with "af=b f;c=d"'
    assert validAttributes("af=b f;c=d") == True

def test_validAttributes_7():
    'Attributes fails with blank key "a=b;=b"'
    assert validAttributes("a=b;=b") == False

def test_validAttributes_8():
    'Attributes fails with illegal character in key "key&=ab"'
    assert validAttributes("key&=ab") == False


def test_parseAttributes_1():
    'Attribute test 1 single ID entry'
    assert parseAttributes("ID=gene1") == '  /gene=gene1\n'

def test_parseAttributes_2():
    'Attribute test 2 single ID entry with semicolon'
    assert parseAttributes("ID=gene1;") == '  /gene=gene1\n'

def test_parseAttributes_3():
    'Attribute test 3 single Note entry with spaces'
    assert parseAttributes("Note=note note") == '    /note="note note"\n'

def test_parseAttributes_4():
    'Attribute test 4 single Note entry with semicolon'
    assert parseAttributes("Note=gene;") == '    /note="gene"\n'

def test_parseAttributes_5():
    'Attribute test 5 single Note entry preceded by ;'
    assert parseAttributes(";Note=gene2") == '    /note="gene2"\n'

def test_parseAttributes_6():
    'Attribute test 6 ID + Note entry'
    assert parseAttributes("ID=gene1;Note=note4") == '  /gene=gene1\n    /note="note4"\n'

def test_parseAttributes_7():
    'Attribute test 7 single Note entry with quotes'
    assert parseAttributes('Note="gene7"') == '    /note="gene7"\n'

def test_parseAttributes_8():
    'Attribute test 8 single id entry with quotes'
    assert parseAttributes('id="gene8"') == '  /gene="gene8"\n'

In [None]:
!nosetests -v test_code.py

In [None]:
print "grene" in ["gene", "id"]

In [None]:
a = '"jkLmn"'
b = '3'
c = '"'
print a[1:-1]

In [None]:
%who

In [None]:
# Scratch dictionary used by the cell below.
info = {}

In [None]:
info['pham']= None
print type(info['pham'])
if info['pham']:
    print "true"
else:
    print "false"

In [None]:
whos