In [43]:
import pandas as pd
# Display settings: frames longer than 100 rows are shown truncated, and a
# truncated view still prints 50 rows.
pd.options.display.max_rows = 100
pd.options.display.min_rows = 50

# Headerless tab-separated table, so the columns are labelled 0, 1, ...
# (presumably a salmon transcript-to-gene map -- confirm against the pipeline).
gene_list = pd.read_csv("./salmon_tx2gene.tsv", sep="\t", header=None)

In [44]:
# Keep only column 1 of the table and collapse it to a plain list of unique
# values, in order of first appearance.
# NOTE: this rebinds `gene_list` from a DataFrame to a list, so re-running this
# cell without re-running the load cell first raises AttributeError.
gene_list = gene_list[1].drop_duplicates().tolist()

In [62]:
class solver:
    '''Candidate tracker for a Wordle-style guessing game over gene names.

    Every gene name of at most ``word_length`` characters becomes one row of a
    DataFrame holding one character per column; shorter names are right-padded
    with spaces. Each call to ``guess`` drops the rows that contradict the
    reported colours.

    Attributes:
        word_length: number of character columns in the candidate table.
        split_genes: the full, unfiltered candidate table.
        possibilities: rows still consistent with every guess applied so far.
        no_let: set of grey letters (letters known to be absent).
        guesses: list of ``(word, yellow, green)`` tuples, replayed by ``redo``.
    '''

    def __init__(self, gene_list, word_length=5):
        '''Build the candidate table and the empty bookkeeping containers.

        Args:
            gene_list: iterable of gene names. Entries that are not strings
                (e.g. NaN from a table column) or that are longer than
                ``word_length`` are ignored.
            word_length: number of characters per candidate (default 5).
        '''
        self.word_length = word_length

        # define set of incorrect (grey) letters
        self.no_let = set()

        # Pad explicitly so that every row has exactly `word_length` cells;
        # this keeps the column layout stable even for an empty gene list or
        # one that only contains short names.
        rows = [
            list(gene.ljust(word_length))
            for gene in gene_list
            if isinstance(gene, str) and len(gene) <= word_length
        ]
        split_genes = pd.DataFrame(rows, columns=range(word_length))
        self.split_genes = split_genes
        self.possibilities = split_genes

        # container to track guesses
        self.guesses = []

    def __call__(self):
        '''Return the table of candidates that are still possible.'''
        return self.possibilities

    def guess(self, word, yellow=None, green=None, add_to_guess_list=True):
        '''Apply the feedback of one guess and return the remaining candidates.

        Args:
            word: the guessed gene, case sensitive. It is used exactly as
                given, so a shorter gene must be padded by the caller
                (e.g. ``"DAZ3 "``) if the padding should count as a letter.
            yellow: 1-based positions whose letter is in the answer but at a
                different position. Defaults to no positions.
            green: 1-based positions whose letter is correct and in place.
                Defaults to no positions.
            add_to_guess_list: record the guess in ``self.guesses``. Not
                needed for normal interaction; ``redo`` passes False.

        Returns:
            DataFrame of the candidates consistent with all guesses so far.

        Raises:
            IndexError: if a yellow/green position lies beyond the end of
                ``word``. Nothing is recorded or filtered in that case,
                because state is only committed after all filters succeeded.
        '''
        # Copy the position lists: the stored guess must not alias the
        # caller's list (or a shared default), since `self.guesses` is meant
        # to be edited by hand before calling `redo`.
        yellow = [] if yellow is None else list(yellow)
        green = [] if green is None else list(green)
        wordlist = list(word)
        candidates = self.possibilities

        # require yellow letters to be in another position
        for position in yellow:
            pypos = position - 1
            letter = wordlist[pypos]
            candidates = candidates.loc[candidates[pypos].ne(letter)]
            candidates = candidates.loc[candidates.eq(letter).any(axis=1)]

        # require green letters to be in same position
        for position in green:
            pypos = position - 1
            candidates = candidates.loc[candidates[pypos].eq(wordlist[pypos])]

        # Classify the remaining (grey) positions one by one. Looking the
        # letter up with `wordlist.index` would always find its FIRST
        # occurrence and mis-handle repeated letters.
        coloured_positions = set(yellow) | set(green)
        coloured_letters = {wordlist[position - 1] for position in coloured_positions}
        newly_blocked = set()
        for position, letter in enumerate(wordlist, start=1):
            if position in coloured_positions:
                continue
            if letter in coloured_letters:
                # The letter is in the answer (coloured elsewhere in this
                # guess), so it must not be blocked globally. A grey mark only
                # says it is not at this particular position.
                if position <= candidates.shape[1]:
                    candidates = candidates.loc[candidates[position - 1].ne(letter)]
            else:
                newly_blocked.add(letter)

        # drop every candidate containing a blocked letter (single pass)
        blocked = self.no_let | newly_blocked
        candidates = candidates.loc[~candidates.isin(list(blocked)).any(axis=1)]

        # commit state only now that every step has succeeded
        self.no_let.update(newly_blocked)
        self.possibilities = candidates
        if add_to_guess_list:
            self.guesses.append((word, yellow, green))
        return self.possibilities

    def redo(self):
        '''Replay ``self.guesses`` from scratch and return the candidates.

        Allows rerunning the guesses after editing ``self.guesses`` if a
        mistake was made. Both the candidate table and the grey-letter set are
        reset first; otherwise letters blocked by the mistaken guess would
        keep filtering candidates.
        '''
        self.possibilities = self.split_genes
        self.no_let = set()
        for guess_set in self.guesses:
            self.guess(*guess_set, add_to_guess_list=False)
        return self()



In [63]:
# Build the solver from the de-duplicated gene names; calling the instance
# returns the table of candidates still in play (no guesses applied yet).
mysolver = solver(gene_list)
mysolver()

Unnamed: 0,0,1,2,3,4
0,O,R,4,F,5
1,O,R,4,F,3
2,N,O,C,2,L
3,P,E,R,M,1
4,H,E,S,4,
5,I,S,G,1,5
6,A,G,R,N,
7,S,D,F,4,
8,A,C,A,P,3
9,P,U,S,L,1


In [47]:
# Guess DAZ3 (padded with a trailing space to five characters):
# letters 1 and 2 (D, A) are yellow, nothing is green.
mysolver.guess("DAZ3 ", yellow=[1,2], green=[])

Unnamed: 0,0,1,2,3,4
80,F,H,A,D,1
134,K,D,M,1,A
183,A,H,D,C,1
214,H,D,A,C,1
252,C,D,C,A,8
297,K,D,M,4,A
371,I,N,A,D,L
402,A,C,A,D,M
504,A,M,P,D,2
544,A,M,P,D,1


In [59]:
# Guess HDAC4: letter 2 (D) is yellow, letter 3 (A) is green.
mysolver.guess("HDAC4", yellow=[2], green=[3])

Unnamed: 0,0,1,2,3,4
10,C,P,T,P,
13,C,C,N,L,2
60,C,O,R,T,
70,C,L,C,N,6
82,C,T,R,C,
94,C,R,O,C,C
131,C,1,Q,C,
132,C,1,Q,B,
147,C,N,R,2,
155,C,L,I,C,4


In [49]:
# Guess SMAD7: letters 3 and 4 (A, D) are green, nothing is yellow.
mysolver.guess("SMAD7", yellow=[], green=[3,4])

Unnamed: 0,0,1,2,3,4
371,I,N,A,D,L
675,F,L,A,D,1
1336,A,T,A,D,1
1633,T,E,A,D,1
4012,T,R,A,D,D
4372,A,T,A,D,5
5508,T,E,A,D,2
8815,E,V,A,D,R
9868,A,T,A,D,2


In [50]:
# Guess TRADD: letter 1 (T) is yellow, letters 3 and 4 (A, D) are green.
mysolver.guess("TRADD", yellow=[1], green=[3,4])

Unnamed: 0,0,1,2,3,4
1336,A,T,A,D,1
4372,A,T,A,D,5
9868,A,T,A,D,2


In [51]:
# Guess ATAD2: letters 1-4 (A, T, A, D) are green; only the last character is wrong.
mysolver.guess("ATAD2", yellow=[], green=[1,2,3,4])

Unnamed: 0,0,1,2,3,4
1336,A,T,A,D,1
4372,A,T,A,D,5


In [52]:
# Guess ATAD1: all five characters are green.
mysolver.guess("ATAD1", yellow=[], green=[1,2,3,4,5])

Unnamed: 0,0,1,2,3,4
1336,A,T,A,D,1
