In [9]:
import requests
import re


In [19]:
class UniprotRequest:
    """Download FASTA records from UniProt and locate sequence motifs in them.

    Constructing an instance immediately fetches the FASTA text for every
    identifier in ``uniprot_id_list`` (one HTTP GET per identifier) and stores
    it, together with the bare protein sequence, in ``results_dict``::

        {uniprot_id: {"fasta_format": <raw FASTA text>,
                      "protein_seq": <sequence without header/newlines>}}

    ``find_motifs()`` then adds one extra entry per protein, keyed by the
    motif name, holding the motif description and its 1-indexed positions.

    Regular-expression syntax reference:
    https://docs.python.org/3/library/re.html

    Args:
        uniprot_id_list: Iterable of identifiers appended to the UniProt URL.
        retry: Maximum number of GET attempts per identifier before the HTTP
            error of the last attempt is raised. Values below 1 are treated
            as a single attempt.
    """

    def __init__(self, uniprot_id_list, retry=10):
        self.retry = retry
        self.uniprot_URL = "http://www.uniprot.org/uniprot/"
        self.suffix = ".fasta"
        self.results_dict = {}
        self.get_protein_data(uniprot_id_list)

    def find_motif(self, protein_sequence, motif_regex):
        """Return the 1-indexed start position of every regex match.

        ``re.finditer`` only reports non-overlapping matches, so a pattern
        that must detect overlapping motifs has to consume a single residue
        and express the rest as a lookahead.

        Args:
            protein_sequence: Sequence string to scan.
            motif_regex: Pattern (string or compiled) passed to ``re.finditer``.

        Returns:
            List of match start positions, counting the first residue as 1.
        """
        return [
            match.start() + 1  # convert 0-based offset to 1-based position
            for match in re.finditer(motif_regex, protein_sequence)
        ]

    def find_motifs(self, motif_regex=r"N{1,1}(?=[^P]{1,1}(S|T){1,1}[^P]{1,1})", motif="N-{P}-[ST]-{P}",
                    motif_name="N-glycosylation motif"):
        """Search every stored protein sequence for a motif and record the hits.

        For each entry in ``results_dict`` the positions returned by
        ``find_motif`` are stored under ``motif_name`` as
        ``{"motif": motif, "positions": [...]}``. Calling the method again
        with the same ``motif_name`` overwrites the previous result.

        The default pattern matches an ``N`` that is followed by any residue
        except ``P``, then ``S`` or ``T``, then any residue except ``P``.
        Everything after the ``N`` sits inside a lookahead, so only the ``N``
        is consumed and overlapping occurrences are all reported.

        Args:
            motif_regex: Regular expression used for the search.
            motif: Human-readable description of the motif, stored verbatim.
            motif_name: Key under which the result is stored for each protein.
        """
        for entry in self.results_dict.values():
            positions = self.find_motif(entry["protein_seq"], motif_regex)
            # Attach the hits to the existing record; the outer dict keeps its
            # keys, so mutating the per-protein dict while iterating is safe.
            entry[motif_name] = {"motif": motif, "positions": positions}

    def fasta_to_seq(self, fasta):
        """Strip the header line and all line breaks from a FASTA record.

        Args:
            fasta: FASTA text whose first line is the header.

        Returns:
            Everything after the first newline with ``\\n`` and ``\\r``
            removed. A record without any newline (header only) yields ``""``
            instead of raising.
        """
        # partition() leaves the remainder empty when there is no newline,
        # which avoids calling .start() on a failed regex search.
        _header, _newline, body = fasta.partition("\n")
        return re.sub(r"[\n\r]+", "", body)

    def get_protein_data(self, uniprot_id_list):
        """Fetch each identifier and store its FASTA text and sequence.

        Args:
            uniprot_id_list: Iterable of identifiers to download.

        Raises:
            Whatever ``data_transaction`` raises when a download fails.
        """
        for item in uniprot_id_list:
            fasta_format = self.data_transaction(item)
            self.results_dict[item] = {
                "fasta_format": fasta_format,
                "protein_seq": self.fasta_to_seq(fasta_format),
            }

    def data_transaction(self, uniprot_id):
        """GET the FASTA text for one identifier, retrying on HTTP errors.

        The request is repeated until a response reports ``ok`` or
        ``self.retry`` attempts have been made. Exceptions raised by
        ``requests.get`` itself are not caught here and propagate
        to the caller unchanged.

        Args:
            uniprot_id: Identifier inserted between the base URL and suffix.

        Returns:
            The response body as text.

        Raises:
            requests.HTTPError: If the final attempt still returned an error
                status.
        """
        url = self.uniprot_URL + uniprot_id + self.suffix
        attempts = max(1, self.retry)  # always try at least once
        for _attempt in range(attempts):
            response = requests.get(url)
            if response.ok:
                return response.text
        # Every attempt failed: surface the last HTTP error instead of
        # silently returning None.
        response.raise_for_status()

    def get_data(self):
        """Return the dictionary of downloaded records and motif results."""
        return self.results_dict

if __name__ == "__main__":
    # Demo run: download the record for one identifier (this happens inside
    # the constructor), annotate it with the default motif search, and dump
    # the resulting dictionary.
    demo_request = UniprotRequest(["P07204_TRBM_HUMAN"])
    demo_request.find_motifs()

    print("results:", demo_request.results_dict, sep="\n")


results:
{'P07204_TRBM_HUMAN': {'fasta_format': '>sp|P07204|TRBM_HUMAN Thrombomodulin OS=Homo sapiens OX=9606 GN=THBD PE=1 SV=2\nMLGVLVLGALALAGLGFPAPAEPQPGGSQCVEHDCFALYPGPATFLNASQICDGLRGHLM\nTVRSSVAADVISLLLNGDGGVGRRRLWIGLQLPPGCGDPKRLGPLRGFQWVTGDNNTSYS\nRWARLDLNGAPLCGPLCVAVSAAEATVPSEPIWEEQQCEVKADGFLCEFHFPATCRPLAV\nEPGAAAAAVSITYGTPFAARGADFQALPVGSSAAVAPLGLQLMCTAPPGAVQGHWAREAP\nGAWDCSVENGGCEHACNAIPGAPRCQCPAGAALQADGRSCTASATQSCNDLCEHFCVPNP\nDQPGSYSCMCETGYRLAADQHRCEDVDDCILEPSPCPQRCVNTQGGFECHCYPNYDLVDG\nECVEPVDPCFRANCEYQCQPLNQTSYLCVCAEGFAPIPHEPHRCQMFCNQTACPADCDPN\nTQASCECPEGYILDDGFICTDIDECENGGFCSGVCHNLPGTFECICGPDSALARHIGTDC\nDSGKVDGGDSGSGEPPPSPTPGSTLTPPAVGLVHSGLLIGISIASLCLVVALLALLCHLR\nKKQGAARAKMEYKCAAPSKEVVLQHVRTERTPQRL\n', 'protein_seq': 'MLGVLVLGALALAGLGFPAPAEPQPGGSQCVEHDCFALYPGPATFLNASQICDGLRGHLMTVRSSVAADVISLLLNGDGGVGRRRLWIGLQLPPGCGDPKRLGPLRGFQWVTGDNNTSYSRWARLDLNGAPLCGPLCVAVSAAEATVPSEPIWEEQQCEVKADGFLCEFHFPATCRPLAVEPGAAAAAVSITYGTPFAARGADFQALPVGSSAAVAPLGLQLMCTAPPGAVQGHWAREAPGAWDCSVENGGCEHACN