In [76]:
# read in CSVs of requirements, ratings, and responses as a dataframe
# Cobb-specific
import numpy as np
import pandas as pd

def ingestRFP(file, start_row = 0, end_row = 999, req_i = 2, rat_i = 5, resp_i =6):
    '''
    filelist is a file path to a CSV for a proposal
    req_i is the column index for requirements
    resp_i is the column index for responses
    start_row is the row to start reading on
    end_row is the row to stop reading on
    outputs a dataframe of requirement-response pairs
    
    '''
    
    # NOTE: in Excel, save as CSV UTF-8
    rfp_df = pd.read_table(file, sep = ",", skipinitialspace = True)
    
    # TODO: stop doing this as a dict; it doesn't maintain the order. 
    # create a stripped dict
    strip_dict = {
        "req" : rfp_df.iloc[start_row:end_row, req_i],
        "rating" : rfp_df.iloc[start_row:end_row, rat_i],
        "resp" : rfp_df.iloc[start_row:end_row, resp_i]
    }
    
    # first rows, then columns
    return pd.DataFrame(strip_dict)



# add date of response
# add Health / Trade / Other
# save as a file 
corpus = ingestRFP('corpus.csv', 7, 11, 2, 6, 7)
# output_response(corpus, corpus)
newRFP = ingestRFP('newRFP.csv', 7, 11, 2, 6, 7)

newRFP

# cps[cps.union.isin(["Union"])]

matchRows = newRFP.req.isin(corpus.req)
matchRows

# 1. exact match to a CSV of requirements with no responses 
# fill the most recent responses
# output to CSV (or XLS)

# 2. score the non-exact matches by some similarity algorithm
# fill the highest scoring responses

# 3. come up with a way of scoring

7      True
8     False
9      True
10    False
Name: req, dtype: bool

In [92]:
import difflib
# difflib.SequenceMatcher(None, 'hello world', 'hello').ratio()

def respondToReq(req, corpus):
    '''
    Given a requirement (req.req) and a dataframe 'corpus' of past responses
    Read through the corpus (corpus.req) to find the past requirement that most closely matches the requirement
    Overwrite the response (req.resp) and rating (req.rating) with that requirement's response.
    '''
    
    max_ratio = 0
    
    for index, row in corpus.iterrows():
        # this generates a number between 0 and 1 with the similarity of two strings
        try:
            lrat = difflib.SequenceMatcher(None, row.req, req.req).ratio()
        except: 
            lrat = 0
            print("SequenceMatcher choked on row.req = ", row.req, " , req.req = ", req.req)
        
        if lrat > max_ratio:
            max_ratio = lrat
            req.resp = row.resp
            req.rating = row.rating
    
    return req

def output_response(corpus, RFP):
    '''
    Given a 'corpus' of past answers
    output an RFP response in CSV format
    that duplicates previous responses to all exactly matching requirements 
    match on requirement language
    '''
    
    # read through each line of the RFP
    # using apply
    
    # send the arguments after the first to df.apply() like this:
    filledRFP = RFP.apply(respondToReq, corpus = corpus, axis = "columns") #, corpus)
    # axis = 'columns' tells apply to apply itself to each row. 

    return filledRFP

# (file, start_row = 0, end_row = 999, req_i = 2, rat_i = 5, resp_i =6)

corpus = ingestRFP('C:/Users/ahicken/Documents/proposal_data/naedutf8.csv', 9, 230, 2, 6, 7)
# last row should be 230

newRFP = ingestRFP('C:/Users/ahicken/Documents/proposal_data/adha.csv', 9, 132, 2, 5, 6)
# last row should be 132

filledRFP = output_response(corpus, newRFP)

filledRFP

SequenceMatcher choked on row.req =  nan  , req.req =  Ability to manage registration and access for live Webinar
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to manage registration and access for live Webinar
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to create/configure extended, facilitated online courses (e.g., a course an instructor leads over a period weeks using discussion, readings, assignments, etc.)
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to set enrollment durations for courses
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to configure and enforce a certification or learning path involving multiple courses/activities. 
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to assign documents from a central document repository to multiple courses
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to offer closed discussion forums that can only be accessed based upon permission be

SequenceMatcher choked on row.req =  nan  , req.req =  Structured exposure of all data collected/generated by the system for export to/consumption by 3rd-party applications.
SequenceMatcher choked on row.req =  nan  , req.req =  Please discuss your merchant compatability for credit card processing and include any information about your PCI compliance.
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to create learning content with built-in content authoring functionality
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to import and configure content developed in external tools, such as Articulate, Captivate, and Lectora
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to support the import of SCORM, AICC, or xAPI compliant content
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to create reusable learning objects that can be incorporated into different educational offerings
SequenceMatcher choked on row.req =  nan  , req.req =  

SequenceMatcher choked on row.req =  nan  , req.req =  Ability to sub-license the LMS to other groups (e.g. a state affiliate) to allow them to have a portal of their own, run by the nation ADHA with some ability to create their own content.
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to host/support streaming media
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to receive registration and enrollment data from ADHA's custom e-commerce system after the completion of an order and auto-create the appropriate accounts and enrollments
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to support single sign-on with a Webinar platform
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to support deeper integration with Zoom (e.g., to use attendance data from the Webinar to mark the corresponding module or course complete in the platform)
SequenceMatcher choked on row.req =  nan  , req.req =  Ability to support federated search
Sequen

Unnamed: 0,req,rating,resp
9,Ability to manage registration and access for ...,S,We have experience with client preferred third...
10,"Ability to create/configure extended, facilita...",S,
11,Ability to set enrollment durations for courses,S,Course duration can be configured to a set num...
12,Ability to configure and enforce a certificati...,S,
13,Ability to assign documents from a central doc...,S,The file manager acts as a repository for all ...
14,Ability to offer closed discussion forums that...,S,CourseStage discussion forums are full-feature...
15,Ability to associate multiple assessments with...,S,
16,Ability to block access to course content unti...,S,Requirements for issuance of credit are set at...
17,Ability to associate an evaluation (survey) wi...,S,
18,Ability for system to automatically issue cred...,S,Requirements for issuance of credit are set at...


In [95]:
difflib.SequenceMatcher(None, "The quick brown fox jumped over the lazy dog", "The quick brown fox jumped over the lazy dog").ratio()

1.0

In [45]:
 # DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)[source]

# newRFP.join(corpus, on = 'req', how = 'left' )

# newRFP.join(corpus.set_index(cols), on=cols)



# newRFP

newRFP.merge(corpus, how='left', left_on='req', right_on='req')

Unnamed: 0,req,rating_x,resp_x,rating_y,resp_y
0,Ability to manage registration and access for ...,,,,
1,"Ability to create/configure extended, facilita...",,,,
2,Ability to set enrollment durations for courses,,,,
3,Ability to configure and enforce a certificati...,,,,
