In [1]:
import os
import json
import pandas as pd

#Read one or many files into a pandas dataframe
class DataFrameBuilder(object):
    def __init__(self):
        self.data_frame = None
        self._data_list = []
    def read_to_pandas(self, path):
        with open(path) as data_file:
            if (path.endswith(".csv")):
                data = pd.read_csv(data_file)
            elif path.endswith(".json"):
                data = pd.read_json(data_file)
            else:
                raise ValueError("No parser exists for parsing %s" + path)
            self._data_list.append(data)
        self.data_frame = pd.concat(self._data_list)

#List all the files available in the selected immuta data source
def list_files(data_frame_builder, startpath):
    for root, dirs, files in os.walk(startpath):
        for file in files:
            path = os.path.join(root, file)
            print path
            #optionally build the pandas dataframe from the files
            data_frame_builder.read_to_pandas(path)

### This is your immuta root directory where all your data sources will appear
If you have none, subscribe to some from the Immuta storefront, otherwise, append a data source of interest to the path.  This will read **ALL** your files to the DF, so be careful what you point it to

In [2]:
startpath = 'immuta/[enter data source name here]/'
dfb = DataFrameBuilder()
list_files(dfb, startpath)

### Do something below with your [pandas dfb.data_frame](http://pandas.pydata.org/pandas-docs/stable/10min.html)

In [3]:
# !pip install git+git://github.com/seatgeek/fuzzywuzzy.git@0.15.0#egg=fuzzywuzzy
!pip install fuzzywuzzy



In [4]:
%%time

# trying to match course title from STEP to catalog

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import pickle
import os.path


# function to execute fuzzy match
def findFuzzyMatch(amazonUniqueCourses,matchingCatalog, fname): # parameter: STEP report, training catalog, pickle file name to save
    
    amazonUniqueCoursesSplit = np.array_split(amazonUniqueCourses,amazonUniqueCourses.shape[0]/25) #split df into a batch of 25 rows
    
    for amazonUniqueCourse in amazonUniqueCoursesSplit:
        print "start new batch!"
        fuzzyMatch2 = pd.DataFrame(columns=['StepIndex','STEP Course#','STEP Title','CatalogIndex','Catalog Course#', \
                                       'Catalog Title','Fuzzy Score'])
        for aindex, arow in amazonUniqueCourse.iterrows():
            print("aindex: ",aindex)
            for cindex, crow in matchingCatalog.iterrows():
                fuzzyMatch2 = fuzzyMatch2.append(pd.DataFrame([[aindex, arow['LACODE'], arow['LATITLE'], \
                                                              cindex, crow['course#'], crow['course title'], \
                                                              fuzz.ratio(arow['LATITLE'],crow['course title'])]], \
                                                            columns=['StepIndex','STEP Course#','STEP Title','CatalogIndex', \
                                                                     'Catalog Course#','Catalog Title','Fuzzy Score']), ignore_index=True)
        if os.path.isfile(fname):
            fuzzyMatch = pickle.load(open(fname,'rb'))
            fuzzyMatch = fuzzyMatch.append(fuzzyMatch2)
        else:
            fuzzyMatch = fuzzyMatch2
        pickle.dump(fuzzyMatch, open(fname, "wb" ) )
        fuzzyMatch.info()

#load STEP Report
stepReport = pd.read_csv('immuta/STEP Report/STEP Report.csv')
stepReportExtract = stepReport[['asset_id','asset_title']].drop_duplicates() #pull only the unique courses
stepReportExtract = stepReportExtract.rename(columns={"asset_id": "LACODE", "asset_title": "LATITLE"})     

#Load catalog
catalogBusDec = pd.read_csv('immuta/December 05 2016 Catalog Business Courses/December 05 2016 Catalog Business Courses.csv')


# call fuzzy match function para: STEP, Catalog, pickle file name)
## adjust stepReportExtract[0:50] to different sizes e.g. 100, 500, 1000, all rows
findFuzzyMatch(stepReportExtract[0:50],catalogBusDec, "FuzzyMatchBusFinal.p")

# load fuzzy match result from pickle file
FuzzyMatchBusFinal = pickle.load(open('FuzzyMatchBusFinal.p','rb'))
FuzzyMatchBusFinal



start new batch!
('aindex: ', 0)
('aindex: ', 1)
('aindex: ', 2)
('aindex: ', 3)
('aindex: ', 4)
('aindex: ', 5)
('aindex: ', 6)
('aindex: ', 7)
('aindex: ', 8)
('aindex: ', 9)
('aindex: ', 10)
('aindex: ', 11)
('aindex: ', 12)
('aindex: ', 13)
('aindex: ', 14)
('aindex: ', 15)
('aindex: ', 16)
('aindex: ', 18)
('aindex: ', 19)
('aindex: ', 20)
('aindex: ', 21)
('aindex: ', 23)
('aindex: ', 24)
('aindex: ', 25)
('aindex: ', 26)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 179025 entries, 0 to 25574
Data columns (total 7 columns):
StepIndex          179025 non-null float64
STEP Course#       179025 non-null object
STEP Title         179025 non-null object
CatalogIndex       179025 non-null float64
Catalog Course#    179025 non-null object
Catalog Title      179025 non-null object
Fuzzy Score        179025 non-null float64
dtypes: float64(3), object(4)
memory usage: 10.9+ MB
start new batch!
('aindex: ', 27)
('aindex: ', 28)
('aindex: ', 29)
('aindex: ', 30)
('aindex: ', 31)
('ainde

In [7]:
fuzz.partial_ratio("Black-Box Software Testing Techniques", "White-Box Software Testing Techniques")

86

In [8]:
fuzz.partial_ratio("TestPrep 1Z0-808 Java SE 8 Programmer I", "TestPrep 1Z0-809 Java SE 8 Programmer II")

97

In [5]:
pd.options.display.max_rows = 1000
FuzzyMatchBusFinal.sort(['Fuzzy Score'], ascending=False).head(1000)

  from ipykernel import kernelapp as app


Unnamed: 0,StepIndex,STEP Course#,STEP Title,CatalogIndex,Catalog Course#,Catalog Title,Fuzzy Score
2679,2.0,pd_05_a02_bs_enus,Managing from Within: Self-empowerment,633.0,pd_05_a02_bs_enus,Managing from Within: Self-empowerment,100.0
17939,44.0,comm_26_a02_bs_enus,Communicating Across Cultures,548.0,comm_26_a02_bs_enus,Communicating Across Cultures,100.0
16891,43.0,comm_18_a01_bs_enus,Essential Skills for Professional Telephone Calls,523.0,comm_18_a01_bs_enus,Essential Skills for Professional Telephone Calls,100.0
14963,40.0,pd_09_a03_bs_enus,Generating Creative and Innovative Ideas: Veri...,641.0,pd_09_a03_bs_enus,Generating Creative and Innovative Ideas: Veri...,100.0
11253,37.0,_pc_bi_ssbi001,Are You Listening to Your Customers?,0.0,_pc_bi_ssbi001,Are You Listening to Your Customers?,100.0
11189,10.0,cust_07_a02_bs_enus,Communication Skills,959.0,cust_07_a02_bs_enus,Communication Skills,100.0
13124,12.0,_pc_ch_lach023,Building and Leading Teams,848.0,_pc_ch_lach023,Building and Leading Teams,100.0
14963,40.0,pd_09_a03_bs_enus,Generating Creative and Innovative Ideas: Veri...,641.0,pd_09_a03_bs_enus,Generating Creative and Innovative Ideas: Veri...,100.0
12097,11.0,_pc_bi_lsbi010,Inspiring Your Team,844.0,_pc_bi_lsbi010,Inspiring Your Team,100.0
3897,3.0,mntpmp5ed,Mentoring Project Management Professional (PMP...,828.0,mntpmp5ed,Mentoring Project Management Professional (PMP...,100.0
