In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from nltk.tokenize import RegexpTokenizer

In [2]:
# read in combined dataset
df = pd.read_csv('./data/combined.csv')

In [3]:
df.shape

(43454, 27)

In [4]:
df.sort_values(by='postedDate', ascending=False, ignore_index=True, inplace=True)

In [15]:
# # document
# document = df.loc[:, ['title']]
# document.head()

Unnamed: 0,title
0,Laundry Services for Medical Isolation Gowns
1,"QUARTERLY SUBSISTENCE REQUIREMENTS, 3RD QUARTE..."
2,The U.S. Department of Agriculture (USDA) seek...
3,Y--Bible Creek MSE Wall
4,Z--BLM-CO-RMD - PONCHA VILLA Headcut Stabiliza...


#### Content Based Recommender System

**using Tf-Idf Vectorizer and cosine similarity**

[source](https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831)

In [5]:
# define function to tokenize a column

def tokenizer_function(column):
    """
    Takes in a text column
        tokenizes the text in each row
        using pattern [[a-zA-Z]\w+]
        which matches every lowercase and upperase character between a-z that are word characters
    Returns list of strings
    """
    
    # instantiate empty list of tokenized text
    texts = []
    
    # instantiate tokenizer
    tokenizer = RegexpTokenizer('[a-zA-Z]\w+')
    
    # create for loop to tokenize each row and add the list of tokens to texts
    for text in column:
        tokens = tokenizer.tokenize(text)
        
        # transform tokens into lower case strings
        tokens = [token.lower() for token in tokens]
        texts.append(' '.join(tokens))
    return pd.Series(texts)

In [6]:
# tokenize title column

tokenized_corpus = tokenizer_function(df['title'])

In [7]:
tokenized_corpus.head()

0                   facility sanitation hvac cleansing
1                                               spe4a6
2    v112 phoenix va carl hayden medical center shu...
3                dairy solicitation new england region
4                       devil elbow dry storage garage
dtype: object

In [13]:
# instantiate Tf-Idf Vectorizer
tvec = TfidfVectorizer(
#     strip_accents='ascii',
    ngram_range=(1, 3), 
    stop_words='english'
#     max_features=500
)

# fit_transform title column
vectorized_matrix = tvec.fit_transform(tokenized_corpus)

# source: https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

In [14]:
tvec.get_feature_names()

['a0',
 'a0 renovate',
 'a0 renovate cwt',
 'a00',
 'a00001',
 'a00001 micro',
 'a00001 micro motion',
 'a00002',
 'a00002 micro',
 'a00002 micro motion',
 'a00009',
 'a00018',
 'a00026',
 'a0003',
 'a00036',
 'a00036 cs',
 'a00036 cs alex',
 'a00037',
 'a00037 veterans',
 'a00037 veterans healthcare',
 'a00039',
 'a00068',
 'a00070',
 'a0008147',
 'a0008147 used',
 'a0008147 used mix',
 'a0008153',
 'a0008153 demil',
 'a0008153 demil mut',
 'a0008160',
 'a0008160 scrap',
 'a0008160 scrap residue',
 'a00087',
 'a00089',
 'a00099',
 'a003',
 'a013',
 'a013 powder',
 'a013 powder road',
 'a01397',
 'a03027',
 'a03028',
 'a04',
 'a04 felix',
 'a04 felix st',
 'a1',
 'a1 leased',
 'a1 leased tail',
 'a1 rm',
 'a1 rm utility',
 'a100',
 'a100 rm',
 'a100 rm n34',
 'a100mb',
 'a100mb ethernet',
 'a100mb ethernet service',
 'a120',
 'a120 digit',
 'a120 digit reference',
 'a135',
 'a135 sports',
 'a135 sports arena',
 'a13627',
 'a13627 l3',
 'a13627 l3 nsn',
 'a141',
 'a141 fl',
 'a141 fl in

In [20]:
# get the vocabulary with their tf-idf indeces
vocab_dict = tvec.vocabulary_

In [31]:
# reverse the vocab_dict - to have indeces as keys, and words as values
inverse_vocab_dict = {value: key for (key, value) in vocab_dict.items()}

In [33]:
# define function that gives back sorted dictionary of indeces and vocabulary 
def dictionary_sort(dictionary):
    """ takes dictionary with indeces (integers or floats) as keys
        returns sorted dictionary
        descending order """
    # isolate the keys of the dictionary
    # sort them in descending order
    sorted_key_list = sorted(dictionary.keys(), reverse=True)
    # instantiate empty dictionary to store key: value pairs
    sorted_dictionary = {}
    # iterate through sorted list of keys
    # add key: value pair to sorted dictionary
    for number in sorted_key_list:
        sorted_dictionary[number] = dictionary[number]
    return sorted_dictionary

In [34]:
# dictionary of title vocabulary
# ordered by their importance relative to the corpus of titles
dictionary_sort(inverse_vocab_dict)

{168980: 'zzyzx head mojave',
 168979: 'zzyzx head',
 168978: 'zzyzx',
 168977: 'zx tm wind',
 168976: 'zx tm',
 168975: 'zx',
 168974: 'zurich instruments equal',
 168973: 'zurich instruments',
 168972: 'zurich',
 168971: 'zuni wastewater treatment',
 168970: 'zuni wastewater',
 168969: 'zuni',
 168968: 'zumwalt ddg lltm',
 168967: 'zumwalt ddg',
 168966: 'zumwalt',
 168965: 'zukeran elementary school',
 168964: 'zukeran elementary',
 168963: 'zukeran',
 168962: 'zshield face cloth',
 168961: 'zshield face',
 168960: 'zshield',
 168959: 'zoom webcast training',
 168958: 'zoom webcast',
 168957: 'zoom platinum level',
 168956: 'zoom platinum',
 168955: 'zoom',
 168954: 'zones minneapolis vamc',
 168953: 'zones minneapolis',
 168952: 'zones matoc',
 168951: 'zones hubzone small',
 168950: 'zones hubzone',
 168949: 'zones',
 168948: 'zone weg subscale',
 168947: 'zone weg hq',
 168946: 'zone weg',
 168945: 'zone vessel traffic',
 168944: 'zone vessel',
 168943: 'zone traffic calming',
 1

In [32]:
# cosine similarity matrix
# add some other columns into a bag of words
# further clean the titles

# figure out how to call up titles that were similar to a search term
# try defining max_df instead of removing english stop words
# min_df as well - if something occures just once 
# maybe I need 2-grams to 4-grams (2, 4)

[168980,
 168979,
 168978,
 168977,
 168976,
 168975,
 168974,
 168973,
 168972,
 168971,
 168970,
 168969,
 168968,
 168967,
 168966,
 168965,
 168964,
 168963,
 168962,
 168961,
 168960,
 168959,
 168958,
 168957,
 168956,
 168955,
 168954,
 168953,
 168952,
 168951,
 168950,
 168949,
 168948,
 168947,
 168946,
 168945,
 168944,
 168943,
 168942,
 168941,
 168940,
 168939,
 168938,
 168937,
 168936,
 168935,
 168934,
 168933,
 168932,
 168931,
 168930,
 168929,
 168928,
 168927,
 168926,
 168925,
 168924,
 168923,
 168922,
 168921,
 168920,
 168919,
 168918,
 168917,
 168916,
 168915,
 168914,
 168913,
 168912,
 168911,
 168910,
 168909,
 168908,
 168907,
 168906,
 168905,
 168904,
 168903,
 168902,
 168901,
 168900,
 168899,
 168898,
 168897,
 168896,
 168895,
 168894,
 168893,
 168892,
 168891,
 168890,
 168889,
 168888,
 168887,
 168886,
 168885,
 168884,
 168883,
 168882,
 168881,
 168880,
 168879,
 168878,
 168877,
 168876,
 168875,
 168874,
 168873,
 168872,
 168871,
 168870,
 

To do list:

1. further clean the texts
2. build the cosin similarities matrix
3. include other columns into the cosin similarities matrix
4. right the function that returns the similar titles