In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from nltk.tokenize import RegexpTokenizer

In [2]:
# load the combined dataset from the local data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer='./data/combined.csv')

In [3]:
# show the (rows, columns) dimensions of the loaded dataset
df.shape

(39922, 27)

In [8]:
# sort rows by postedDate in descending order and renumber the index 0..n-1;
# reassigning instead of using inplace=True avoids mutating the frame in place
# and keeps the statement chainable
df = df.sort_values(by='postedDate', ascending=False, ignore_index=True)

In [15]:
# # document
# document = df.loc[:, ['title']]
# document.head()

Unnamed: 0,title
0,Laundry Services for Medical Isolation Gowns
1,"QUARTERLY SUBSISTENCE REQUIREMENTS, 3RD QUARTE..."
2,The U.S. Department of Agriculture (USDA) seek...
3,Y--Bible Creek MSE Wall
4,Z--BLM-CO-RMD - PONCHA VILLA Headcut Stabiliza...


#### Content Based Recommender System

**using Tf-Idf Vectorizer and cosine similarity**

[source](https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831)

In [35]:
# define function to tokenize a column

def tokenizer_function(column):
    r"""
    Tokenize and lower-case every entry of a text column.

    Each entry is split with the pattern ``[a-zA-Z]\w+``: an ASCII letter
    followed by one or more word characters. Tokens are therefore at least
    two characters long and always start with a letter, so single letters,
    leading digits and punctuation are dropped
    (e.g. ``"3RD Quarter"`` -> ``"rd quarter"``).

    Parameters
    ----------
    column : iterable of str
        Text entries, e.g. a DataFrame column. Entries that are not strings
        (such as NaN or None for a missing value) are treated as empty text.

    Returns
    -------
    pandas.Series
        One space-joined string of lower-cased tokens per input entry, in
        input order, with a fresh default integer index.
    """
    # raw string so that \w reaches the regex engine instead of being parsed
    # as an (invalid) Python string escape
    tokenizer = RegexpTokenizer(r'[a-zA-Z]\w+')

    texts = []
    for text in column:
        # missing values are not strings and cannot be tokenized; emit an
        # empty document so output positions stay aligned with the input rows
        if not isinstance(text, str):
            texts.append('')
            continue

        tokens = tokenizer.tokenize(text)
        texts.append(' '.join(token.lower() for token in tokens))
    return pd.Series(texts)

In [36]:
# run the tokenizer over every entry of the 'title' column
tokenized_corpus = tokenizer_function(column=df['title'])

In [37]:
# preview the first rows of the tokenized titles
tokenized_corpus.head()

0         laundry services for medical isolation gowns
1     quarterly subsistence requirements rd quarter fy
2    the department of agriculture usda seeks to le...
3                                 bible creek mse wall
4        blm co rmd poncha villa headcut stabilization
dtype: object

In [43]:
# configure the TF-IDF vectorizer: English stop words removed, unigrams
# through trigrams, and at most 500 features kept
tvec = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_features=500,
    # strip_accents='ascii',
)

# learn the vocabulary from the tokenized titles and vectorize them in one step
vectorized_matrix = tvec.fit_transform(tokenized_corpus)

# source: https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

In [44]:
# list the learned vocabulary terms; get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so use get_feature_names_out() when the
# installed version provides it and fall back to the old method otherwise
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out().tolist()
else:
    feature_names = tvec.get_feature_names()
feature_names

['access',
 'actuator',
 'adapter',
 'administration',
 'advanced',
 'afb',
 'agency',
 'agreement',
 'ai',
 'air',
 'aircraft',
 'amend',
 'amendment',
 'analysis',
 'analyzer',
 'announcement',
 'annual',
 'antenna',
 'area',
 'army',
 'assemb',
 'assemb repair',
 'assemb repair modification',
 'assembly',
 'assembly metal',
 'assembly nonme',
 'assembly spec',
 'assy',
 'ave',
 'avenue',
 'award',
 'award sole',
 'award sole source',
 'ball',
 'base',
 'based',
 'battery',
 'bearing',
 'bldg',
 'bldg rm',
 'bldg rm fl',
 'bldg room',
 'blvd',
 'board',
 'body',
 'bolt',
 'box',
 'bpa',
 'bracket',
 'brake',
 'brand',
 'breaker',
 'bridge',
 'broad',
 'build',
 'building',
 'business',
 'business line',
 'ca',
 'cable',
 'cable assembly',
 'cable assembly spec',
 'camp',
 'cap',
 'card',
 'card assemb',
 'card assemb repair',
 'care',
 'cartridge',
 'cci',
 'cci bldg',
 'cci bldg rm',
 'cell',
 'center',
 'check',
 'circuit',
 'circuit bldg',
 'circuit breaker',
 'circuit card',
 'ci

In [40]:
# pairwise dot products of the TF-IDF title vectors, every row against every row
linear_kernel(vectorized_matrix, vectorized_matrix)

array([[1.        , 0.        , 0.        , ..., 0.00952368, 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.14159357,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00952368, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.14159357, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [27]:
# adapted from this article (https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831)
# to the objects built in this notebook (vectorized_matrix, df)

# pairwise similarity between every pair of TF-IDF title vectors
# NOTE(review): this is a dense n x n array; with the ~40k rows reported by df.shape
# it needs on the order of 10 GB of RAM if stored as float64 — confirm it fits
cosine_similarities = linear_kernel(vectorized_matrix, vectorized_matrix)

# assumes df has an 'id' column identifying each row — TODO confirm
# to_numpy() gives positional access, matching the row positions of the matrix
item_ids = df['id'].to_numpy()

results = {}
for pos, item_id in enumerate(item_ids):
    # positions of the 99 highest-scoring rows, best match first
    similar_indices = cosine_similarities[pos].argsort()[:-100:-1]

    # exclude the row itself by position rather than assuming it ranks first:
    # duplicate titles (ties at the top score) or all-zero vectors can put a
    # different row in first place
    similar_items = [
        (cosine_similarities[pos][i], item_ids[i])
        for i in similar_indices
        if i != pos
    ]

    # keep at most 98 neighbours, the same count the slice-off-the-first approach yields
    results[item_id] = similar_items[:98]

<function Pattern.findall(string, pos=0, endpos=9223372036854775807)>

To do list:

1. further clean the texts
2. build the cosine similarities matrix
3. include other columns into the cosine similarities matrix
4. write the function that returns the similar titles