In [9]:
pip install redis


The following command must be run outside of the IPython shell:

    $ pip install redis

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more informations on how to install packages:

    https://docs.python.org/3/installing/


In [8]:
import pandas as pd
import time
import redis
from flask import current_app
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel




ModuleNotFoundError: No module named 'redis'

In [None]:
def info(msg):
    current_app.logger.info(msg)


class ContentEngine(object):

    SIMKEY = 'p:smlr:%s'

    def __init__(self):
        self._r = redis.StrictRedis.from_url(current_app.config['REDIS_URL'])

    def train(self, data_source):
        start = time.time()
        ds = pd.read_csv(data_source)
        info("Training data ingested in %s seconds." % (time.time() - start))

        # Flush the stale training data from redis
        self._r.flushdb()

        start = time.time()
        self._train(ds)
        info("Engine trained in %s seconds." % (time.time() - start))

    def _train(self, ds):
        """
        Train the engine.

        Create a TF-IDF matrix of unigrams, bigrams, and trigrams
        for each product. The 'stop_words' param tells the TF-IDF
        module to ignore common english words like 'the', etc.

        Then we compute similarity between all products using
        SciKit Leanr's linear_kernel (which in this case is
        equivalent to cosine similarity).

        Iterate through each item's similar items and store the
        100 most-similar. Stops at 100 because well...  how many
        similar products do you really need to show?

        Similarities and their scores are stored in redis as a
        Sorted Set, with one set for each item.

        :param ds: A pandas dataset containing two fields: description & id
        :return: Nothin!
        """

        tf = TfidfVectorizer(analyzer='word',
                             ngram_range=(1, 3),
                             min_df=0,
                             stop_words='english')
        tfidf_matrix = tf.fit_transform(ds['description'])

        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

        for idx, row in ds.iterrows():
            similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
            similar_items = [(cosine_similarities[idx][i], ds['id'][i])
                             for i in similar_indices]

            # First item is the item itself, so remove it.
            # This 'sum' is turns a list of tuples into a single tuple:
            # [(1,2), (3,4)] -> (1,2,3,4)
            flattened = sum(similar_items[1:], ())
            self._r.zadd(self.SIMKEY % row['id'], *flattened)

    def predict(self, item_id, num):
        """
        Couldn't be simpler! Just retrieves the similar items and
        their 'score' from redis.

        :param item_id: string
        :param num: number of similar items to return
        :return: A list of lists like: [["19", 0.2203],
        ["494", 0.1693], ...]. The first item in each sub-list is
        the item ID and the second is the similarity score. Sorted
        by similarity score, descending.
        """

        return self._r.zrange(self.SIMKEY % item_id,
                              0,
                              num-1,
                              withscores=True,
                              desc=True)

content_engine = ContentEngine()

In [17]:
#include libraries
import csv # to open/close/append CSV
import os # to check if file exists
import nltk #natural language toolkit
import pandas as pd
import numpy as np
from datetime import datetime
import re
from collections import Counter, defaultdict 
from glob import glob
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')


file_exists = os.path.isfile('/Users/boris/Downloads/dice_com-job_us_sample.csv')



# loop to check if file exists
if file_exists == 0:
    print('Error: dice_com-job_us_sample.csv does not exist!')
elif file_exists == 1:
    try:
        CSV_file = pd.read_csv('/Users/boris/Downloads/dice_com-job_us_sample.csv', sep=',', header='infer')
        print('Exists, and file is ok.')
    except Exception as e:
        Date_Advertised = None
        print('Exists, but failed to open.')

Exists, and file is ok.


In [22]:
import glob
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]

corpus = []
for file in glob.glob("/Users/boris/Downloads/dice_com-job_us_sample.csv"):
    with open(file, "r") as paper:
        corpus.append((file, paper.read()))

tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')
tfidf_matrix =  tf.fit_transform([content for file, content in corpus])

with open("/Users/boris/Downloads/dice_com-job_us_sample.csv", "w") as similarities_file:
    writer = csv.writer(similarities_file, delimiter = ",")

    for me_index, item in enumerate(corpus):
        similar_documents =  [(corpus[index], score) for index, score in find_similar(tfidf_matrix, me_index)]
        me = corpus[me_index]

        document_id = me[0].split("/")[1].split(".")[0]

        for ((raw_similar_document_id, title), score) in similar_documents:
            similar_document_id = raw_similar_document_id.split("/")[1].split(".")[0]
            writer.writerow([document_id, me[1], similar_document_id, title, score])

In [26]:
corpus[0.1]

TypeError: list indices must be integers or slices, not float

In [28]:
tfidf_matrix.shape

(1, 4649006)