In [1]:
import nltk
from nltk import FreqDist
import string
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [2]:
def get_stream_details():
    print("Reading the stream details...")
    complete_stream_details_df = pd.read_csv("H:\\TeamStreamz_IW\\code\\data\\card_module_details_content_extracted.csv", encoding="ISO-8859-1")
    if complete_stream_details_df is not None:
        complete_stream_details_dict = {}
        _stream_id_stream_title_dict = {}
        for _, row in complete_stream_details_df.iterrows():
            
            stream_id = str(row["DECKID"])
            stream_title = str(row["DECKNAME"])
            row_content = str(row["HTML_CONTENT"])

            # TODO: add the card title and the module name to the content on which the tags can be generated
            card_title =str(row["CARDTITLE"])
            module_name = str(row["MODULENAME"])
            
            if row_content and "nan" not in row_content:
                # if the stream ID already exists in the dictionary
                if complete_stream_details_dict.get(stream_id):
                    existing_content = complete_stream_details_dict[stream_id]
                    new_content = existing_content + "\n" + row_content.strip()
                    complete_stream_details_dict[stream_id] = new_content
                else:
                    complete_stream_details_dict[stream_id] = row_content.strip()
                    _stream_id_stream_title_dict[stream_id] = stream_title
        
        return complete_stream_details_dict

In [3]:
stream_details_dict = get_stream_details()
stream_details_dict

Reading the stream details...


{'1089': 'Welcome Team! We\x92re really glad you are part of this journey!  In this stream you will find out more about the program and its fantastic benefits.   But first, scroll to the next card to complete a small survey, to help us guide your ...\nThis tool is used to capture the brand survey information.       "On a scale of 1-10, how likely are you to recommend a Castrol brand to a customer?"         You cannot edit this c...\n\n\n{"title": "Brand Survey", "primaryField": "field1", "page1": {"header": "{CARD_DESCRIPTION}", "field1": {"type": "Text", "name": "NPS Score", "sequence": 1}, "sequence": 1}, "maxSubmissions": 1}\nThis 123 App is your portal to stay  connected and up-to-date  with Castrol brands and promos.   You will be refreshed with the right information, to enable the best sales conversations.   By interacting wit...\nUp-to-date   knowledge on Castrol Brand 123 messaging\nInteractive quizzes to let you   track your progress   and growth\nBadges and Competitions that 

### Pre-process the file using text pre-processing steps

In [4]:
# convert case, remove punctuations
# stem the words
stop = stopwords.words('english')
snowball = nltk.SnowballStemmer('english')

def preprocess(toks):
    toks = [ t.lower() for t in toks if t not in string.punctuation ]
    toks = [t for t in toks if t not in stop ]
    toks = [ snowball.stem(t) for t in toks ]
#   toks = [ wnl.lemmatize(t) for t in toks ]
    toks_clean = [ t for t in toks if len(t) >= 3 ]
    return toks_clean

# list of cleaned tokens for the documents
all_streams_cleaned_tokens = []

stream_ids = list(stream_details_dict)

for k in stream_details_dict:
    stream_content = stream_details_dict[k]
    stream_content_tokens = nltk.word_tokenize(stream_content)
    stream_cleaned_tokens = preprocess(stream_content_tokens)
    all_streams_cleaned_tokens.append(stream_cleaned_tokens)

### Term frequency for the terms

In [5]:
len(all_streams_cleaned_tokens)

113

In [6]:
# check the frequencies
# perform TF-IDF to get the scores for words for each document
# generate the tags based on the TF-IDF scores
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

all_streams_cleaned_text = [ ' '.join(f) for f in all_streams_cleaned_tokens ]
all_streams_cleaned_text[0]

'txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol launch strongest ever oil use titanium fluid strengthth set-uptoday\x92 engin effici power led doubl pressur engin oil stronger.th reason buycastrol ... kirnvrufj3i take world-class driver race clone join neuroscientist dr. jack lewi talk need achiev maximum perform track.se next market visit ahomxmbs_no castrol edg titanium trial present clone rival watch clone aston martin vulcan driver darren turner pit ultim rival take titanium strong excit video 0outa7eurbg vp2-cendpt4'

In [7]:
all_streams_count_vectorizer = CountVectorizer(min_df = 2)
all_streams_tf = all_streams_count_vectorizer.fit_transform(all_streams_cleaned_text)

In [8]:
all_streams_tf

<113x965 sparse matrix of type '<class 'numpy.int64'>'
	with 6838 stored elements in Compressed Sparse Row format>

### TFIDF vectorizer for the terms

In [9]:
# And tfidf indexing
all_streams_tfidf_vectorizer = TfidfVectorizer(min_df = 2)
all_streams_tfidf = all_streams_tfidf_vectorizer.fit_transform(all_streams_cleaned_text)
all_streams_tfidf

<113x965 sparse matrix of type '<class 'numpy.float64'>'
	with 6838 stored elements in Compressed Sparse Row format>

In [10]:
len(all_streams_tfidf_vectorizer.vocabulary_)

965

In [11]:
(all_streams_tfidf_vectorizer.vocabulary_)

{'000': 0,
 '000kg': 1,
 '0outa7eurbg': 2,
 '10': 3,
 '100': 4,
 '120': 5,
 '123': 6,
 '16': 7,
 '18': 8,
 '1899': 9,
 '1906': 10,
 '1968': 11,
 '1980s': 12,
 '1st': 13,
 '2002': 14,
 '2005': 15,
 '2012': 16,
 '2017': 17,
 '2018': 18,
 '2safhlj0fgg': 19,
 '30': 20,
 '31st': 21,
 '5hofo': 22,
 '5w': 23,
 '88': 24,
 '9rsws44pdke': 25,
 '_fmbvrsf4pe': 26,
 'abl': 27,
 'abras': 28,
 'absorb': 29,
 'accept': 30,
 'accord': 31,
 'account': 32,
 'acea': 33,
 'achiev': 34,
 'acid': 35,
 'acknowledg': 36,
 'acquir': 37,
 'acquisit': 38,
 'across': 39,
 'action': 40,
 'activ': 41,
 'adapt': 42,
 'add': 43,
 'address': 44,
 'adjust': 45,
 'admin': 46,
 'advanc': 47,
 'advocaci': 48,
 'aft': 49,
 'ag': 50,
 'ahomxmbs_no': 51,
 'allow': 52,
 'almost': 53,
 'along': 54,
 'alreadi': 55,
 'also': 56,
 'alway': 57,
 'ambit': 58,
 'amount': 59,
 'amp': 60,
 'and': 61,
 'anoth': 62,
 'answer': 63,
 'anti': 64,
 'apart': 65,
 'app': 66,
 'appear': 67,
 'applic': 68,
 'approv': 69,
 'arabia': 70,
 'area': 

In [12]:
token_values = {all_streams_tfidf_vectorizer.vocabulary_[token]: token for token in all_streams_tfidf_vectorizer.vocabulary_}


In [13]:
all_streams_tfidf.shape

(113, 965)

In [14]:
all_streams_tfidf_coo = all_streams_tfidf.tocoo()
len(all_streams_tfidf_coo.row)

6838

In [15]:
len(all_streams_tfidf_coo.col)

6838

In [18]:
similarities = cosine_similarity(all_streams_tfidf)

In [20]:
type(similarities)

numpy.ndarray

In [21]:
similarities.shape

(113, 113)

In [22]:
K = 5

In [26]:
cosine_similar_streams = {}
for original_stream_index, cosine_similarities in enumerate(similarities):
    cosine_similar_streams[stream_ids[original_stream_index]] = []
    most_similar_stream_indices = np.argsort(cosine_similarities)[-K:]
    for similar_stream_index in most_similar_stream_indices:
        cosine_similar_streams[stream_ids[original_stream_index]].append(stream_ids[similar_stream_index])
 
cosine_similar_streams    

{'1089': ['2034', '2035', '1128', '2010', '1089'],
 '1090': ['316', '202', '2281', '2436', '1090'],
 '1095': ['523', '199', '233', '2408', '1095'],
 '1128': ['1089', '1644', '2512', '2401', '1128'],
 '1267': ['2033', '2376', '1661', '201', '1267'],
 '1347': ['202', '1499', '1512', '1655', '1347'],
 '1498': ['2405', '2380', '199', '2265', '1498'],
 '1499': ['1659', '2374', '2381', '2266', '1499'],
 '1512': ['2062', '1662', '2523', '1499', '1512'],
 '163': ['507', '1498', '199', '419', '163'],
 '1644': ['1128', '2062', '2401', '1644', '2512'],
 '1655': ['316', '202', '1512', '1347', '1655'],
 '1658': ['2380', '2265', '2405', '2373', '1658'],
 '1659': ['2031', '2266', '2381', '2374', '1659'],
 '1660': ['2375', '2407', '2267', '2382', '1660'],
 '1661': ['2033', '2376', '1267', '201', '1661'],
 '1662': ['622', '1512', '624', '2062', '1662'],
 '1670': ['2038', '2382', '2032', '2267', '1670'],
 '1857': ['232', '475', '316', '202', '1857'],
 '199': ['2380', '2405', '2265', '1498', '199'],
 '20

In [27]:
print(stream_details_dict['1089'])
print("Stream 2034")
print(stream_details_dict['2034'])
print("Stream 2035")
print(stream_details_dict['2035'])

Welcome Team! Were really glad you are part of this journey!  In this stream you will find out more about the program and its fantastic benefits.   But first, scroll to the next card to complete a small survey, to help us guide your ...
This tool is used to capture the brand survey information.       "On a scale of 1-10, how likely are you to recommend a Castrol brand to a customer?"         You cannot edit this c...


{"title": "Brand Survey", "primaryField": "field1", "page1": {"header": "{CARD_DESCRIPTION}", "field1": {"type": "Text", "name": "NPS Score", "sequence": 1}, "sequence": 1}, "maxSubmissions": 1}
This 123 App is your portal to stay  connected and up-to-date  with Castrol brands and promos.   You will be refreshed with the right information, to enable the best sales conversations.   By interacting wit...
Up-to-date   knowledge on Castrol Brand 123 messaging
Interactive quizzes to let you   track your progress   and growth
Badges and Competitions that entitle you to   grea