In [1]:
import nltk
from nltk import FreqDist
import string
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
%matplotlib inline

In [2]:
def get_stream_details():
    print("Reading the stream details...")
    complete_stream_details_df = pd.read_csv("H:\\TeamStreamz_IW\\code\\data\\card_module_details_content_extracted.csv", encoding="ISO-8859-1")
    if complete_stream_details_df is not None:
        complete_stream_details_dict = {}
        _stream_id_stream_title_dict = {}
        for _, row in complete_stream_details_df.iterrows():
            
            stream_id = str(row["DECKID"])
            stream_title = str(row["DECKNAME"])
            row_content = str(row["HTML_CONTENT"])

            # TODO: add the card title and the module name to the content on which the tags can be generated
            card_title =str(row["CARDTITLE"])
            module_name = str(row["MODULENAME"])
            
            if row_content and "nan" not in row_content:
                # if the stream ID already exists in the dictionary
                if complete_stream_details_dict.get(stream_id):
                    existing_content = complete_stream_details_dict[stream_id]
                    new_content = existing_content + "\n" + row_content.strip()
                    complete_stream_details_dict[stream_id] = new_content
                else:
                    complete_stream_details_dict[stream_id] = row_content.strip()
                    _stream_id_stream_title_dict[stream_id] = stream_title
        
        return complete_stream_details_dict

In [3]:
stream_details_dict = get_stream_details()
stream_details_dict

Reading the stream details...


{'1089': 'Welcome Team! Were really glad you are part of this journey!     In this stream you will find out more about the program and its fantastic benefits.     But first, scroll to the next card to complete a small survey, to help us guide your ...\nThis tool is used to capture the brand survey information.       "On a scale of 1-10, how likely are you to recommend a Castrol brand to a customer?"       You cannot edit this c...\n{"title": "Brand Survey", "primaryField": "field1", "page1": {"header": "{CARD_DESCRIPTION}", "field1": {"type": "Text", "name": "NPS Score", "sequence": 1}, "sequence": 1}, "maxSubmissions": 1}\nUp-to-date       knowledge on Castrol Brand 123 messaging\nInteractive quizzes to let you        track your progress       and growth\nBadges and Competitions that entitle you to        great gifts and rewards!\nThe Castrol Zoom App provides interactive information, competitions and rewards on all things Castrol!     Enjoy the journey!',
 '1095': 'In this Stream you

In [4]:
# what is the content length
stream_details_df = pd.DataFrame(list(stream_details_dict.items()), columns = ["StreamID", "Content"])
stream_details_df["ContentLength"] = stream_details_df["Content"].str.len()
stream_details_df.head()

Unnamed: 0,StreamID,Content,ContentLength
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,800
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,812
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po...",180
3,199,Castrol EDGE is Castrols flagship power b...,1505
4,201,"Charles Cheers Wakefield, Castrols founder, wa...",1675


### Pre-process the file using text pre-processing steps

In [5]:
# convert case, remove punctuations
# stem the words
stop = stopwords.words('english')
snowball = nltk.SnowballStemmer('english')

def preprocess(toks):
    toks = [ t.lower() for t in toks if t not in string.punctuation ]
    toks = [t for t in toks if t not in stop ]
    toks = [ snowball.stem(t) for t in toks ]
#   toks = [ wnl.lemmatize(t) for t in toks ]
    toks_clean = [ t for t in toks if len(t) >= 3 ]
    return toks_clean

all_streams_cleaned_tokens = []
stream_ids = list(stream_details_df["StreamID"])
for _, row in stream_details_df.iterrows():
    stream_content = row["Content"]
    stream_content_tokens = nltk.word_tokenize(stream_content)
    stream_cleaned_tokens = preprocess(stream_content_tokens)
    all_streams_cleaned_tokens.append(stream_cleaned_tokens)


### Term frequency for the terms

In [6]:
len(all_streams_cleaned_tokens)

112

In [7]:
# check the frequencies
# perform TF-IDF to get the scores for words for each document
# generate the tags based on the TF-IDF scores
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

all_streams_cleaned_text = [ ' '.join(f) for f in all_streams_cleaned_tokens ]
all_streams_cleaned_text[0]

'txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol launch strongest ever oil use titanium fluid strengthth set-uptoday engin effici power led doubl pressur engin oil stronger.th reason buycastrol ... kirnvrufj3i take world-class driver race clone join neuroscientist dr. jack lewi talk need achiev maximum perform track.se next market visit ahomxmbs_no castrol edg titanium trial present clone rival watch clone aston martin vulcan driver darren turner pit ultim rival take titanium strong excit video 0outa7eurbg vp2-cendpt4'

In [8]:
all_streams_count_vectorizer = CountVectorizer(min_df = 2)
all_streams_tf = all_streams_count_vectorizer.fit_transform(all_streams_cleaned_text)

In [9]:
all_streams_tf

<112x827 sparse matrix of type '<class 'numpy.int64'>'
	with 5107 stored elements in Compressed Sparse Row format>

### TFIDF vectorizer for the terms

In [28]:
# And tfidf indexing
all_streams_tfidf_vectorizer = TfidfVectorizer(min_df = 2)
all_streams_tfidf = all_streams_tfidf_vectorizer.fit_transform(all_streams_cleaned_text)
all_streams_tfidf

<112x827 sparse matrix of type '<class 'numpy.float64'>'
	with 5107 stored elements in Compressed Sparse Row format>

In [29]:
len(all_streams_tfidf_vectorizer.vocabulary_)

827

In [30]:
(all_streams_tfidf_vectorizer.vocabulary_)

{'000': 0,
 '0outa7eurbg': 1,
 '10': 2,
 '100': 3,
 '120': 4,
 '123': 5,
 '14': 6,
 '18': 7,
 '1899': 8,
 '1906': 9,
 '1980s': 10,
 '1st': 11,
 '2002': 12,
 '2005': 13,
 '2012': 14,
 '2018': 15,
 '2safhlj0fgg': 16,
 '30': 17,
 '31st': 18,
 '5hofo': 19,
 '5w': 20,
 '9rsws44pdke': 21,
 '_fmbvrsf4pe': 22,
 'abl': 23,
 'abras': 24,
 'absorb': 25,
 'accept': 26,
 'accord': 27,
 'account': 28,
 'acea': 29,
 'achiev': 30,
 'acid': 31,
 'acquir': 32,
 'acquisit': 33,
 'across': 34,
 'action': 35,
 'activ': 36,
 'address': 37,
 'advanc': 38,
 'advocaci': 39,
 'aft': 40,
 'ahomxmbs_no': 41,
 'allow': 42,
 'almost': 43,
 'along': 44,
 'alreadi': 45,
 'also': 46,
 'alway': 47,
 'ambit': 48,
 'amount': 49,
 'anoth': 50,
 'answer': 51,
 'anti': 52,
 'apart': 53,
 'app': 54,
 'appear': 55,
 'applic': 56,
 'approv': 57,
 'arabia': 58,
 'area': 59,
 'around': 60,
 'ask': 61,
 'assess': 62,
 'associ': 63,
 'aston': 64,
 'attach': 65,
 'audi': 66,
 'autom': 67,
 'automat': 68,
 'automobil': 69,
 'automot

In [35]:
token_values = {all_streams_tfidf_vectorizer.vocabulary_[token]: token for token in all_streams_tfidf_vectorizer.vocabulary_}


In [32]:
all_streams_tfidf.shape

(112, 827)

In [33]:
all_streams_tfidf_coo = all_streams_tfidf.tocoo()
len(all_streams_tfidf_coo.row)

5107

In [15]:
len(all_streams_tfidf_coo.col)

5107

In [16]:
K = 5

In [24]:
# create a dictionary indexed by the stream (row) number
token_tfidf_dict = {}
for idx, stream_index in enumerate(all_streams_tfidf_coo.row):
    stream_id = stream_ids[stream_index]
    #print("Stream index: {0} and stream ID: {1}".format(stream_index, stream_id))
    if token_tfidf_dict.get(stream_id):
        token_tfidf_dict[stream_id].append((all_streams_tfidf_coo.col[idx], all_streams_tfidf_coo.data[idx]))
    else:
        token_tfidf_dict[stream_id] = [(all_streams_tfidf_coo.col[idx], all_streams_tfidf_coo.data[idx])]
        
token_tfidf_dict

{'1089': [(113, 0.14854043365490952),
  (778, 0.08738633481930926),
  (747, 0.09987555198803197),
  (495, 0.1087367925893812),
  (716, 0.0951273719879041),
  (414, 0.07605960276403027),
  (153, 0.1087367925893812),
  (93, 0.24811981856563148),
  (739, 0.11647782975797603),
  (684, 0.05591722922682614),
  (372, 0.11564625551822948),
  (388, 0.1820286227733655),
  (527, 0.06437048771079462),
  (287, 0.08414099919179041),
  (344, 0.08263815481918137),
  (181, 0.09101431138668276),
  (487, 0.07489711765058654),
  (152, 0.2109829139237247),
  (286, 0.06690360202293982),
  (772, 0.08914693968811689),
  (580, 0.07489711765058654),
  (109, 0.08572094548086655),
  (738, 0.09101431138668276),
  (565, 0.08414099919179041),
  (280, 0.16828199838358082),
  (525, 0.08414099919179041),
  (766, 0.07489711765058654),
  (649, 0.1624104720294627),
  (723, 0.08414099919179041),
  (340, 0.09101431138668276),
  (111, 0.0951273719879041),
  (743, 0.10549145696186235),
  (107, 0.13371522692682664),
  (705, 0.

In [36]:
for k in token_tfidf_dict:
    num_tokens = min(K, len(token_tfidf_dict[k]))
    top_k_token_ids = sorted(token_tfidf_dict[k], key=lambda x: x[1], reverse=True)[: num_tokens]
    top_k_tokens = [token_values[token_index] for token_index, tfidf_score in top_k_token_ids]
    token_tfidf_dict[k]= top_k_tokens
    

In [37]:
token_tfidf_dict

{'1089': ['survey', 'brand', 'interact', 'competit', 'reward'],
 '1095': ['transmiss', 'automat', 'ensur', 'shift', 'gear'],
 '1128': ['survey', 'brand', 'field1', 'sequenc', 'captur'],
 '1267': ['person', 'castrol', 'wakefield', 'lubric', 'nasa'],
 '1347': ['wish', 'love', 'merri', 'christma', 'happi'],
 '1498': ['edg', 'titanium', 'strength', 'fst', 'castrol'],
 '1499': ['like', 'magnatec', 'app', 'forward', 'deserv'],
 '1512': ['app', 'like', 'forward', 'deserv', 'expertis'],
 '163': ['clone', 'rival', 'titanium', 'driver', 'take'],
 '1644': ['survey', 'field1', 'text', 'name', 'type'],
 '1655': ['wish', 'love', 'merri', 'christma', 'happi'],
 '1658': ['edg', 'fst', 'castrol', 'titanium', 'strength'],
 '1659': ['sgr2uevmfgm', 'extra', 'special', 'magnet', 'intellig'],
 '1660': ['sludg', 'gtx', 'action', 'formula', 'doubl'],
 '1661': ['wakefield', 'castrol', 'person', 'curios', 'lubric'],
 '1662': ['prize', 'collect', 'chanc', 'win', 'col'],
 '1670': ['sludg', 'gtx', 'action', 'oilwa