In [10]:
import nltk
from nltk import FreqDist, word_tokenize
import string
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline

In [4]:
stream_content_df = pd.read_csv("data/stream_content.csv", header=0, encoding="ISO-8859-1")
stream_content_df.head()

Unnamed: 0,StreamID,Content
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ..."
3,199,Castrol EDGE is Castrol?s flagship power bran...
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,..."


## Pre-process the content to remove stop words, punctutations and lemmatization

In [9]:
def preprocess(tokens):
    
    # TODO: remove random sequences that contain with more than one caps and small or combination of letters and numbers
    
    tokens_nop = [t for t in tokens if t not in string.punctuation]
    tokens_nop = [t.lower() for t in tokens_nop]
    wnl = nltk.WordNetLemmatizer()
    stop = stopwords.words('english')
    tokens_nostop = [t for t in tokens_nop if t not in stop]
    tokens_lem = [wnl.lemmatize(t) for t in tokens_nostop]
    tokens_clean = [t for t in tokens_lem if len(t) >= 3] 
    return tokens_clean

stream_content_df['Content_processed'] = stream_content_df['Content'].map(word_tokenize)
stream_content_df['Content_processed'] = stream_content_df.Content_processed.apply(preprocess)
stream_content_df.head()

Unnamed: 0,StreamID,Content,Content_processed
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,"[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,"[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ...","[wbykuguygwc, team, world-class, driver, power..."
3,199,Castrol EDGE is Castrol?s flagship power bran...,"[castrol, edge, castrol, flagship, power, bran..."
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,...","[charles, cheer, wakefield, castrol, founder, ..."


### Generate the TFIDF vectors for the streams

In [11]:
stream_content_df['Content_processed'] = stream_content_df['Content_processed'].apply(lambda x: " ".join(x))
stream_content_df.head()

Unnamed: 0,StreamID,Content,Content_processed
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ...",wbykuguygwc team world-class driver powered ca...
3,199,Castrol EDGE is Castrol?s flagship power bran...,castrol edge castrol flagship power brand pcos...
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,...",charles cheer wakefield castrol founder entrep...


In [14]:
all_streams_cleaned_text = stream_content_df['Content_processed']
all_streams_tfidf_vectorizer = TfidfVectorizer(min_df = 2)
all_streams_tfidf = all_streams_tfidf_vectorizer.fit_transform(all_streams_cleaned_text)
all_streams_tfidf

<97x928 sparse matrix of type '<class 'numpy.float64'>'
	with 6122 stored elements in Compressed Sparse Row format>

In [16]:
token_values = {all_streams_tfidf_vectorizer.vocabulary_[token]: token for token in all_streams_tfidf_vectorizer.vocabulary_}


In [20]:
similarities = cosine_similarity(all_streams_tfidf)

In [22]:
K = 5

In [30]:
cosine_similar_streams = {}
for original_stream_index, cosine_similarities in enumerate(similarities):
    cosine_similar_streams[stream_ids[original_stream_index]] = []
    most_similar_stream_indices = np.argsort(cosine_similarities)[-K:]
    cosine_similarities.sort()
    print(cosine_similarities[-K:])
    for similar_stream_index in most_similar_stream_indices:
        cosine_similar_streams[stream_ids[original_stream_index]].append(stream_ids[similar_stream_index])
 
cosine_similar_streams    

[0.24403878 0.2441724  0.24718596 1.         1.        ]
[0.24403878 0.2441724  0.24718596 1.         1.        ]
[0.2602543  0.26253067 0.26983345 0.29341018 1.        ]
[0.85942372 0.8602946  0.91537306 0.93176197 1.        ]
[0.46900658 0.54847783 0.82242088 0.99852593 1.        ]
[0.46831523 0.54766934 0.82120857 0.99852593 1.        ]
[0.27231037 0.28097326 1.         1.         1.        ]
[0.27231037 0.28097326 1.         1.         1.        ]
[0.27231037 0.28097326 1.         1.         1.        ]
[0.79329993 0.82480695 0.83892424 0.8407961  1.        ]
[0.82672322 0.83143316 0.89579635 0.90442193 1.        ]
[0.39786174 0.39793801 0.41154459 0.45293136 1.        ]
[0.16964532 0.17018361 0.17482592 0.1755388  1.        ]
[0.3609036  0.37119459 0.37273652 0.46437432 1.        ]
[0.67422076 0.67422076 1.         1.         1.        ]
[0.67422076 0.67422076 1.         1.         1.        ]
[0.67422076 0.67422076 1.         1.         1.        ]
[0.70053247 0.83891738 1.      

{'1089': ['2034', '2035', '1128', '2010', '1089'],
 '1090': ['316', '202', '2281', '2436', '1090'],
 '1095': ['523', '199', '233', '2408', '1095'],
 '1128': ['1089', '1644', '2512', '2401', '1128'],
 '1267': ['2033', '2376', '1661', '201', '1267'],
 '1347': ['202', '1499', '1512', '1655', '1347'],
 '1498': ['2405', '2380', '199', '2265', '1498'],
 '1499': ['1659', '2374', '2381', '2266', '1499'],
 '1512': ['2062', '1662', '2523', '1499', '1512'],
 '163': ['2436', '2434', '2444', '2511', '2597'],
 '1644': ['1128', '2062', '2401', '1644', '2512'],
 '1655': ['316', '202', '1512', '1347', '1655'],
 '1658': ['2380', '2265', '2405', '2373', '1658'],
 '1659': ['2031', '2266', '2381', '2374', '1659'],
 '1660': ['2375', '2407', '2267', '2382', '1660'],
 '1661': ['2033', '2376', '1267', '201', '1661'],
 '1662': ['622', '1512', '624', '2062', '1662'],
 '1670': ['2038', '2382', '2032', '2267', '1670'],
 '1857': ['232', '475', '316', '202', '1857'],
 '199': ['2380', '2405', '2265', '1498', '199'],


In [28]:
print(stream_details_dict['1089'])
print("Stream 2034")
print(stream_details_dict['2034'])
print("Stream 2035")
print(stream_details_dict['2035'])

Welcome Team! Were really glad you are part of this journey!  In this stream you will find out more about the program and its fantastic benefits.   But first, scroll to the next card to complete a small survey, to help us guide your ...
This tool is used to capture the brand survey information.       "On a scale of 1-10, how likely are you to recommend a Castrol brand to a customer?"         You cannot edit this c...


{"title": "Brand Survey", "primaryField": "field1", "page1": {"header": "{CARD_DESCRIPTION}", "field1": {"type": "Text", "name": "NPS Score", "sequence": 1}, "sequence": 1}, "maxSubmissions": 1}
This 123 App is your portal to stay  connected and up-to-date  with Castrol brands and promos.   You will be refreshed with the right information, to enable the best sales conversations.   By interacting wit...
Up-to-date   knowledge on Castrol Brand 123 messaging
Interactive quizzes to let you   track your progress   and growth
Badges and Competitions that entitle you to   grea

In [35]:
cosine_similar_streams = {}
for original_stream_index, cosine_similarities in enumerate(similarities):
    cosine_similar_streams[stream_ids[original_stream_index]] = []
    cosine_similarities_sorted_indices = np.argsort(cosine_similarities)
    cosine_similarities.sort()
   
    cosine_similarities_sorted_indices_reversed = cosine_similarities_sorted_indices[::-1]
    cosine_similarities_reversed = cosine_similarities[::-1]
    
    ctr = 0
    for itr, cosine_similarity_value in enumerate(cosine_similarities_reversed):
        if cosine_similarity_value < 1:
            ctr += 1
            print(cosine_similarity_value)
            cosine_similar_streams[stream_ids[original_stream_index]].append(stream_ids[cosine_similarities_sorted_indices_reversed[itr]])
            if ctr >= K:
                break
 
cosine_similar_streams    

0.2471859607184682
0.24417240005916563
0.244038780480129
0.23576954742268535
0.23269699672927036
0.2471859607184682
0.24417240005916563
0.244038780480129
0.23576954742268535
0.23269699672927036
0.2934101828167366
0.26983344541655346
0.26253067388347867
0.26025430464393146
0.25868935063088744
0.9999999999999999
0.9317619684915007
0.9153730643028907
0.8602945969249295
0.8594237227353164
0.9985259329074919
0.8224208765351206
0.548477834745479
0.46900657836733983
0.41069095537925143
0.9999999999999998
0.9985259329074919
0.8212085729848287
0.5476693416183109
0.468315231203999
0.28097326218616453
0.2723103703984296
0.24944604630358255
0.24077460795389596
0.2393232224019523
0.28097326218616453
0.2723103703984296
0.24944604630358255
0.24077460795389596
0.2393232224019523
0.28097326218616453
0.2723103703984296
0.24944604630358255
0.24077460795389596
0.2393232224019523
0.8407961003491053
0.838924239175036
0.824806949455201
0.7932999343497235
0.7881090853430092
0.9044219296754255
0.89579635178905

{'1089': ['2511', '2444', '2434', '2436', '2435'],
 '1090': ['2511', '2444', '2434', '2436', '2435'],
 '1095': ['2511', '2444', '2434', '2436', '2435'],
 '1128': ['2511', '2444', '2434', '2436', '2435'],
 '1267': ['2597', '2511', '2444', '2434', '2436'],
 '1347': ['2511', '2444', '2434', '2436', '2435'],
 '1498': ['2597', '2511', '2444', '2434', '2436'],
 '1499': ['2597', '2511', '2444', '2434', '2436'],
 '1512': ['2511', '2444', '2434', '2436', '2435'],
 '163': ['2444', '2434', '2436', '2435', '2525'],
 '1644': ['2444', '2434', '2436', '2435', '2525'],
 '1655': ['2511', '2444', '2434', '2436', '2435'],
 '1658': ['2511', '2444', '2434', '2436', '2435'],
 '1659': ['2597', '2511', '2444', '2434', '2436'],
 '1660': ['2511', '2444', '2434', '2436', '2435'],
 '1661': ['2511', '2444', '2434', '2436', '2435'],
 '1662': ['2511', '2444', '2434', '2436', '2435'],
 '1670': ['2511', '2444', '2434', '2436', '2435'],
 '1857': ['2511', '2444', '2434', '2436', '2435'],
 '199': ['2597', '2511', '2444',

In [36]:
print(stream_details_dict['1089'])
print("\n\nStream 2511")
print(stream_details_dict['2511'])
print("\n\nStream 2444")
print(stream_details_dict['2444'])

Welcome Team! Were really glad you are part of this journey!  In this stream you will find out more about the program and its fantastic benefits.   But first, scroll to the next card to complete a small survey, to help us guide your ...
This tool is used to capture the brand survey information.       "On a scale of 1-10, how likely are you to recommend a Castrol brand to a customer?"         You cannot edit this c...


{"title": "Brand Survey", "primaryField": "field1", "page1": {"header": "{CARD_DESCRIPTION}", "field1": {"type": "Text", "name": "NPS Score", "sequence": 1}, "sequence": 1}, "maxSubmissions": 1}
This 123 App is your portal to stay  connected and up-to-date  with Castrol brands and promos.   You will be refreshed with the right information, to enable the best sales conversations.   By interacting wit...
Up-to-date   knowledge on Castrol Brand 123 messaging
Interactive quizzes to let you   track your progress   and growth
Badges and Competitions that entitle you to   grea