In [1]:
import os
import json
import string
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import pandas as pd
%matplotlib inline

In [2]:
def get_stream_details():
    print("Reading the stream details...")
    complete_stream_details_df = pd.read_csv("H:\\TeamStreamz_IW\\code\\data\\card_module_details_content_extracted.csv", encoding="ISO-8859-1")
    if complete_stream_details_df is not None:
        complete_stream_details_dict = {}
        _stream_id_stream_title_dict = {}
        for _, row in complete_stream_details_df.iterrows():
            
            stream_id = str(row["DECKID"])
            stream_title = str(row["DECKNAME"])
            row_content = str(row["HTML_CONTENT"])

            # TODO: add the card title and the module name to the content on which the tags can be generated
            card_title =str(row["CARDTITLE"])
            module_name = str(row["MODULENAME"])
            
            if row_content and "nan" not in row_content:
                # if the stream ID already exists in the dictionary
                if complete_stream_details_dict.get(stream_id):
                    existing_content = complete_stream_details_dict[stream_id]
                    new_content = existing_content + "\n" + row_content.strip()
                    complete_stream_details_dict[stream_id] = new_content
                else:
                    complete_stream_details_dict[stream_id] = row_content.strip()
                    _stream_id_stream_title_dict[stream_id] = stream_title
        
        return complete_stream_details_dict

In [4]:
stream_details_dict = get_stream_details()

Reading the stream details...


In [9]:
df_ori = pd.DataFrame(list(stream_details_dict.items()), columns=["StreamID", "Content"])
df_ori.head()

Unnamed: 0,StreamID,Content
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po..."
3,199,Castrol EDGE is Castrols flagship power bran...
4,201,"Charles Cheers Wakefield, Castrols founder,..."


# Pre-process the existing text data in the content

In [10]:
print(df_ori.shape)
df = df_ori.drop_duplicates(['Content'])
print(df.shape)
print("Removed {0} duplicates (based on Content)".format(df_ori.shape[0]-df.shape[0]))

(113, 2)
(97, 2)
Removed 16 duplicates (based on Content)


In [11]:
df["Content"].head()

0    TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
1    TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
2    wBYKUgUyGWc\nA team of world-class drivers, po...
3    Castrol EDGE  is Castrols flagship power bran...
4    Charles Cheers Wakefield, Castrols founder,...
Name: Content, dtype: object

In [12]:
df["Content"] = df["Content"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
def preprocess(tokens):
    tokens_nop = [t for t in tokens if t not in string.punctuation]
    tokens_nop = [t.lower() for t in tokens_nop]
    wnl = nltk.WordNetLemmatizer()
    stop = stopwords.words('english')
    tokens_nostop = [t for t in tokens_nop if t not in stop]
    tokens_lem = [wnl.lemmatize(t) for t in tokens_nostop]
    tokens_clean = [t for t in tokens_lem if len(t) >= 3]
    return tokens_clean

def plotWC(tokens):
    text_clean = " ".join(tokens)
    print(text_clean)
    wc = WordCloud(background_color="white").generate(text_clean)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    plt.figure(figsize=(20, 9))
    fd = nltk.FreqDist(tokens)  # case sensitive!
    fd.plot(50)

In [16]:
df['Content_token'] = df['Content'].map(word_tokenize)
df['Content_processed'] = df.Content_token.apply(preprocess)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,StreamID,Content,Content_token,Content_processed
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,"[TXmAk2KZAy4, NMeUjebo1Ac, EEuTxFhp3go, Castro...","[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,"[TXmAk2KZAy4, NMeUjebo1Ac, EEuTxFhp3go, Castro...","[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po...","[wBYKUgUyGWc, A, team, of, world-class, driver...","[wbykuguygwc, team, world-class, driver, power..."
3,199,Castrol EDGE is Castrols flagship power bran...,"[Castrol, EDGE, is, Castrols, flagship, power...","[castrol, edge, castrols, flagship, power, br..."
4,201,"Charles Cheers Wakefield, Castrols founder,...","[Charles, Cheers, Wakefield, ,, Castrols, f...","[charles, cheers, wakefield, castrols, foun..."


# Get the feature matrix

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
df['Content_processedtext'] = df.Content_processed.apply(lambda x: ' '.join(x))

vec_tfidf = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True)
content_tfidf = vec_tfidf.fit_transform(df['Content_processedtext'])
print(content_tfidf.shape)

(97, 684)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Reduce the dimensions for easier computation

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Use SVD to reduce dimensions to 50% features - try values between 25-50 for 70 to 100%
svd = TruncatedSVD(30)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_lsa = lsa.fit_transform(content_tfidf)

In [36]:
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

Explained variance of the SVD step: 81%


# Perform K means clustering

In [41]:
from sklearn.cluster import KMeans
from sklearn import metrics
num_clusters = 3

for num in [num_clusters]:
    km3 = KMeans(n_clusters=num, init='k-means++', max_iter=1000, n_init=1, random_state=1)
    %time km3.fit(X_lsa)
    # The higher the better (-1 to 1)
    print("Clusters: {0}".format(num))
    print("Silhouette Coefficient for clusters: %0.3f"
          % metrics.silhouette_score(X_lsa, km3.labels_))

Wall time: 4.99 ms
Clusters: 3
Silhouette Coefficient for clusters: 0.160


In [42]:
def print_terms(cm, num):
    original_space_centroids = svd.inverse_transform(cm.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vec_tfidf.get_feature_names()
    for i in range(num):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

print_terms(km3, num_clusters)

Cluster 0: oil edge engine titanium strength pressure fst technology car new
Cluster 1: sequence name field1 type text point app redemption zoom page1
Cluster 2: engine sludge gtx protection part start oilways magnatec critical warm


# Evaluate the co-occurences of different terms in the content

In [43]:
sparse_matrix = {}

for content_tokens in df['Content_processed']:
    for word1 in content_tokens:
        
        if not sparse_matrix.get(word1):
            sparse_matrix[word1] = {}

        for word2 in content_tokens:
            if sparse_matrix[word1].get(word2):
                sparse_matrix[word1][word2] += 1
            else:
                sparse_matrix[word1][word2] = 1

In [44]:
from collections import Counter
counts = Counter()
co_occurance_threshold = 1000
for word1 in sparse_matrix:
    for word2, co_occurence_count in sparse_matrix[word1].items():
        current_key_1 = "{0},{1}".format(word1, word2)
        current_key_2 = "{0},{1}".format(word2, word1)
        
        if word1 != word2 and co_occurence_count <= co_occurance_threshold:
            if counts.get(current_key_1):
                counts[current_key_1] += co_occurence_count // 2
            elif counts.get(current_key_2):
                counts[current_key_2] += co_occurence_count // 2
            else:
                counts[current_key_1] = co_occurence_count // 2
        
counts.most_common(50)

[('castrol,gtx', 978),
 ('engine,critical', 968),
 ('engine,warm-up', 918),
 ('...,point', 880),
 ('castrol,sludge', 860),
 ('oil,edge', 834),
 ('point,transaction', 824),
 ('product,point', 806),
 ('castrol,edge', 798),
 ('point,code', 790),
 ('engine,molecule', 778),
 ('oil,titanium', 774),
 ('...,incentive', 774),
 ('product,incentive', 730),
 ('incentive,transaction', 730),
 ('redemption,transaction', 728),
 ('engine,time', 726),
 ('castrol,point', 722),
 ('incentive,code', 710),
 ('engine,intelligent', 704),
 ('...,redemption', 694),
 ('point,click', 686),
 ('oil,pressure', 672),
 ('redemption,click', 666),
 ('product,redemption', 662),
 ('oil,strength', 656),
 ('engine,car', 650),
 ('code,redemption', 646),
 ('castrol,titanium', 642),
 ('castrol,part', 624),
 ('redemption,request', 624),
 ('point,request', 608),
 ('castrol,incentive', 592),
 ('point,user', 586),
 ('incentive,click', 584),
 ('engine,high', 580),
 ('castrol,strength', 578),
 ('engine,cling', 578),
 ('point,sale', 5