In [1]:
import os
import json
import string
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import pandas as pd
import gensim
%matplotlib inline



In [2]:
def get_stream_details():
    print("Reading the stream details...")
    complete_stream_details_df = pd.read_csv("H:\\TeamStreamz_IW\\code\\data\\card_module_details_content_extracted.csv", encoding="ISO-8859-1")
    if complete_stream_details_df is not None:
        complete_stream_details_dict = {}
        _stream_id_stream_title_dict = {}
        for _, row in complete_stream_details_df.iterrows():
            
            stream_id = str(row["DECKID"])
            stream_title = str(row["DECKNAME"])
            row_content = str(row["HTML_CONTENT"])

            # TODO: add the card title and the module name to the content on which the tags can be generated
            card_title =str(row["CARDTITLE"])
            module_name = str(row["MODULENAME"])
            
            if row_content and "nan" not in row_content:
                # if the stream ID already exists in the dictionary
                if complete_stream_details_dict.get(stream_id):
                    existing_content = complete_stream_details_dict[stream_id]
                    new_content = existing_content + "\n" + row_content.strip()
                    complete_stream_details_dict[stream_id] = new_content
                else:
                    complete_stream_details_dict[stream_id] = row_content.strip()
                    _stream_id_stream_title_dict[stream_id] = stream_title
        
        return complete_stream_details_dict

In [3]:
stream_details_dict = get_stream_details()

Reading the stream details...


In [4]:
df_ori = pd.DataFrame(list(stream_details_dict.items()), columns=["StreamID", "Content"])
df_ori.head()

Unnamed: 0,StreamID,Content
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po..."
3,199,Castrol EDGE is Castrols flagship power bran...
4,201,"Charles Cheers Wakefield, Castrols founder,..."


# Prepare the content for extracting the Word2Vec values

In [5]:
print(df_ori.shape)
df = df_ori.drop_duplicates(['Content'])
print(df.shape)
print("Removed {0} duplicates (based on Content)".format(df_ori.shape[0]-df.shape[0]))

(113, 2)
(97, 2)
Removed 16 duplicates (based on Content)


In [7]:
df.head()

Unnamed: 0,StreamID,Content
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po..."
3,199,Castrol EDGE is Castrols flagship power bran...
4,201,"Charles Cheers Wakefield, Castrols founder,..."


In [8]:
df.to_csv("H:\\TeamStreamz_IW\\code\\hybird_recommender\\data\\stream_content.csv", index=False)

In [9]:
df["Content"] = df["Content"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
def preprocess(tokens):
    tokens_nop = [t for t in tokens if t not in string.punctuation]
    tokens_nop = [t.lower() for t in tokens_nop]
    wnl = nltk.WordNetLemmatizer()
    stop = stopwords.words('english')
    tokens_nostop = [t for t in tokens_nop if t not in stop]
    tokens_lem = [wnl.lemmatize(t) for t in tokens_nostop]
    tokens_clean = [t for t in tokens_lem if len(t) >= 3]
    return tokens_clean

In [11]:
df['Content_processed'] = df.Content.apply(gensim.utils.simple_preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
(df.head())

Unnamed: 0,StreamID,Content,Content_processed
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,"[txmak, kzay, nmeujebo, ac, eeutxfhp, go, cast..."
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,"[txmak, kzay, nmeujebo, ac, eeutxfhp, go, cast..."
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po...","[wbykuguygwc, team, of, world, class, drivers,..."
3,199,Castrol EDGE is Castrols flagship power bran...,"[castrol, edge, is, castrol, flagship, power, ..."
4,201,"Charles Cheers Wakefield, Castrols founder,...","[charles, cheers, wakefield, castrol, founder,..."


In [14]:
word2vec_model = gensim.models.Word2Vec(df['Content_processed'], size=100, window=5, min_count=1, workers=4)
word2vec_model.save("outputs/content.model")

In [15]:
required_qual_model = gensim.models.Word2Vec.load("outputs/content.model")
required_qual_model.wv['castrol'].shape

(100,)

## Get the average word 2 vec vectors for the documents

In [18]:
import numpy as np

def get_average_word2vec_for_content(token_list):
    if token_list:
        num_tokens = len(token_list)
        word_2_vec_sum = np.zeros(100)
        for token in token_list:
            word_2_vec_sum += required_qual_model.wv[token]
        
        #print("SUM: " + str(word_2_vec_sum))
        #print("LEN: " + str(num_tokens))
        average_word_2_vec = word_2_vec_sum/num_tokens
        #print("AVG" + str(average_word_2_vec))
        return average_word_2_vec
    return None


In [19]:
get_average_word2vec_for_content(df['Content_processed'][0])

array([-0.04717176, -0.09935359,  0.05193145, -0.1600476 , -0.31570751,
        0.10265526, -0.05897608,  0.16129018, -0.12905528, -0.09577187,
        0.19799357,  0.07441719,  0.19023432, -0.20791484, -0.13509663,
        0.00957502,  0.2359546 ,  0.00363353, -0.09033673, -0.08823789,
       -0.19941717,  0.15238751, -0.11284285,  0.23419589, -0.30125451,
        0.05063687,  0.20641822, -0.11102139, -0.07713662,  0.01233445,
       -0.04327075,  0.67229279, -0.20432221,  0.3441221 , -0.09191832,
        0.14696339,  0.01075936, -0.00530203,  0.21218132, -0.04072955,
        0.03637225,  0.42287218,  0.14707519, -0.21395865, -0.01741613,
       -0.06692989, -0.24148433,  0.42952582, -0.08277454,  0.0304829 ,
       -0.06191949,  0.02931565,  0.18630537, -0.56485808,  0.09160004,
       -0.33522558,  0.10445677,  0.15009846,  0.28806414,  0.18076468,
       -0.20557811, -0.04690619, -0.28437946, -0.05850776, -0.00966623,
        0.05763548,  0.44806228, -0.31753437, -0.33816023,  0.00

In [20]:
df["Content_Avg_Word2Vec"] = df['Content_processed'].apply(get_average_word2vec_for_qual)
df["Content_Avg_Word2Vec"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0    [-0.047171759298554904, -0.09935359034930383, ...
1    [-0.04652433617698937, -0.09797148434090955, 0...
2    [-0.04743776740048945, -0.09823440102627501, 0...
3    [-0.05991558747314464, -0.12427333847255195, 0...
4    [-0.0504213154972415, -0.10617610479461578, 0....
Name: Content_Avg_Word2Vec, dtype: object

In [21]:
num_clusters = 5
num_rows = df["Content_Avg_Word2Vec"].shape[0]
word_2_vec_values = df["Content_Avg_Word2Vec"].values
print(word_2_vec_values.shape)
word_2_vec_values[0]
word_2_vec_values = np.array([np.array(x) for x in word_2_vec_values])
print(word_2_vec_values.shape)

(97,)
(97, 100)


In [22]:
from sklearn.cluster import KMeans
from sklearn import metrics

kmeans_clustering = KMeans(n_clusters = num_clusters, init='k-means++')
idx = kmeans_clustering.fit_predict(word_2_vec_values)
print(idx.shape)
df["Content_Cluster_Index"] = idx

(97,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [23]:
df['Content_processed'] = df["Content_processed"].apply(preprocess)
print("Silhouette Coefficient for clusters: %0.3f"
          % metrics.silhouette_score(word_2_vec_values, kmeans_clustering.labels_))

Silhouette Coefficient for clusters: 0.572


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
grouped_df = df.groupby("Content_Cluster_Index")["Content_processed"].sum()

from collections import Counter
for idx, g in enumerate(grouped_df):
    c = Counter(g)
    print("Cluster # {0}".format(idx))
    tokens = [x[0] for x in c.most_common(20)]
    print(" ".join(tokens))

Cluster # 0
engine castrol oil gtx sludge edge part start titanium protection magnatec strength double pressure critical high like formula oilways fst molecule warm technology stop time stream intelligent action car keep video cling watch information damage provides way understand dirt new temperature find layer protects performance wear drain brand back unique
Cluster # 1
castrol engine oil brand car wakefield new power curiosity field take performance lubricant smooth name world started nasa product type diesel charles first superior sequence advanced transmission titanium cheer founder business cheapside london rover mar played central role journey time part customer high point clone need see took protects operation
Cluster # 2
castrol
Cluster # 3
field name sequence oil type castrol text engine synthetic primaryfield page title header technology vecton performance journey plan friction account protection transmission check number value volume market address make better fuel economy