In [1]:
import os
import json
import string
import nltk
from nltk import word_tokenize, FreqDist, pos_tag
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
def get_stream_details():
    print("Reading the stream details...")
    complete_stream_details_df = pd.read_csv("H:\\TeamStreamz_IW\\code\\data\\card_module_details_content_extracted.csv", encoding="ISO-8859-1")
    if complete_stream_details_df is not None:
        complete_stream_details_dict = {}
        _stream_id_stream_title_dict = {}
        for _, row in complete_stream_details_df.iterrows():
            
            stream_id = str(row["DECKID"])
            stream_title = str(row["DECKNAME"])
            row_content = str(row["HTML_CONTENT"])

            # TODO: add the card title and the module name to the content on which the tags can be generated
            card_title =str(row["CARDTITLE"])
            module_name = str(row["MODULENAME"])
            
            if row_content and "nan" not in row_content:
                # if the stream ID already exists in the dictionary
                if complete_stream_details_dict.get(stream_id):
                    existing_content = complete_stream_details_dict[stream_id]
                    new_content = existing_content + "\n" + row_content.strip()
                    complete_stream_details_dict[stream_id] = new_content
                else:
                    complete_stream_details_dict[stream_id] = row_content.strip()
                    _stream_id_stream_title_dict[stream_id] = stream_title
        
        return complete_stream_details_dict

In [3]:
stream_details_dict = get_stream_details()

Reading the stream details...


In [4]:
df_ori = pd.DataFrame(list(stream_details_dict.items()), columns=["StreamID", "Content"])
df_ori.head()

Unnamed: 0,StreamID,Content
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po..."
3,199,Castrol EDGE is Castrols flagship power bran...
4,201,"Charles Cheers Wakefield, Castrols founder,..."


In [5]:
print(df_ori.shape)
df = df_ori.drop_duplicates(['Content'])
print(df.shape)
print("Removed {0} duplicates (based on Content)".format(df_ori.shape[0]-df.shape[0]))

(113, 2)
(97, 2)
Removed 16 duplicates (based on Content)


In [6]:
df["Content"].head()

0    TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
1    TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...
2    wBYKUgUyGWc\nA team of world-class drivers, po...
3    Castrol EDGE  is Castrols flagship power bran...
4    Charles Cheers Wakefield, Castrols founder,...
Name: Content, dtype: object

In [8]:
# simply tokenize without any preprocessing
df['Content_token'] = df['Content'].map(word_tokenize)
df['Content_POS'] = df['Content_token'].map(pos_tag, {"tagset": "universal"})
df['Content_POS'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0    [(TXmAk2KZAy4, NNP), (NMeUjebo1Ac, NNP), (EEuT...
1    [(TXmAk2KZAy4, NNP), (NMeUjebo1Ac, NNP), (EEuT...
2    [(wBYKUgUyGWc, NN), (A, DT), (team, NN), (of, ...
3    [(Castrol, NNP), (EDGE, NNP), (is, VBZ), (Cast...
4    [(Charles, NNP), (Cheers, NNP), (Wakefield, ...
Name: Content_POS, dtype: object

In [9]:
df["Content_Nouns"] = df["Content_POS"].apply(lambda items: [x for x in items if x[1][0] == 'N'])
df["Content_Nouns"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0    [(TXmAk2KZAy4, NNP), (NMeUjebo1Ac, NNP), (EEuT...
1    [(TXmAk2KZAy4, NNP), (NMeUjebo1Ac, NNP), (EEuT...
2    [(wBYKUgUyGWc, NN), (team, NN), (world-class, ...
3    [(Castrol, NNP), (EDGE, NNP), (Castrols, NNP)...
4    [(Charles, NNP), (Cheers, NNP), (Wakefield, ...
Name: Content_Nouns, dtype: object

### We assume that nouns that make sense should not have digits

In [22]:
def is_text_random_sequence(text, max_num_digits = 0):
    digits = [x for x in text if x.isdigit()]
    #print(digits)
    if len(digits) > max_num_digits:
        return True
    
    return False

In [20]:
is_text_random_sequence("TXmAk2KZAy4")

True

In [21]:
df["Content_Nouns"] = df["Content_Nouns"].apply(lambda items: [x for x in items if not is_text_random_sequence(x[0])])
df["Content_Nouns"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0    [(Castrol, NNP), (oil, NN), (TITANIUM, NNP), (...
1    [(Castrol, NNP), (oil, NN), (TITANIUM, NNP), (...
2    [(wBYKUgUyGWc, NN), (team, NN), (world-class, ...
3    [(Castrol, NNP), (EDGE, NNP), (Castrols, NNP)...
4    [(Charles, NNP), (Cheers, NNP), (Wakefield, ...
Name: Content_Nouns, dtype: object

In [23]:
df.head()

Unnamed: 0,StreamID,Content,Content_token,Content_POS,Content_Nouns
0,163,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,"[TXmAk2KZAy4, NMeUjebo1Ac, EEuTxFhp3go, Castro...","[(TXmAk2KZAy4, NNP), (NMeUjebo1Ac, NNP), (EEuT...","[(Castrol, NNP), (oil, NN), (TITANIUM, NNP), (..."
1,419,TXmAk2KZAy4\nNMeUjebo1Ac\nEEuTxFhp3go\nCastrol...,"[TXmAk2KZAy4, NMeUjebo1Ac, EEuTxFhp3go, Castro...","[(TXmAk2KZAy4, NNP), (NMeUjebo1Ac, NNP), (EEuT...","[(Castrol, NNP), (oil, NN), (TITANIUM, NNP), (..."
2,507,"wBYKUgUyGWc\nA team of world-class drivers, po...","[wBYKUgUyGWc, A, team, of, world-class, driver...","[(wBYKUgUyGWc, NN), (A, DT), (team, NN), (of, ...","[(wBYKUgUyGWc, NN), (team, NN), (world-class, ..."
3,199,Castrol EDGE is Castrols flagship power bran...,"[Castrol, EDGE, is, Castrols, flagship, power...","[(Castrol, NNP), (EDGE, NNP), (is, VBZ), (Cast...","[(Castrol, NNP), (EDGE, NNP), (Castrols, NNP)..."
4,201,"Charles Cheers Wakefield, Castrols founder,...","[Charles, Cheers, Wakefield, ,, Castrols, f...","[(Charles, NNP), (Cheers, NNP), (Wakefield, ...","[(Charles, NNP), (Cheers, NNP), (Wakefield, ..."


In [25]:
df = df.drop(["Content_token", "Content_POS"], axis=1)
df.to_csv("outputs/content_nouns.csv", index=False)