## SAS Text Analytics: Topics, Sentiment, and Deep Learning

### Load Libraries, Action Sets, and Data

In [170]:
# Import Python Libraries
import swat
import pandas as pd
from IPython.display import display
from io import StringIO
import swat.cas.datamsghandlers as dmh

In [171]:
# Create CAS Connection
conn = swat.CAS(host, portnum, username, password)

In [228]:
# Load Action Sets
conn.loadactionset('textmining')
conn.loadactionset('table')
conn.loadactionset('sentimentAnalysis');

NOTE: Added action set 'textmining'.
NOTE: Added action set 'table'.
NOTE: Added action set 'sentimentAnalysis'.


In [173]:
# Load in Data and Add Column Names
# Use Python Engine for Parsing the Regex (\n Line Breaks)
imdb_df = pd.read_csv('/data/andre_data/SAS_Demo_Reviews_Text_Analytics/imdb_labelled.txt',
                      header=None, sep='\t|\n',engine="python")

yelp_df = pd.read_csv('/data/andre_data/SAS_Demo_Reviews_Text_Analytics/yelp_labelled.txt',
                      header=None, sep='\t|\n',engine="python")

amazon_df = pd.read_csv('/data/andre_data/SAS_Demo_Reviews_Text_Analytics/amazon_cells_labelled.txt',
                        header=None, sep='\t|\n',engine="python");

In [174]:
# View Data as it was Read In
imdb_df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


### Data Prep for Text Modeling

In [175]:
# Add Better Column Names
imdb_df.columns = ["text", "polarity"]
yelp_df.columns = ["text", "polarity"]
amazon_df.columns = ["text", "polarity"]

In [176]:
# Add Category Columns
imdb_df["category"] = "imdb"
yelp_df["category"] = "yelp"
amazon_df["category"] = "amazon"

In [177]:
# Append all Dataframes
#all_df = imdb_df.append([yelp_df, amazon_df])
all_df = pd.concat([imdb_df, yelp_df, amazon_df])

In [211]:
# Add Document ID Column
all_df["doc_id"] = all_df.index + 1
imdb_df["doc_id"] = imdb_df.index + 1
yelp_df["doc_id"] = yelp_df.index + 1
amazon_df["doc_id"] = amazon_df.index + 1

In [212]:
# Display Shape of All the Dataframes
display(imdb_df.shape)
display(yelp_df.shape)
display(amazon_df.shape)
display(all_df.shape)

(1000, 4)

(1000, 4)

(1000, 4)

(3000, 4)

In [213]:
# View Data after Manipulations
display(imdb_df.head(n=2))
display(yelp_df.head(n=2))
display(amazon_df.head(n=2))
display(all_df.head(n=2))

Unnamed: 0,text,polarity,category,doc_id
0,"A very, very, very slow-moving, aimless movie ...",0,imdb,1
1,Not sure who was more lost - the flat characte...,0,imdb,2


Unnamed: 0,text,polarity,category,doc_id
0,Wow... Loved this place.,1,yelp,1
1,Crust is not good.,0,yelp,2


Unnamed: 0,text,polarity,category,doc_id
0,So there is no way for me to plug it in here i...,0,amazon,1
1,"Good case, Excellent value.",1,amazon,2


Unnamed: 0,text,polarity,category,doc_id
0,"A very, very, very slow-moving, aimless movie ...",0,imdb,1
1,Not sure who was more lost - the flat characte...,0,imdb,2


### Load Data Into CAS

In [215]:
imdb_df_cas = conn.upload_frame(imdb_df, casout=dict(name='imdb_df_table', replace=True))
yelp_df_cas = conn.upload_frame(yelp_df, casout=dict(name='yelp_df_table', replace=True))
amazon_df_cas = conn.upload_frame(amazon_df, casout=dict(name='amazon_df_table', replace=True))
all_df_cas = conn.upload_frame(all_df, casout=dict(name='all_df_table', replace=True));

NOTE: Cloud Analytic Services made the uploaded file available as table IMDB_DF_TABLE in caslib CASUSER(sas).
NOTE: The table IMDB_DF_TABLE has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table YELP_DF_TABLE in caslib CASUSER(sas).
NOTE: The table YELP_DF_TABLE has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table AMAZON_DF_TABLE in caslib CASUSER(sas).
NOTE: The table AMAZON_DF_TABLE has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table ALL_DF_TABLE in caslib CASUSER(sas).
NOTE: The table ALL_DF_TABLE has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.


### Setup Stop Words List 

In [216]:
# NLTK Stopwords List (Outdated)
stoplist = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves',
            'he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their',
            'theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was',
            'were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and',
            'but','if','or','because','as','until','while','of','at','by','for','with','about','against','between',
            'into','through','during','before','after','above','below','to','from','up','down','in','out','on',
            'off','over','under','again','further','then','once','here','there','when','where','why','how','all',
            'any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so',
            'than','too','very','s','t','can','will','just','don','should','now']

In [217]:
# Convert Stoplist Array to Pandas DF and Load to CAS Table for Text Mining
stoplist_df = pd.DataFrame(stoplist, columns=['Term'])
stoplist_df_cas = conn.upload_frame(stoplist_df, casout=dict(name='stoplist_df_table', replace=True))

stoplist_df_cas.head()

NOTE: Cloud Analytic Services made the uploaded file available as table STOPLIST_DF_TABLE in caslib CASUSER(sas).
NOTE: The table STOPLIST_DF_TABLE has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.


Unnamed: 0,Term
0,i
1,me
2,my
3,myself
4,we


### Text Modeling: Topic Discovery and Sentiment Analysis

In [None]:
# 'entities' = specifies whether to extract entities in parsing. If set
#   to None, no entities are output. If set to STD, the standard entities
#   are output.
# 'Stemming' = specifies whether stemming is to occur in parsing. When
#   set to True, terms are evaluated to see if they belong to common parent
#   form and the information is added to the position table
# 'Tagging' = Specifies whether part-of-speech tagging is used in parsing.
# 'Reduce' = Minimum number of documents a term should be in to be kept
# 'numLabels' = Specifies the number of terms to use in the descriptive label for each topic.

#### IMDB Reviews Use Case

In [219]:
# Apply Text Mining to Discover Topics
conn.tmMine(documents='imdb_df_table',
           text='text',
           docid='doc_id',
           stopList='stoplist_df_table',
           parent=conn.CASTable('imdb_testparent',replace=True),
           terms=conn.CASTable('imdb_terms',replace=True),
           parseConfig=conn.CASTable('imdb_config',replace=True),
           entities='NONE',
           NounGroups=False,
           Stemming=True,
           Tagging=False,
           reduce=1,
           numTopics=20,
           numLabels=5,
           topics=conn.CASTable('imdb_topics',replace=True),
           docpro=conn.CASTable('imdb_docpro',replace=True),
           u=conn.CASTable('imdb_svdu',replace=True))

Unnamed: 0,casLib,Name,Label,Rows,Columns,casTable
0,CASUSER(sas),imdb_terms,,3565,11,"CASTable('imdb_terms', caslib='CASUSER(sas)')"
1,CASUSER(sas),imdb_testparent,,7109,3,"CASTable('imdb_testparent', caslib='CASUSER(sa..."
2,CASUSER(sas),imdb_config,,1,22,"CASTable('imdb_config', caslib='CASUSER(sas)')"
3,CASUSER(sas),imdb_svdu,,2548,21,"CASTable('imdb_svdu', caslib='CASUSER(sas)')"
4,CASUSER(sas),imdb_docpro,,1000,21,"CASTable('imdb_docpro', caslib='CASUSER(sas)')"
5,CASUSER(sas),imdb_topics,,20,3,"CASTable('imdb_topics', caslib='CASUSER(sas)')"


In [233]:
# View Discovered Topics
display(conn.fetch(table='imdb_topics', orderBy='_TopicID_'))

Unnamed: 0,_TopicId_,_Name_,_TermCutOff_
0,1.0,"+film, +watch, +time, +great, +director",0.058
1,2.0,"+theme, occasionally, +experience, +piece, +art",0.021
2,3.0,"john, every, +identify, +eye, family",0.029
3,4.0,"adorable, special, +come, +funny, excellent",0.043
4,5.0,"+lot, style, poor, cheap, john",0.027
5,6.0,"+character, +actor, +great, +make, +give",0.057
6,7.0,"+may, +make, +show, +waste, always",0.046
7,8.0,"+time, direction, subtle, +perfect, perhaps",0.044
8,9.0,"+good, +help, +role, +make, cool",0.041
9,10.0,"also, +go, well, +fine, +great",0.044


In [239]:
# Apply Sentiment Model
conn.applysent(casout={'name':'imdb_sent_table'},
               docid='doc_id', 
               text='text', 
               table={'name':'imdb_df_table'});

ERROR: The table imdb_sent_table already exists in the session.
ERROR: The table %U.*s already exists in the session.
ERROR: The action stopped due to errors.


In [240]:
# View Sentiment Model Table
conn.fetch(sortby=[{'name':'doc_id'}],table={'name':'imdb_sent_table'})

Unnamed: 0,doc_id,_sentiment_,_score_
0,1.0,Negative,0.228571
1,2.0,Neutral,0.5
2,3.0,Negative,0.307692
3,4.0,Neutral,0.5
4,5.0,Positive,0.6
5,6.0,Neutral,0.5
6,7.0,Negative,0.4
7,8.0,Positive,0.692308
8,9.0,Neutral,0.5
9,10.0,Positive,0.6


#### Yelp Reviews Use Case

In [221]:
# Topic Modeling
conn.tmMine(documents='yelp_df_table',
           text='text',
           docid='doc_id',
           stopList='stoplist_df_table',
           parent=conn.CASTable('yelp_parent',replace=True),
           terms=conn.CASTable('yelp_terms',replace=True),
           parseConfig=conn.CASTable('yelp_config',replace=True),
           entities='NONE',
           NounGroups=False,
           Stemming=True,
           Tagging=False,
           reduce=1,
           numTopics=20,
           numLabels=5,
           topics=conn.CASTable('yelp_topics',replace=True),
           docpro=conn.CASTable('yelp_docpro',replace=True),
           u=conn.CASTable('yelp_svdu',replace=True))

Unnamed: 0,casLib,Name,Label,Rows,Columns,casTable
0,CASUSER(sas),yelp_terms,,2336,11,"CASTable('yelp_terms', caslib='CASUSER(sas)')"
1,CASUSER(sas),yelp_parent,,5318,3,"CASTable('yelp_parent', caslib='CASUSER(sas)')"
2,CASUSER(sas),yelp_config,,1,22,"CASTable('yelp_config', caslib='CASUSER(sas)')"
3,CASUSER(sas),yelp_svdu,,1613,21,"CASTable('yelp_svdu', caslib='CASUSER(sas)')"
4,CASUSER(sas),yelp_docpro,,1000,21,"CASTable('yelp_docpro', caslib='CASUSER(sas)')"
5,CASUSER(sas),yelp_topics,,20,3,"CASTable('yelp_topics', caslib='CASUSER(sas)')"


In [225]:
# View Topics
display(conn.fetch(table='yelp_topics', orderBy='_TopicID_'))

Unnamed: 0,_TopicId_,_Name_,_TermCutOff_
0,1.0,"ever, +steak, +taste, worst, +say",0.071
1,2.0,"+go, back, never, much, soon",0.071
2,3.0,"+wait, +minute, +food, another, +get",0.072
3,4.0,"+great, +menu, +pizza, always, area",0.069
4,5.0,"+come, us, +table, +chicken, vegas",0.07
5,6.0,"+sauce, +say, another, well, +spicy",0.061
6,7.0,"+service, +food, +bad, slow, +customer",0.072
7,8.0,"+like, +feel, warm, +look, +place",0.068
8,9.0,"+good, +food, +selection, +burger, really",0.07
9,10.0,"staff, friendly, +nice, even, super",0.07


In [241]:
# Sentiment Model
conn.applysent(casout={'name':'yelp_sent_table'},
               docid='doc_id', 
               text='text', 
               table={'name':'yelp_df_table'});

In [242]:
# View Sentiment Table
conn.fetch(sortby=[{'name':'doc_id'}],table={'name':'yelp_sent_table'})

Unnamed: 0,doc_id,_sentiment_,_score_
0,1.0,Positive,0.6
1,2.0,Negative,0.4
2,3.0,Negative,0.307692
3,4.0,Positive,0.6
4,5.0,Positive,0.6
5,6.0,Negative,0.307692
6,7.0,Neutral,0.5
7,8.0,Neutral,0.5
8,9.0,Positive,0.6
9,10.0,Positive,0.6


#### Amazon Reviews Use Case

In [223]:
# Topic Modeling
conn.tmMine(documents='amazon_df_table',
           text='text',
           docid='doc_id',
           stopList='stoplist_df_table',
           parent=conn.CASTable('amazon_parent',replace=True),
           terms=conn.CASTable('amazon_terms',replace=True),
           parseConfig=conn.CASTable('amazon_config',replace=True),
           entities='NONE',
           NounGroups=False,
           Stemming=True,
           Tagging=False,
           reduce=1,
           numTopics=20,
           numLabels=5,
           topics=conn.CASTable('amazon_topics',replace=True),
           docpro=conn.CASTable('amazon_docpro',replace=True),
           u=conn.CASTable('amazon_svdu',replace=True))

Unnamed: 0,casLib,Name,Label,Rows,Columns,casTable
0,CASUSER(sas),amazon_terms,,2102,11,"CASTable('amazon_terms', caslib='CASUSER(sas)')"
1,CASUSER(sas),amazon_parent,,5034,3,"CASTable('amazon_parent', caslib='CASUSER(sas)')"
2,CASUSER(sas),amazon_config,,1,22,"CASTable('amazon_config', caslib='CASUSER(sas)')"
3,CASUSER(sas),amazon_svdu,,1443,21,"CASTable('amazon_svdu', caslib='CASUSER(sas)')"
4,CASUSER(sas),amazon_docpro,,1000,21,"CASTable('amazon_docpro', caslib='CASUSER(sas)')"
5,CASUSER(sas),amazon_topics,,20,3,"CASTable('amazon_topics', caslib='CASUSER(sas)')"


In [226]:
# View Topics
display(conn.fetch(table='amazon_topics', orderBy='_TopicID_'))

Unnamed: 0,_TopicId_,_Name_,_TermCutOff_
0,1.0,"battery, +long, +last, life, original",0.072
1,2.0,"+sound, quality, +end, low, clear",0.072
2,3.0,"+work, +great, well, +item, fine",0.073
3,4.0,"+ear, +fit, comfortable, jabra, +design",0.073
4,5.0,"+phone, +great, cell, +say, +buy",0.073
5,6.0,"+good, +price, much, far, +picture",0.074
6,7.0,"+use, data, +easy, +product, even",0.072
7,8.0,"+charger, car, new, +plug, +come",0.072
8,9.0,"+product, +price, +recommend, +great, +look",0.075
9,10.0,"+case, +think, right, leather, little",0.059


In [243]:
# Apply Sentiment Model
conn.applysent(casout={'name':'amazon_sent_table'},
               docid='doc_id', 
               text='text', 
               table={'name':'amazon_df_table'});

In [244]:
# View Sentiment Model Table
conn.fetch(sortby=[{'name':'doc_id'}],table={'name':'amazon_sent_table'})

Unnamed: 0,doc_id,_sentiment_,_score_
0,1.0,Neutral,0.5
1,2.0,Positive,0.692308
2,3.0,Positive,0.6
3,4.0,Negative,0.4
4,5.0,Positive,0.6
5,6.0,Neutral,0.5
6,7.0,Positive,0.6
7,8.0,Neutral,0.5
8,9.0,Negative,0.4
9,10.0,Negative,0.4
