In [34]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
dictionary_words = set(nltk.corpus.words.words())

from pprint import pprint

from textblob import TextBlob, Word

import spacy
import spacy_legacy
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words


import pickle
import re 
import pyLDAvis
import pyLDAvis.gensim_models

import itertools 
#custom list of stop words
stop_words_english = []
with open('stop_words_english.txt', encoding="utf8") as my_file:
    for line in my_file:
        stop_words_english.append(line.replace("\n", ""))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wahbeh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Wahbeh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
df = pd.read_csv ('SampleReviews.csv')

In [36]:
df.shape

(1000, 1)

In [37]:
df

Unnamed: 0,Full Text
0,My nephew loves h8s new apple watch. The fact ...
1,The Apple Watch Series 6 has so many amazing f...
2,Replaced my Apple Watch with this gem. I like ...
3,My old Apple Watch stopped charging and I was ...
4,I’m super pleased with these chargers. They wo...
...,...
995,Love my Apple Watch! I get all my notification...
996,Perfect fit easy Install very durable comparab...
997,This is my 4th sport loop for apple watch I ow...
998,I was surprised at how much I actually interac...


In [38]:
df= df.apply(lambda x: x.astype(str).str.lower())

In [39]:
df.head()

Unnamed: 0,Full Text
0,my nephew loves h8s new apple watch. the fact ...
1,the apple watch series 6 has so many amazing f...
2,replaced my apple watch with this gem. i like ...
3,my old apple watch stopped charging and i was ...
4,i’m super pleased with these chargers. they wo...


# Caclualte Bigrams Frequencies to Identofy Wearables Quality Dimensions:

In [40]:
reviews = np.array(df['Full Text'])
reviews = ' '.join(reviews)
#reviews

In [41]:
def clean_text(reviews):
    reviews = re.sub(r'[^\w\s]','', reviews) #remove punctuation marks and special chars
    reviews = re.sub('[^A-Za-z0-9]+', ' ', reviews) #remove anything that is not a number or or letter
    reviews = re.sub('[\W_]', ' ', reviews) #Remove any thing excpet charcater and alphanumeric words
    reviews = re.sub(r'\w*\d\w*', '', reviews) #Remove any thing with numbers
    reviews = ' '.join(word for word in reviews.split() if not word.isdigit())

    #remove words that consists of less than three letter
    words = [word for word in reviews.split() if len(word)>2]
    reviews = " ".join(words)

    #Remove stop words using custom list of stop words
    words = [word for word in reviews.split() if word.lower() not in stop_words_english]
    reviews = " ".join(words)

    #Remove brands related keywords
    brands = ["fitbit ace", "fitbit charge", "fitbit verca", "fitbit ionic",  "fitbit versa lite",  
              "fitbit inspire",  "fitbit versa",  "garmin forerunner",  "garmin fēnix",  "garmin vívoki",  
              "garmin vívofit",  "garmin vívoactive",  "garmin vívomove",  "garmin vívosmart",  
              "garmin vívosport",  "garmin venu",  "garmin instinct",  "garmin luxe",  "garmin darth vader",  
              "garmin captain marvel",  "garmin approach",  "garmin lily",  "garmin swim",  "xiaomi mi band",  
              "xiaomi mi samrt band",  "moov now",  "moov hr",  "apple watch",  "fossil gen",  
              "fossil sport smartwatch",  "fossil hybrid smartwatch hr",  "misfit vap, ",  "misfit ray",  
              "misfit path",  "misfit command",  "withings steel",  "withings move",  "withings pulds",  
              "withings move ecg",  "ihealth watch",  "samsung gear",  "samsung galaxy watch active",  
              "samsung galaxy fit",  "polar a360",  "polar a370",  "polar m430",  "polar m200",  "polar h10",  
              "polar vantage",  "polar ignite",  "polar grit x",  "polar titan",  "polar oh1",  "polar h9",  
              "striiv fusion",  "striiv apex hr",  "striiv dash hr",  "huawei talkband",  " huawei watch",  
              "huawei band",  "mykronoz zewatch",  "mykronoz zefit",  "mykronoz zesport",  "mykronoz zetime",  
              "mykronoz zefit",  "mykronoz zeround",  "mykronoz zeneo",  "mykronoz zetrack",  "coros apex",  
              "wyze band",  "letsfit fitness racker",  "coros pace",  "withings scanwatch",  "amazon halo",  
              "timex ironman",  "suunto peak",  "wahoo fitness", "fitbit", "garmin", "apple", "xiaomi", "moov",
              "fossil", "misfit", "withings", "ihealth", "samsung", "polar", "striiv", "huawei", "mykronoz", 
              "coros", "wyze", "letsfit", "amazon"]

    resultwords  = [word for word in reviews.split() if word.lower() not in brands]
    reviews = ' '.join(resultwords)


    #Remove category-related words
    categories = ["watch", "watches", "smart watch", "smartwatch", "band", "bands", "smartband", 
                  "smart band", "smartbands", "smart bands", "case", "cases", "cover", "protective", 
                  "protector", "screen", "show", "iphone", "express", "appleevent", "shield", 
                  "show", "appltv", "gear vr", "tv", "shoe", "mountain", "mountains", "beach", 
                  "appltv", "accessory", "accessories"]

    resultwords  = [word for word in reviews.split() if word.lower() not in categories]
    reviews = ' '.join(resultwords)


    #Remove feature-related words
    features = ["ability", "quality", "feature", "aspect", "abilities", "qualities", "features", "aspects"]
    resultwords  = [word for word in reviews.split() if word.lower() not in features]
    reviews = ' '.join(resultwords)
    
    return reviews

In [42]:
reviews = clean_text(reviews)

In [43]:
text_file = open("processedreviews.txt", "w")
text_file.write(reviews)
text_file.close()

# Open Processed Reviews 

In [44]:
with open("processedreviews.txt","r") as f:
    reviews = f.read()

In [45]:
reviews

'nephew loves fact waterproof finding black friday deal buys series amazing health readings attention replaced gem durability battery life day hunt stopped charging bummed buy glad faster clear super pleased chargers work great versa price chargers highly recommend buying excellent space gray serves great leaves access switch easily wifes lasted years great hoping years husband wanted airy wear working garden likes lot love everyday learn wanted needed lot money item perfect imagine spending true love love love big game changer notes ipad bought relative series love enjoying learning bought needed sport bought cheaper fell broke bought trusted brand forerunner gps heart rate monitor running black excellent convenient heath wellness versa great affordable attractive love stylish exercises heart rate burning calories loved price compared watchesnneven previous generation functions versa wanted large display menu options color salesman patient friendly knowledgeable wanted love watchnthan

In [46]:
tokens = nltk.word_tokenize(reviews)

#Create your bigrams
bgs = nltk.bigrams(tokens)

#compute frequency distribution for all the bigrams in the text
fdist = nltk.FreqDist(bgs)
# for k,v in fdist.items():
#     print(k,v)

In [47]:
df1 = pd.DataFrame(fdist.items())
df1.columns = ['Bigrams', 'Frequencies']
df1.sort_values(by=['Frequencies'], ascending=False).head()

Unnamed: 0,Bigrams,Frequencies
17,"(battery, life)",51
36,"(highly, recommend)",35
484,"(works, great)",32
74,"(love, love)",18
1022,"(great, love)",17


In [48]:
df2 = df1.loc[df1['Frequencies'] >= 10]
df2.sort_values(by=['Frequencies'], ascending=False)

Unnamed: 0,Bigrams,Frequencies
17,"(battery, life)",51
36,"(highly, recommend)",35
484,"(works, great)",32
74,"(love, love)",18
99,"(heart, rate)",17
1022,"(great, love)",17
1315,"(great, product)",17
804,"(absolutely, love)",14
562,"(great, price)",13
150,"(fits, perfectly)",12


In [49]:
df2.to_csv('TopFrequentBigrams.csv', index=False)

# Identify Keywords with Negators and Remove Review

In [50]:
from re import split
def split_string(string, delimiters):
    string = string.replace(".", "," ) 
    string = string.replace("?", "," )
    pattern = r'|'.join(delimiters)
    return split(pattern, string)

In [51]:
r = "Python is fun. The string is for test because python run the code once you execute it"

#conjunctions and punctuations are based on the Buschken paper, excpet the '(' and ')'
d = ['& ', ', ','; ','! ',': ','for ','and ', 'but ', 'or ', 'so ',
     'after ', 'as ', 'because ', 'before ', 'even ', 'if ', 'now ', 'once ', 
     'since ', 'than ', 'that ', 'thought ', 'when ', 'where ', 'which ', 
     'while ', 'who ', 'what ']

new_string = split_string(r, d)
print(new_string)

['Python is fun', 'The string is ', 'test ', 'python run the code ', 'you execute it']


In [52]:
#custom list of stop words without conjunctions
stop_words_english = []
with open('stop_words_english_conjunctions_negations.txt', encoding="utf8") as my_file:
    for line in my_file:
        stop_words_english.append(line.replace("\n", ""))

In [53]:
def clean_text(docs):
    for i in range(len(docs)):
        #remove hashtags and mentions
        docs[i] = " ".join(filter(lambda x:x[0]!='#', docs[i].split())) #remove hashtags
        docs[i] = " ".join(filter(lambda x:x[0]!='@', docs[i].split())) #remove mentiones
        
        #remove special chars, anything not a number or letter, anything excpet alphanumeric words
        docs[i] = re.sub(r'[^\w\s]','',docs[i]) #remove punctuation marks and special chars
        docs[i] = re.sub('[^A-Za-z0-9]+', ' ', docs[i]) #remove anything that is not a number or or letter
        docs[i] = re.sub('[\W_]', ' ', docs[i]) #Remove any thing excpet charcater and alphanumeric words
        docs[i] = re.sub(r'\w*\d\w*', '', docs[i]) #Remove any thing with numbers
        docs[i] = ' '.join(word for word in docs[i].split() if not word.isdigit())
        
        #compare words to englisg dictionary and remove nonsense words
        words = [word for word in docs[i].split() if word.lower() in dictionary_words or not word.isalpha()]
        docs[i] = " ".join(words)
        
        #remove words that consists of less than three letter
        words = [word for word in docs[i].split() if len(word)>2]
        docs[i] = " ".join(words)
                             
        #Remove stop words using custom list of stop words
        words = [word for word in reviews[i].split() if word.lower() not in stop_words_english]
        reviews[i] = " ".join(words)

        #Remove brands related keywords
        brands = ["fitbit ace", "fitbit charge", "fitbit verca", "fitbit ionic",  "fitbit versa lite",  
                  "fitbit inspire",  "fitbit versa",  "garmin forerunner",  "garmin fēnix",  "garmin vívoki",  
                  "garmin vívofit",  "garmin vívoactive",  "garmin vívomove",  "garmin vívosmart",  
                  "garmin vívosport",  "garmin venu",  "garmin instinct",  "garmin luxe",  "garmin darth vader",  
                  "garmin captain marvel",  "garmin approach",  "garmin lily",  "garmin swim",  "xiaomi mi band",  
                  "xiaomi mi samrt band",  "moov now",  "moov hr",  "apple watch",  "fossil gen",  
                  "fossil sport smartwatch",  "fossil hybrid smartwatch hr",  "misfit vap, ",  "misfit ray",  
                  "misfit path",  "misfit command",  "withings steel",  "withings move",  "withings pulds",  
                  "withings move ecg",  "ihealth watch",  "samsung gear",  "samsung galaxy watch active",  
                  "samsung galaxy fit",  "polar a360",  "polar a370",  "polar m430",  "polar m200",  "polar h10",  
                  "polar vantage",  "polar ignite",  "polar grit x",  "polar titan",  "polar oh1",  "polar h9",  
                  "striiv fusion",  "striiv apex hr",  "striiv dash hr",  "huawei talkband",  " huawei watch",  
                  "huawei band",  "mykronoz zewatch",  "mykronoz zefit",  "mykronoz zesport",  "mykronoz zetime",  
                  "mykronoz zefit",  "mykronoz zeround",  "mykronoz zeneo",  "mykronoz zetrack",  "coros apex",  
                  "wyze band",  "letsfit fitness racker",  "coros pace",  "withings scanwatch",  "amazon halo",  
                  "timex ironman",  "suunto peak",  "wahoo fitness", "fitbit", "garmin", "apple", "xiaomi", "moov",
                  "fossil", "misfit", "withings", "ihealth", "samsung", "polar", "striiv", "huawei", "mykronoz", 
                  "coros", "wyze", "letsfit", "amazon"]

        resultwords  = [word for word in reviews[i].split() if word.lower() not in brands]
        reviews[i] = ' '.join(resultwords)


        #Remove category-related words
        categories = ["watch", "watches", "smart watch", "smartwatch", "band", "bands", "smartband", 
                      "smart band", "smartbands", "smart bands", "case", "cases", "cover", "protective", 
                      "protector", "screen", "show", "iphone", "express", "appleevent", "shield", 
                      "show", "appltv", "gear vr", "tv", "shoe", "mountain", "mountains", "beach", 
                      "appltv", "accessory", "accessories"]

        resultwords  = [word for word in reviews[i].split() if word.lower() not in categories]
        reviews[i] = ' '.join(resultwords)


        #Remove feature-related words
        features = ["ability", "quality", "feature", "aspect", "abilities", "qualities", "features", "aspects"]
        resultwords  = [word for word in reviews[i].split() if word.lower() not in features]
        reviews[i] = ' '.join(resultwords)

    return docs

In [54]:
reviews = np.array(df['Full Text'])
processed_reviews = clean_text(reviews)

In [55]:
processed_reviews

array(['nephew fact that waterproof finding black deal',
       'series amazing but that health what attention',
       'gem durability and battery life day hunt',
       'stopped charging and bummed buy but glad that faster and clear',
       'super work great for price highly recommend',
       'excellent for space gray and great leaves access that switch easily',
       'great that', 'and husband airy wear when working garden lot',
       'love everyday learn but for lot money',
       'item perfect imagine spending for true love love love big game changer',
       'bought for relative series love and enjoying and learning bought',
       'sport and bought and fell and broke bought and brand',
       'forerunner heart rate monitor running black excellent convenient heath wellness',
       'great affordable attractive',
       'love stylish and for cant heart rate and burning',
       'price though previous generation', 'and large display menu color',
       'salesman patient friendl

In [56]:
d = ['& ', ', ','; ','! ',': ','for ','and ', 'but ', 'or ', 'so ',
     'after ', 'as ', 'because ', 'before ', 'even ', 'if ', 'now ', 'once ', 
     'since ', 'than ', 'that ', 'thought ', 'when ', 'where ', 'which ', 
     'while ', 'who ', 'what ']

for i in range(len(processed_reviews)):
    processed_reviews[i] = split_string(processed_reviews[i], d)

processed_reviews

array([list(['nephew fact ', 'waterproof finding black deal']),
       list(['series amazing ', '', 'health ', 'attention']),
       list(['gem durability ', 'battery life day hunt']),
       list(['stopped charging ', 'bummed buy ', 'glad ', 'faster ', 'clear']),
       list(['super work great ', 'price highly recommend']),
       list(['excellent ', 'space gray ', 'great leaves access ', 'switch easily']),
       list(['great that']),
       list(['', 'husb', 'airy wear ', 'working garden lot']),
       list(['love everyday learn ', '', 'lot money']),
       list(['item perfect imagine spending ', 'true love love love big game changer']),
       list(['bought ', 'relative series love ', 'enjoying ', 'learning bought']),
       list(['sport ', 'bought ', 'fell ', 'broke bought ', 'brand']),
       list(['forerunner heart rate monit', 'running black excellent convenient heath wellness']),
       list(['great affordable attractive']),
       list(['love stylish ', '', 'cant heart rate '

In [57]:
print(type(processed_reviews))

<class 'numpy.ndarray'>


In [58]:
df1 = pd.DataFrame(processed_reviews)
df1.columns = ['Full Text']
df1.head()

Unnamed: 0,Full Text
0,"[nephew fact , waterproof finding black deal]"
1,"[series amazing , , health , attention]"
2,"[gem durability , battery life day hunt]"
3,"[stopped charging , bummed buy , glad , faster..."
4,"[super work great , price highly recommend]"


In [3]:
############## Check Terms ####################

#terms = ['easy to use', 'simple', 'straightforward', 'user-friendly', 'intuitive']

def checkterms(string, termsvalues):
    checkterm = False
    for i in range(len(termsvalues)):
        if (string.find(termsvalues[i]) >= 0):
            checkterm = True
    return checkterm
 
#checkterms('This is easy to use hello', terms)

True

In [5]:
############## Check Negators ####################
def checknegators(string):
    negatorvalues = ["ain't", "aren't", "cannot", "cant", "can't", "couldn't", "daren't", "didn't", 
                     "don't", "hasn't","haven't", "isn't", "mayn't","mightn't","mustn't","needn't",
                     "not","oughtn't","shan't","shouldn't","weren't", "won't", "wouldn't"]
    checknegator = False
    for i in range(len(negatorvalues)):
        if (string.find(negatorvalues[i]) >= 0):
            checknegator = True
    return checknegator
 
#checknegators('This is hello simple')

False

In [155]:
############## Encode Reviews####################


In [170]:
#myArray = ['easy to use watch', 'health ', 'attention']

#ease_of_use_terms = ['simple', 'easy to use',  'straightforward', 'user-friendly', 'intuitive']


def encodereviews(reviewarray, qualityterms):
    encode = 0
    for i in reviewarray:
        if((checknegators(i) == False) and (checkterms(i, qualityterms) == True)):
            encode = 1
        else:
            continue
    return encode    

In [171]:
#encodereviews(myArray, ease_of_use_terms)

In [172]:
df1["Ease of Use"] = ""
df1.head()

Unnamed: 0,Full Text,Ease of Use
0,"[nephew fact , waterproof finding black deal]",
1,"[series amazing , , health , attention]",


In [178]:
ease_of_use_terms = ['easy to use', 'simple', 'straightforward', 'user-friendly', 'intuitive']

for i in range(len(df1['Full Text'])):
    #print(df1['Full Text'][i])
    if(encodereviews(df1['Full Text'][i], ease_of_use_terms) == 1):
        df1.at[i,'Ease of Use'] = 1
    else:
        df1.at[i,'Ease of Use'] = 0

In [179]:
df1

Unnamed: 0,Full Text,Ease of Use
0,"[nephew fact , waterproof finding black deal]",0
1,"[series amazing , , health , attention]",0
2,"[gem durability , battery life day hunt]",0
3,"[stopped charging , bummed buy , glad , faster...",0
4,"[super work great , price highly recommend]",0
...,...,...
995,"[love phone , return bulky , great]",0
996,"[perfect fit easy install durable comparable ,...",0
997,"[sport loop , , favorite style lightweight com...",0
998,"[interact day long returned upgrade series , w...",0


In [180]:
df1.to_csv('out1.csv')

In [182]:
#ease_of_use_terms = ['easy to use', 'simple', 'straightforward', 'user-friendly', 'intuitive']
df1.loc[df1['Ease of Use'] == 1]

Unnamed: 0,Full Text,Ease of Use
77,"[love smart simple , , totally worth price]",1
109,"[everyday exception , , works , super intuitive]",1
111,"[love , simplicity amazing simple , understand]",1
125,"[, great , affordable alternative style simple...",1
245,"[sharp simple clean design easy perfectly , wr...",1
332,"[sharp simple clean design charge perfectly , ...",1
345,"[, , , , decided process simple , happier]",1
346,[simple box feel phone properly even],1
449,"[worth buy simple beautiful , great , and]",1
484,"[love price simple bought , stretch]",1
