### General idea
1. Clean the dataset => `dfCleaned`
2. Vectorize the review words => bag-of-words counts, re-weighted with TF-IDF
3. Fit a logistic regression on the vectorized words to predict the review rating,
    bucketed into five classes: 0 (0, .1), 1 (.2, .3), 2 (.4, .5), 3 (.6, .7), 4 (.8, .9, 1)

In [35]:
# Read data set and stop words
import pandas as pd
import re 
import nltk
# Fetch the NLTK stop-word corpus used by the cleaning step below.
nltk.download('stopwords')
# Plain relative file name: the previous r'.\AppReview.csv' only resolved on
# Windows, because '\' is not a path separator on other platforms.
df = pd.read_csv('AppReview.csv')
# Number of rows read, shown as the cell output.
len(df.index)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danxg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


111143

In [36]:
# Corpus of stop words
from nltk.corpus import stopwords

In [37]:
# Processes a review and returns its cleaned words (as one string by default, or as a list)
def review_to_words(review, string = True, remove_stopwords=True):
    """Normalise a raw review into lower-case, letters-only words.

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (for example ``None`` or the
        float NaN that marks a missing cell) is treated as an empty review
        instead of raising ``TypeError`` inside ``re.sub``.
    string : bool, default True
        When true, return the words joined by single spaces; otherwise return
        them as a list.
    remove_stopwords : bool, default True
        When true, drop the NLTK English stop words.

    Returns
    -------
    str or list of str
        The cleaned words, in their original order.
    """
    # Missing / non-text input contributes no words.
    if not isinstance(review, str):
        review = ""
    # Replace every non-letter (digits, punctuation, non-ASCII) with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review)
    # Lower-case, then split on runs of whitespace.
    words = letters_only.lower().split()
    if remove_stopwords:
        # Build the stop-word set once and reuse it on later calls: this
        # function is applied to every review, so rebuilding it per call is
        # wasted work.
        stops = getattr(review_to_words, "_english_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            review_to_words._english_stops = stops
        words = [w for w in words if w not in stops]
    return " ".join(words) if string else words

In [38]:
#Clean up text
#Remove non-ascii text
#Remove all rows missing reviewerName
def fixString(x):
    """Encode ``x`` as ASCII bytes, silently dropping characters that are not ASCII."""
    ascii_bytes = x.encode('ascii', errors='ignore')
    return ascii_bytes

# Normalise the review text. Missing reviews become "" first so that they are
# cleaned to an empty string and dropped by the filter below.
df["reviewText"] = df["reviewText"].fillna("").map(review_to_words)

# Strip non-ASCII characters from reviewer names. Missing names are mapped to ""
# *before* the str() conversion: str(NaN) would otherwise yield the non-empty
# text "nan" and the row would survive the "missing reviewerName" filter.
df["reviewerName"] = df["reviewerName"].fillna("").map(
    lambda name: str(name).encode('ascii', errors='ignore').decode()
)

# Keep only rows whose cleaned text and reviewer name are both non-blank.
# Both masks are built from (and applied to) the same frame, so no implicit
# index re-alignment is needed.
has_text = df['reviewText'].str.strip().astype(bool)
has_name = df['reviewerName'].str.strip().astype(bool)
dfCleaned = df[has_text & has_name]

# Display the cleaned frame as the cell output.
dfCleaned

Unnamed: 0,appID,reviewerName,reviewText,reviewerRating,reviewDate,textAnalytics
0,3,Eric Hansen,love well worth money full version came ad blo...,1.0,2017/07/07 00:00:00,
1,3,Jacob N.,awful bug allow use space bar want type search...,0.4,2017/08/29 00:00:00,
2,3,Higgins Family,would stars except bugs example incognito tab ...,0.8,2017/10/02 00:00:00,
3,3,Rajko Dikmann,worked perfect weeks ago browsing experience s...,0.4,2017/09/28 00:00:00,
4,3,Sergei Garcia,hands best browser play store even flagship de...,1.0,2017/07/09 00:00:00,
...,...,...,...,...,...,...
111138,343,heera d,good less power consumer full review,0.8,1-September-2013,
111139,343,Maulik Upadhyay,xperia pro nice change would nice full review,0.8,21-August-2013,
111140,343,Lezlie Coleman,willl full review,0.6,24-September-2013,
111141,343,Taher Bhai,great wonderful app samsung galaxy full review,1.0,14-August-2013,


In [39]:
dfCleaned[['reviewText']]

Unnamed: 0,reviewText
0,love well worth money full version came ad blo...
1,awful bug allow use space bar want type search...
2,would stars except bugs example incognito tab ...
3,worked perfect weeks ago browsing experience s...
4,hands best browser play store even flagship de...
...,...
111138,good less power consumer full review
111139,xperia pro nice change would nice full review
111140,willl full review
111141,great wonderful app samsung galaxy full review


## Vectorize words
### This is based on 
https://towardsdatascience.com/sentiment-analysis-a-how-to-guide-with-movie-reviews-9ae335e6bcb2
With actual logistic regression:
https://towardsdatascience.com/sentiment-classification-with-logistic-regression-analyzing-yelp-reviews-3981678c3b44


In [40]:
from sklearn.model_selection import train_test_split

# Only the cleaned review text is vectorised; the ratings are looked up again
# later through the shared row index.
dfReviews = dfCleaned['reviewText']
# 90/10 train/test split. The fixed seed makes the split, and therefore every
# score reported further down, reproducible across kernel restarts.
dfTrain, dfTest = train_test_split(dfReviews, test_size=.1, random_state=0)

In [41]:
# import statements
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Initialize a bag of words, capped at 1000 vocabulary terms (max_features).
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 1000)

# Learn the vocabulary on the training split only, then reuse it for the test
# split. The count matrices are left in the sparse form the vectorizer returns:
# a dense copy would take rows x 1000 cells of memory, and both TfidfTransformer
# and scipy.sparse.coo_matrix accept sparse input directly.
train_feat = vectorizer.fit_transform(dfTrain)
test_feat = vectorizer.transform(dfTest)

# Fit the IDF weights on the training counts ...
tfidf_transformer = TfidfTransformer().fit(train_feat)
train_tfidf = tfidf_transformer.transform(train_feat)

# ... and apply the same weights to the test counts.
test_tfidf = tfidf_transformer.transform(test_feat)


In [42]:
# Training rows joined (on the row index) with their reviewer rating.
trainYdata = pd.merge(dfTrain.to_frame(), dfCleaned[['reviewerRating']], left_index=True, right_index=True)
# Vocabulary terms, indexed by feature column.
# get_feature_names() has been removed from recent scikit-learn releases in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    featureNamesList = vectorizer.get_feature_names_out().tolist()
else:
    featureNamesList = vectorizer.get_feature_names()

In [43]:
# Peek at the training reviews: print the first 11 (index, text) pairs.
stopcounter = 0
for stopcounter, (index, value) in enumerate(dfTrain.items(), start=1):
    print(f"Index : {index}, Value : {value}")
    # Stop once the 11th pair has been printed.
    if stopcounter > 10:
        break

Index : 8621, Value : optimal cool full review
Index : 40054, Value : bug show dot menu devices screen navigation bar full review
Index : 33591, Value : certificate app great need help create certificate try connect says certificate required connect server full review
Index : 31036, Value : nearly perfect far best hn app found minor issues app crashes extremely long comment threads load two pages articles share option browsers targets full review
Index : 34921, Value : best part like launcher everyone know know use keep asking use devices lol full review
Index : 39121, Value : nice work well phone full review
Index : 14464, Value : best app audiobooks installed google play able give stars already f droid full review
Index : 24447, Value : simple reliable full review
Index : 26979, Value : woo love app man added widget thanks much make switch baconreader thanks taking request consideration best dev ever full review
Index : 80120, Value : add double tap wake full review
Index : 86623, Va

In [44]:
# Dump the vectorized words of the first training review: raw counts, then tf-idf weights
import scipy.sparse

# get_feature_names() has been removed from recent scikit-learn releases in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    featureNamesList = vectorizer.get_feature_names_out().tolist()
else:
    featureNamesList = vectorizer.get_feature_names()

# COO form exposes parallel (row, col, value) arrays that are easy to walk.
cx = scipy.sparse.coo_matrix(train_feat)
cx2 = scipy.sparse.coo_matrix(train_tfidf)


def _print_first_row(coo, names):
    """Print ``(row, col), term = value`` for the leading entries that belong to row 0."""
    for i, j, v in zip(coo.row, coo.col, coo.data):
        # Relies on row 0's entries being stored first; stop at the first
        # entry that belongs to another row.
        if i != 0:
            break
        print("(%d, %d), %s = %s" % (i, j, names[j], v))


print(dfTrain.iloc[0])
_print_first_row(cx, featureNamesList)

print("\r\n\r\nThe weighted results\r\n")
_print_first_row(cx2, featureNamesList)

optimal cool full review
(0, 165), cool = 1
(0, 329), full = 1
(0, 727), review = 1


The weighted results

(0, 727), review = 0.18351936503364819
(0, 329), full = 0.18351936503364819
(0, 165), cool = 0.9657335477839077


In [45]:
# Look at some more data: tf-idf weights of the first test review
import scipy.sparse

# get_feature_names() has been removed from recent scikit-learn releases in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    featureNamesList = vectorizer.get_feature_names_out().tolist()
else:
    featureNamesList = vectorizer.get_feature_names()

# COO form exposes parallel (row, col, value) arrays that are easy to walk.
cx = scipy.sparse.coo_matrix(test_tfidf)

print(dfTest.iloc[0])
for i,j,v in zip(cx.row, cx.col, cx.data):
    # Relies on row 0's entries being stored first; stop at the first entry
    # that belongs to another row.
    if i!=0:
        break
    print("(%d, %d), %s = %s" % (i,j,featureNamesList[j], v))


works perfectly first try getting firewall mappings set full review
(0, 986), works = 0.23358367392553098
(0, 898), try = 0.357978945729349
(0, 768), set = 0.35878874929416504
(0, 727), review = 0.06746594947988933
(0, 630), perfectly = 0.37730389166170025
(0, 346), getting = 0.3911631890718122
(0, 329), full = 0.06746594947988933
(0, 307), first = 0.3642483367150333
(0, 306), firewall = 0.5014113565166582


### Modelling part
1. Leverage the raw vector count and the tf-idf weighted version

In [46]:
# # Just looking at some data
# train_tfidf
# dfTrain
# print(df.iloc[105146])
# dfTrain
# df
# train_tfidf.todense()
# dfTrain.to_frame()

In [47]:
# Attach the reviewer rating to the train and test reviews by joining on the row index
_ratings = dfCleaned[['reviewerRating']]
trainYdata = dfTrain.to_frame().merge(_ratings, left_index=True, right_index=True)
testYdata = dfTest.to_frame().merge(_ratings, left_index=True, right_index=True)

In [48]:
# Get trained X data and test X data
# The tf-idf matrices are used as-is (sparse). The previous .todense() call
# produced numpy.matrix objects, which recent scikit-learn estimators refuse
# as input, and it materialised every zero cell for no benefit:
# LogisticRegression accepts sparse matrices, and the row slicing used when
# checking predictions works on them too.
trainXdata = train_tfidf
testXdata = test_tfidf


In [49]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [53]:
# Train data - bucket each rating into one of the integer classes 0,1,2,3,4
def _rating_bucket(rating):
    """Return the index of the first upper bound that ``rating`` is below, or 4 if none."""
    for bucket, upper in enumerate((.2, .4, .6, .8)):
        if rating < upper:
            return bucket
    return 4


y = trainYdata[['reviewerRating']]
y_int = trainYdata['reviewerRating'].apply(_rating_bucket)

X = trainXdata

clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X, y_int)

# Check trained accuracy (only the final score is displayed by the cell)
clf.predict(X[:2, :])
clf.predict_proba(X[:2, :])
clf.score(X, y_int)


0.8203898141443399

### Check test accuracy

In [54]:
# Bucket the test ratings into classes 0-4: index of the first upper bound the rating is below, else 4
_bucket_upper_bounds = (.2, .4, .6, .8)
y_int_test = testYdata['reviewerRating'].apply(
    lambda rating: next(
        (bucket for bucket, upper in enumerate(_bucket_upper_bounds) if rating < upper),
        4,
    )
)

In [55]:
# Predicted class for every test review (result is not stored or displayed).
clf.predict(testXdata)
# Per-class probabilities for every test review (result is not stored or displayed).
clf.predict_proba(testXdata)
# Score of the classifier on the test set; as the last expression, this is the cell output.
clf.score(testXdata, y_int_test)

0.8172519913106444