<center>
<h1> BANK REVIEWS COMPLAINT ANALYSIS
</center>

In [32]:
#import required packages
#basics
import pandas as pd 
import numpy as np

#misc
import gc
import time
import warnings

#stats
#from scipy.misc import imread
from scipy import sparse
import scipy.stats as ss

#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image
#import matplotlib_venn as venn

#nlp
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords

#import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize

# Tweet tokenizer does not split at apostophes which is what we want
from nltk.tokenize import TweetTokenizer   

from imblearn.over_sampling import RandomOverSampler

#FeatureEngineering
#!pip install lightgbm
#from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, decomposition, ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import  textblob
#import xgboost
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

from textblob import TextBlob
from nltk.stem import PorterStemmer
import nltk
#nltk.download('wordnet')
from textblob import Word

#settings
start_time=time.time()
color = sns.color_palette()
sns.set_style("dark")
eng_stopwords = set(stopwords.words("english"))
warnings.filterwarnings("ignore")

lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()

%matplotlib inline

### Import Dataset

In [41]:
bankreviews = pd.read_excel("BankReviews.xlsx")

In [42]:
bankreviews.head()

Unnamed: 0,Date,Stars,Reviews,BankName
0,2017-04-10,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
1,2017-02-10,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage
2,2017-08-21,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
3,2017-12-17,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage
4,2016-05-27,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage


In [43]:
bankreviews=bankreviews[['Stars', 'Reviews', 'BankName']]

In [44]:
bankreviews.head()

Unnamed: 0,Stars,Reviews,BankName
0,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
1,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage
2,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
3,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage
4,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage


### Basic Exploratory Analysis

In [37]:
df = bankreviews

In [38]:
df['Reviews'] = df['Reviews'].astype(str)
df['count_sent']=df["Reviews"].apply(lambda x: len(re.findall("\n",str(x)))+1)

#Word count in each comment:
df['count_word']=df["Reviews"].apply(lambda x: len(str(x).split()))

#Unique word count
df['count_unique_word']=df["Reviews"].apply(lambda x: len(set(str(x).split())))

#Letter count
df['count_letters']=df["Reviews"].apply(lambda x: len(str(x)))

#Word density

df['word_density'] = df['count_letters'] / (df['count_word']+1)

#punctuation count
df["count_punctuations"] =df["Reviews"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

#upper case words count
df["count_words_upper"] = df["Reviews"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

#upper case words count
df["count_words_lower"] = df["Reviews"].apply(lambda x: len([w for w in str(x).split() if w.islower()]))

#title case words count
df["count_words_title"] = df["Reviews"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

#Number of stopwords
df["count_stopwords"] = df["Reviews"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

#Average length of the words
df["mean_word_len"] = df["Reviews"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

#Number of numeric
df['numeric'] = df['Reviews'].apply(lambda x :len([x for x in x.split() if x.isdigit()]))

#Number of alphanumeric
df['alphanumeric'] = df['Reviews'].apply(lambda x :len([x for x in x.split() if x.isalnum()]))

#Number of alphabetics
df['alphabetetics'] = df['Reviews'].apply(lambda x :len([x for x in x.split() if x.isalpha()]))

#Number of alphabetics
df['Spaces'] = df['Reviews'].apply(lambda x :len([x for x in x.split() if x.isspace()]))

#Number of Words ends with
df['words_ends_with_et'] = df['Reviews'].apply(lambda x :len([x for x in x.lower().split() if x.endswith('et')]))

#Number of Words ends with
df['words_start_with_no'] = df['Reviews'].apply(lambda x :len([x for x in x.lower().split() if x.startswith('no')]))

# Count the occurences of all words
df['wordcounts'] = df['Reviews'].apply(lambda x :dict([ [t, x.split().count(t)] for t in set(x.split()) ]))

pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

df['noun_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'noun'))
df['verb_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'verb'))
df['adj_count']  = df['Reviews'].apply(lambda x: check_pos_tag(x, 'adj'))
df['adv_count']  = df['Reviews'].apply(lambda x: check_pos_tag(x, 'adv'))
df['pron_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'pron')) 

### Calculating Sentiment analysis using Textblob module

In [45]:
bankreviews['sentiment'] = bankreviews["Reviews"].apply(lambda x: TextBlob(x).sentiment.polarity )

In [46]:
bankreviews.head()

Unnamed: 0,Stars,Reviews,BankName,sentiment
0,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage,0.533333
1,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage,0.453333
2,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage,-0.033231
3,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage,0.09374
4,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage,0.125


In [47]:
bankreviews.Stars.value_counts()

5    410
1     95
Name: Stars, dtype: int64

### Split the data into train & Test

In [52]:
# create a new DataFrame that only contains the 5-star and 1-star reviews
#yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# define X and y
X = bankreviews.Reviews
y = bankreviews.Stars

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(378,)
(127,)
(378,)
(127,)


In [25]:
bankreviews.shape

(505, 4)

In [26]:
bankreviews.head()

Unnamed: 0,Stars,Reviews,BankName,sentiment
0,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage,0.533333
1,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage,0.453333
2,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage,-0.033231
3,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage,0.09374
4,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage,0.125


### Creating user defined functions for clean the text and pre-process the data

In [48]:
#Abbrevations and Words correction
def clean_text(Reviews):
    Reviews = Reviews.lower()
    Reviews = Reviews.strip()
    Reviews = re.sub(r' +', ' ', Reviews)
    Reviews = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", Reviews)
    return(Reviews)

In [49]:
stop = set(nltk.corpus.stopwords.words('english'))

nltk.corpus.stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [50]:
import string
def pre_process(Reviews):
    Reviews = Reviews.apply(lambda x: " ".join(x for x in x.split() if x not in stop)) #Removing stop words
    return(Reviews)

In [53]:
X_train = X_train.apply(lambda x: clean_text(x))
X_test = X_test.apply(lambda x: clean_text(x))

In [54]:
X_train=pre_process(X_train)
X_test=pre_process(X_test)

### Vectorization (Count, Tfidf, Hashing)
 - Charter level
 - Word level
 - n-grams

In [55]:
#Train
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             ngram_range=(1, 1 ), 
                             min_df=0.01, 
                             encoding='latin-1' ,
                             max_features=800)
xtrain_count = count_vect.fit_transform(X_train)

In [56]:
xtrain_count

<378x732 sparse matrix of type '<class 'numpy.int64'>'
	with 9755 stored elements in Compressed Sparse Row format>

### View the document term metrics

In [57]:
dtm=xtrain_count.toarray()

In [58]:
dtm

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [59]:
count_vect.get_feature_names()

['aaron',
 'ability',
 'able',
 'absolutely',
 'accept',
 'accommodating',
 'account',
 'accurate',
 'achieve',
 'across',
 'actual',
 'actually',
 'adam',
 'adan',
 'additional',
 'advice',
 'agent',
 'agreed',
 'alex',
 'almost',
 'along',
 'already',
 'also',
 'always',
 'amazing',
 'american',
 'amount',
 'another',
 'answer',
 'answered',
 'answering',
 'answers',
 'anthony',
 'anyone',
 'anything',
 'application',
 'apply',
 'appraisal',
 'appraiser',
 'appreciate',
 'appreciated',
 'approved',
 'around',
 'ask',
 'asked',
 'asking',
 'aspects',
 'asset',
 'attention',
 'attentive',
 'attorney',
 'available',
 'away',
 'awesome',
 'back',
 'bad',
 'balance',
 'bank',
 'banks',
 'barrett',
 'based',
 'basis',
 'beat',
 'became',
 'began',
 'beginning',
 'believe',
 'beneficial',
 'best',
 'better',
 'beyond',
 'big',
 'bob',
 'brent',
 'broker',
 'brokers',
 'bumps',
 'business',
 'buy',
 'buyer',
 'buyers',
 'buying',
 'ca',
 'call',
 'called',
 'calling',
 'calls',
 'calm',
 'ca

In [60]:
dtm1=pd.DataFrame(dtm)

In [61]:
dtm1.columns=count_vect.get_feature_names()

In [62]:
dtm1

Unnamed: 0,aaron,ability,able,absolutely,accept,accommodating,account,accurate,achieve,across,...,worth,would,wouldnt,writing,written,wrong,wyndham,year,years,yet
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
374,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
375,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
376,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Vectorization (count, tfidf) for both train & test

In [63]:
#Train
count_vect = CountVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}', 
                             ngram_range=(1, 1 ), 
                             min_df=5, 
                             encoding='latin-1' , 
                             max_features=800)

xtrain_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(xtrain_count)

#Test
#count_vect = CountVectorizer()
xtest_count = count_vect.transform(X_test)

#tfidf_transformer = TfidfTransformer()
X_test_tfidf = tfidf_transformer.transform(xtest_count)


In [64]:
dtm2=pd.DataFrame(X_train_tfidf.toarray(), columns=count_vect.get_feature_names())

In [65]:
dtm2.head()

Unnamed: 0,aaron,able,absolutely,accept,accommodating,account,accurate,across,actual,adam,...,working,works,would,wouldnt,writing,wrong,wyndham,year,years,yet
0,0.0,0.0,0.0,0.133259,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.123737,0.0,0.0,0.0,0.0,0.09625,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.194406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.130702,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.121363,0.0,0.0,0.0,0.0,0.094403,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern='\w{1,}', ngram_range=(1, 2), max_features=800)
tfidf_vect_ngram.fit(df['Reviews'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(X_test)

### Create user defined function for train the models

In [67]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid,  valid_y):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    return metrics.accuracy_score(classifier.predict(feature_vector_train), label), metrics.accuracy_score(predictions, valid_y)

### Building different models with different vectors

In [68]:
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  1   5]
 [ 70 308]]


In [69]:
ros = RandomOverSampler(random_state=123)

X_train_tfidf_os, y_train_tfidf_os = ros.fit_sample(X_train_tfidf, y_train)

X_train_cnt_os, y_train_cnt_os = ros.fit_sample(xtrain_count, y_train)

X_train_tfidf_ngram_os, y_train_tfidf_ngram_os = ros.fit_sample(xtrain_tfidf_ngram, y_train)

unique_elements, counts_elements = np.unique(y_train_tfidf_os, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  1   5]
 [308 308]]


## Naive Bayes

In [70]:
#Naive Bayes
# Naive Bayes on TF-IDF
accuracy_L1 = train_model(naive_bayes.MultinomialNB(), X_train_tfidf_os, y_train_tfidf_os, X_test_tfidf, y_test)
print("NB  for L1, Count Vectors: ", accuracy_L1)

# Naive Bayes on Word Level TF IDF Vectors
accuracy_L1 = train_model(naive_bayes.MultinomialNB(), X_train_cnt_os, y_train_cnt_os, xtest_count, y_test)
print("NB  for L1, WordLevel TF-IDF: ", accuracy_L1)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy_L1 = train_model(naive_bayes.MultinomialNB(), X_train_tfidf_ngram_os, y_train_tfidf_ngram_os, xtest_tfidf_ngram, y_test)
print("NB  for L1, N-Gram Vectors: ", accuracy_L1)

NB  for L1, Count Vectors:  (0.9756493506493507, 0.952755905511811)
NB  for L1, WordLevel TF-IDF:  (0.9724025974025974, 0.9606299212598425)
NB  for L1, N-Gram Vectors:  (0.974025974025974, 0.9448818897637795)


## Logistic Regression

In [71]:
#Logistic Regression
# Logistic Regression on Count Vectors and TF-IDF
accuracy_L1 = train_model(LogisticRegression(), X_train_tfidf_os, y_train_tfidf_os, X_test_tfidf, y_test)
print("LR  for L1, Count Vectors: ", accuracy_L1)



# Logistic Regression on Word Level TF IDF Vectors
accuracy_L1 = train_model(LogisticRegression(), X_train_cnt_os, y_train_cnt_os, xtest_count, y_test)
print("LR  for L1, WordLevel TF-IDF: ", accuracy_L1)



# Logistic Regression on Ngram Level TF IDF Vectors
accuracy_L1 = train_model(LogisticRegression(), X_train_tfidf_ngram_os, y_train_tfidf_ngram_os, xtest_tfidf_ngram, y_test)
print("LR  for L1, N-Gram Vectors: ", accuracy_L1)


LR  for L1, Count Vectors:  (0.9837662337662337, 0.968503937007874)
LR  for L1, WordLevel TF-IDF:  (0.9983766233766234, 0.9763779527559056)
LR  for L1, N-Gram Vectors:  (0.9756493506493507, 0.9606299212598425)


## Linear SVC

In [72]:
print(metrics.confusion_matrix(y_test, y_pred_rfc))

NameError: name 'y_pred_rfc' is not defined

In [59]:
#Linear SVC
# Linear SVC on Count Vectors and TF-IDF
accuracy_L1 = train_model(SVC(), X_train_tfidf_os, y_train_tfidf_os, X_test_tfidf, y_test)
print("SVC  for L1, Count Vectors: ", accuracy_L1)



# Linear SVC on Word Level TF IDF Vectors
accuracy_L1 = train_model(SVC(), X_train_cnt_os, y_train_cnt_os, xtest_count, y_test)
print("SVC  for L1, WordLevel TF-IDF: ", accuracy_L1)



# Linear SVC on Ngram Level TF IDF Vectors
accuracy_L1 = train_model(SVC(), X_train_tfidf_ngram_os, y_train_tfidf_ngram_os, xtest_tfidf_ngram, y_test)
print("SVC  for L1, N-Gram Vectors: ", accuracy_L1)

SVC  for L1, Count Vectors:  (0.9983766233766234, 0.937007874015748)
SVC  for L1, WordLevel TF-IDF:  (0.9756493506493507, 0.9448818897637795)
SVC  for L1, N-Gram Vectors:  (0.9983766233766234, 0.9448818897637795)


### Adding Features to a Document-Term Matrix

In [62]:
# create a DataFrame that only contains the 5-star and 1-star reviews

bankreviews['Stars'] = np.where(bankreviews['Stars']>3, 5, np.where(bankreviews['Stars']<3,1,3))

bankreviews = bankreviews[(bankreviews.Stars==5) | (bankreviews.Stars==1)]

# define X and y
feature_cols = ['Reviews', 'sentiment', 'BankName']
X = bankreviews[feature_cols]
y = bankreviews.Stars

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [64]:
# use CountVectorizer with text column only
vect = TfidfVectorizer(lowercase=True, stop_words='english', max_features=1000, min_df=5, ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train.Reviews)
X_test_dtm = vect.transform(X_test.Reviews)
print(X_train_dtm.shape)
print(X_test_dtm.shape)

# shape of other four feature columns
X_train.drop('Reviews', axis=1).shape

(378, 627)
(127, 627)


(378, 2)

In [65]:
# use CountVectorizer with text column only
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train.Reviews)
X_test_dtm = vect.transform(X_test.Reviews)
print(X_train_dtm.shape)
print(X_test_dtm.shape)

# shape of other four feature columns
X_train.drop('Reviews', axis=1).shape

(378, 2438)
(127, 2438)


(378, 2)

In [97]:
# cast other feature columns to float and convert to a sparse matrix
extra = sparse.csr_matrix(X_train.drop('Reviews', axis=1).astype(float))
extra.shape

# combine sparse matrices
X_train_dtm_extra = sparse.hstack((X_train_dtm, extra))
X_train_dtm_extra.shape

# repeat for testing set
extra = sparse.csr_matrix(X_test.drop('Reviews', axis=1).astype(float))
X_test_dtm_extra = sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape

AttributeError: drop not found

In [67]:
# use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.roc_auc_score(y_test, y_pred_class))

0.952755905511811
0.9101960784313725


In [98]:
# use logistic regression with all features
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm_extra, y_train)
y_pred_class = logreg.predict(X_test_dtm_extra)
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.roc_auc_score(y_test, y_pred_class))

NameError: name 'X_train_dtm_extra' is not defined

# Topic Modeling

In [69]:
bankreviews.head()

Unnamed: 0,Stars,Reviews,BankName,sentiment
0,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage,0.533333
1,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage,0.453333
2,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage,-0.033231
3,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage,0.09374
4,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage,0.125


In [72]:
data = bankreviews


In [74]:
#make a copy of columns I need from raw data
df1 = data.iloc[:, [0,1,2]]

In [75]:
df1.head()

Unnamed: 0,Stars,Reviews,BankName
0,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
1,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage
2,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
3,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage
4,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage


In [76]:
df1 = df1.rename(columns={"Stars":"Score","Reviews":"Text"})
df1.head()

Unnamed: 0,Score,Text,BankName
0,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
1,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage
2,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
3,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage
4,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage


In [77]:
df1['Bad'] = np.where(df1.Score<=4,1,0)

In [78]:
df1.head()

Unnamed: 0,Score,Text,BankName,Bad
0,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage,0
1,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage,0
2,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage,0
3,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage,0
4,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage,0


In [79]:
# convert text to lowercase
df1.loc[:, 'Text'] = df1['Text'].str.lower()

In [80]:
# get the tf-idf table 

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 0.05, max_df=0.95,
                             ngram_range=(1, 1), 
                             stop_words='english')
vectorizer.fit(df1['Text'])

TfidfVectorizer(max_df=0.95, min_df=0.05, stop_words='english')

In [81]:
# Create a training data
X_train = vectorizer.transform(df1['Text'])

### Segmentation

In [82]:
#Apply TfidfVectorizer to review text
from sklearn.cluster import KMeans
from sklearn import metrics

In [83]:
# we create a kmeans model
model = KMeans(n_clusters=6, 
               init='k-means++', 
               max_iter=100, n_init=1,random_state=5)

In [84]:
# apply the model on this prepared data
res = model.fit(X_train)
res

KMeans(max_iter=100, n_clusters=6, n_init=1, random_state=5)

In [85]:
# Getting the vocalbulary as well as cluster centers
# Firstly, set of words from the tf-idf itself
vocab = vectorizer.get_feature_names()
vocab = np.array(vocab)

In [86]:
# Then, the cluster centers from the model fit that we stored on res
cluster_centers = np.array(res.cluster_centers_)


In [87]:
#x=np.array((0.1,10,0.05,1.5))
#print(x)
#print(x.argsort())
res.cluster_centers_[0].argsort()

array([ 52,  47,  25,  24,  48,  22,  57,  64,  65,  68,  17,  74,  80,
        40,  14,  82,  83,  10,  86,  91,   7,  93,  96,   3,   2,   1,
        81,  29,  99, 105,  13,  34,  61,  59,  16,  15,  41,  18,  19,
        97,  63,  73,  71,  72,  85,  11,  60,  84,  31,  66, 104,   6,
        54,  45,   9,  58,  77,  21,  32,  78,  37, 103,  95,  33,  36,
        98,  28, 101,  87,  43,  38,  88,  39,  26, 100,  46,   8,  79,
        62,  20,  30,  49,  53,  90,  94,  92, 102,  55,   5,  42,  51,
        89,  67,  69,  70,  76,  50,   4,  75,  44,  27,  12,  56,  23,
        35,   0], dtype=int64)

In [88]:
# sorting the cluster centers
sorted_vals = [res.cluster_centers_[i].argsort() for i in range(0,np.shape(res.cluster_centers_)[0])]

In [89]:
# get top 10 words from that cluster
words=set()
for i in range(len(res.cluster_centers_)):
    words = set(vocab[sorted_vals[i][-10:]])
    print(words)

{'end', 'didn', 'getting', 'helpful', 'beginning', 'know', 'closed', 'loan', 'rate', 'able'}
{'mortgage', 'looking', 'service', 'recommend', 'process', 'loan', 'company', 'customer', 'provided', 'nasb'}
{'officer', 'knowledgeable', 'best', 'thank', 've', 'loan', 'process', 'responsive', 'great', 'worked'}
{'pleasure', 'easy', 'closing', 'great', 'process', 'got', 'work', 'time', 'nasb', 'team'}
{'hard', 'recommend', 'friendly', 'loan', 'professional', 'questions', 'responsive', 'highly', 'worked', 'team'}
{'did', 'helpful', 'closing', 'process', 'loan', 'home', 'questions', 'rate', 'time', 'read'}


In [90]:
words=list(words)

In [91]:
# how many observations are in each cluster

df1['cluster'] = model.labels_
df1.groupby('cluster').count()

Unnamed: 0_level_0,Score,Text,BankName,Bad
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,29,29,29,29
1,105,105,105,105
2,69,69,69,69
3,90,90,90,90
4,57,57,57,57
5,155,155,155,155


In [92]:
# what does each cluster look like
df1.groupby('cluster').mean()

Unnamed: 0_level_0,Score,Bad
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4.172414,0.206897
1,3.895238,0.27619
2,4.710145,0.072464
3,4.6,0.1
4,4.859649,0.035088
5,3.864516,0.283871


### Topic Modeling

In [93]:
# train a LDA Model

from sklearn import decomposition

lda_model = decomposition.LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=50)
X_topics = lda_model.fit_transform(X_train)
topic_word = lda_model.components_ 
vocab = vectorizer.get_feature_names()

In [94]:
# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

topic_summaries

['great process mortgage quick pleasure work definitely questions knowledgeable easy',
 'closing rate said lender told did loan weeks day credit',
 'step time way helpful make able helped loan took home',
 'got best work beginning answered couldn end calls thanks really',
 'thank loan communication happy future good sure business officer house',
 'process team service great loan home time best professional recommend',
 'responsive good extremely smooth process feel helpful really work service',
 'company loan calls mortgage phone officer refinance customer times things',
 'home amazing recommend hard helped experience working highly closed thanks',
 'weeks refinancing later closed told home company make recommend closing']