In [1]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from pylab import rcParams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
import pickle
from sklearn.neural_network import MLPClassifier

In [3]:
def wordSplit(rev, stop_words = stopwords.words('english')):
    """Tokenize a review into lowercase words with stop words removed.

    The text is lowercased and cut into alternating runs of digits and
    non-digits (so ``"room101"`` yields ``"room"`` and ``"101"``); each run
    is then split on whitespace. Punctuation is not stripped.

    Parameters
    ----------
    rev : str
        Raw review text.
    stop_words : iterable of str
        Tokens to drop. Defaults to NLTK's English stop-word list, which is
        loaded once, when this function is defined.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # NOTE: the NLTK corpus file id is lowercase 'english'; the capitalised
    # spelling only resolves on case-insensitive filesystems.
    tokens = []
    # Raw string so that \d / \D are regex classes, not string escapes.
    for chunk in re.findall(r'\d+|\D+', rev.lower()):
        tokens.extend(chunk.split())

    # Hash-based membership instead of scanning the stop-word list per token.
    stop_set = frozenset(stop_words)
    return [tok for tok in tokens if tok not in stop_set]

def bagOfWordsShingles(review, k=3, stop_words = stopwords.words('english')):
    """Return the word k-shingles (runs of k consecutive tokens) of a review.

    Parameters
    ----------
    review : str
        Raw review text; tokenized with ``wordSplit``.
    k : int
        Number of tokens per shingle (expected to be >= 1).
    stop_words : iterable of str
        Passed through to ``wordSplit``. Defaults to NLTK's English
        stop-word list (corpus id is the lowercase 'english').

    Returns
    -------
    list of str
        One space-joined string per shingle, in document order. When the
        review has fewer than ``k`` tokens, a single-element list holding
        all tokens joined by spaces (``['']`` for an empty review).
    """
    s = wordSplit(review, stop_words)

    # Too short for even one full shingle: emit the whole review as one token.
    if len(s) < k:
        return [' '.join(s)]

    # Slide a window of width k over the token list.
    return [' '.join(s[i:i + k]) for i in range(len(s) - k + 1)]

def bagOfWordsShingles2(review, k=3, stop_words = stopwords.words('english')):
    """Return every word n-gram of a review for n = k, k-1, ..., 1.

    Parameters
    ----------
    review : str
        Raw review text; tokenized with ``wordSplit``.
    k : int
        Largest n-gram size to generate.
    stop_words : iterable of str
        Passed through to ``wordSplit``. Defaults to NLTK's English
        stop-word list (corpus id is the lowercase 'english').

    Returns
    -------
    list of str
        Space-joined n-grams: all k-grams first (in document order), then
        all (k-1)-grams, and so on down to single tokens. Sizes longer than
        the review contribute nothing; the list is empty when ``k < 1`` or
        the review has no tokens.
    """
    s = wordSplit(review, stop_words)
    tokens = []
    # Longest n-grams first, matching the original output order.
    for n in range(k, 0, -1):
        for i in range(len(s) - n + 1):
            tokens.append(' '.join(s[i:i + n]))

    return tokens

In [4]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib axes.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predictions.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool
        When True, every row is divided by its sum before printing and
        plotting. Rows that sum to zero are left as zeros.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    """
    #rcParams['figure.figsize'] = 6, 6

    # Accept nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A class without any true samples would otherwise yield a NaN row.
        row_totals[row_totals == 0] = 1.0
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The 'd' format code is only valid for integer values; float matrices
    # (e.g. the result of confusion_matrix(...).astype(float)) need a float
    # format even when they are not normalized here.
    if normalize:
        fmt = '.2f'
    elif np.issubdtype(cm.dtype, np.integer):
        fmt = 'd'
    elif np.all(cm == np.round(cm)):
        fmt = '.0f'   # float dtype holding whole-number counts
    else:
        fmt = '.2f'   # float dtype holding fractions

    # Cells darker than the midpoint get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [5]:
def train_test_LR(featMap, trueLabels, testSize=.33):
    """Train and evaluate a logistic regression on a held-out split.

    Parameters
    ----------
    featMap : array-like or sparse matrix, shape (n_samples, n_features)
        Feature matrix.
    trueLabels : array-like, shape (n_samples,)
        Class labels.
    testSize : float
        Fraction of the samples held out for testing.

    Returns
    -------
    confMat : ndarray of float
        Confusion matrix on the test split (rows = true labels).
    normConfMat : ndarray of float
        ``confMat`` with each row divided by its row sum.
    score : float
        ``LogisticRegression.score`` on the test split.
    """
    train, test, train_labels, test_labels = train_test_split(
        featMap, trueLabels, test_size=testSize, random_state=42)

    # Learn the scaling on the training split only and apply that same
    # transform to the test split; fitting a second scaler on the test data
    # would put the two splits on different scales.
    scaler = MaxAbsScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    lreg = LogisticRegression(tol=0.001)
    lreg.fit(train, train_labels)
    test_predict = lreg.predict(test)

    confMat = confusion_matrix(test_labels, test_predict).astype(float)
    # Row-wise normalisation for any number of classes.
    normConfMat = confMat / confMat.sum(axis=1, keepdims=True)
    return confMat, normConfMat, lreg.score(test, test_labels)

In [6]:
def extractFeatures(englishReviews, wordThreshold = 0):
    """Build TF-IDF n-gram features and sentiment labels from a review table.

    Parameters
    ----------
    englishReviews : pandas.DataFrame
        Must provide the columns ``Positive_Review``, ``Negative_Review``,
        ``Positive_Processed_Word_Count`` and
        ``Negative_Processed_Word_Count``.
    wordThreshold : int
        A review text is kept only when its processed word count is strictly
        greater than this value.

    Returns
    -------
    tfidf : TfidfVectorizer
        The fitted vectorizer.
    featureMatrix : sparse matrix
        TF-IDF features; the rows for positive reviews come first, followed
        by the rows for negative reviews.
    labels : ndarray
        1.0 for each positive review, 0.0 for each negative review, aligned
        with the rows of ``featureMatrix``.
    """
    posMask = englishReviews['Positive_Processed_Word_Count'] > wordThreshold
    negMask = englishReviews['Negative_Processed_Word_Count'] > wordThreshold
    # Single .loc call (row mask + column) instead of chained indexing.
    thrPosEngRevs = englishReviews.loc[posMask, 'Positive_Review']
    thrNegEngRevs = englishReviews.loc[negMask, 'Negative_Review']

    allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
    labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])

    # NLTK's stop-word corpus id is the lowercase 'english'.
    tfidf = TfidfVectorizer(stop_words=stopwords.words('english'),tokenizer=bagOfWordsShingles2, lowercase=True)
    # fit_transform already fits the vectorizer; a separate .fit() beforehand
    # would tokenize the whole corpus a second time for the same result.
    featureMatrix = tfidf.fit_transform(allReviews)

    return tfidf, featureMatrix, labels

In [None]:
#englishReviews = pd.read_csv('UKReviewsWordCounted.csv')
# Load the full review table and keep only the rows whose reviewer
# nationality equals ' United Kingdom '.
allReviews = pd.read_csv('Hotel_Reviews.csv')
# NOTE(review): the comparison value carries a leading and a trailing space;
# presumably the CSV stores nationalities padded this way -- confirm.
englishReviews = allReviews.loc[allReviews['Reviewer_Nationality'] == ' United Kingdom ']

In [7]:


# Stack all positive review texts followed by all negative review texts;
# the label vector below relies on exactly this order.
thrPosEngRevs = englishReviews['Positive_Review']
thrNegEngRevs = englishReviews['Negative_Review']

allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])

# 1.0 marks a positive review, 0.0 a negative one.
labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])

# Fit the vectorizer and transform the corpus in a single pass. The fitted
# vectorizer stays available as `tfidf`; vectorizing a second time with a
# fresh TfidfVectorizer would only repeat the work for the same matrix.
# NLTK's stop-word corpus id is the lowercase 'english'.
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'),tokenizer=wordSplit, lowercase=True)
featureMatrix = tfidf.fit_transform(allReviews)

In [None]:
# Dim Reduce on whole dataset
# Don't do this: the selector is fitted on every row of featureMatrix, so
# rows that later end up in the test split influence which features are kept.
# Alternative selector (kept for reference; it used to be fitted here and
# was then immediately overwritten by the logistic-regression selector):
#selector = LinearSVC(tol=0.001).fit(featureMatrix, labels)
selector = LogisticRegression(tol = 0.001).fit(featureMatrix, labels)
# Keep the features whose importance exceeds 1.25x the mean importance.
model = SelectFromModel(selector, prefit=True, threshold='1.25*mean')

# Train & Test on SVM and LREG

In [18]:
train, test, train_labels, test_labels = train_test_split(featureMatrix,labels,test_size=.33,random_state=42)

# Feature selection is fitted on the training split only: an L1-penalised
# linear SVM supplies the importances, and features above twice the mean
# importance are kept.
selector = LinearSVC(tol=0.001, penalty="l1", dual=False).fit(train,train_labels)
#selector = LogisticRegression().fit(train,train_labels)
model = SelectFromModel(selector, prefit=True, threshold='2*mean')

train = model.transform(train)
test = model.transform(test)

# Formatted explicitly so the output is the same under Python 2 and 3.
print('%s %s' % (train.shape, test.shape))

### Until here, you have the dimensionality reduction
### From then on, do your ML algorithm

lsvc = LinearSVC().fit(train, train_labels)
lreg = LogisticRegression().fit(train,train_labels)

# Evaluate both classifiers the same way: predict on the test split and
# print the row-normalised confusion matrix (each row sums to 1).
for clf in (lsvc, lreg):
    test_predict = clf.predict(test)

    cm = confusion_matrix(test_labels,test_predict).astype(float)
    cm /= cm.sum(axis=1, keepdims=True)

    for c in cm:
        print(c)

(328629, 6679) (161863, 6679)
[ 0.94804551  0.05195449]
[ 0.06734666  0.93265334]
[ 0.95074734  0.04925266]
[ 0.06691551  0.93308449]


# SVM
SVM:
Normalized confusion matrix
[[ 0.94740104  0.05259896]
 [ 0.08559075  0.91440925]]
LREG:
Normalized confusion matrix
[[ 0.94666981  0.05333019]
 [ 0.08183352  0.91816648]]
DIM: 376 / 43330

# LREG
SVM:
Normalized confusion matrix
[[ 0.94790918  0.05209082]
 [ 0.06632421  0.93367579]]
LREG:
Normalized confusion matrix
[[ 0.9512183   0.0487817 ]
 [ 0.06607783  0.93392217]]
DIM: 9493 / 43330