### Libraries

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
import re
from bs4 import BeautifulSoup
from scipy.sparse import hstack
from gensim.models import Doc2Vec, Word2Vec
from KaggleWord2VecUtility import KaggleWord2VecUtility



### Data Import

In [4]:
train = pd.read_csv("train.csv", header=0, sep=",", encoding="latin_1")
test =  pd.read_csv("test.csv", header=0, sep=",", encoding="latin_1")

In [5]:
print(train.head())
print(test.head())

     id   rating                                             review
0  3253     poor  She treats students like they're kids & is ver...
1  2968  awesome  I have graduated from UofT over a year ago, bu...
2  1138     good  He may not use big words all the time, but he ...
3   209     good  Dr. Molumby was one of those professors you et...
4  3418  awesome  She's a little stress because of the class loa...
     id                                             review
0  2437  Fun lecturer, tho what he teaches is by far th...
1  3470  She is a very kind professor. Her lecture is e...
2  3867  Professor Bauman is one of the most inspiratio...
3  1784  the class is multiple choice but he just wants...
4   594  Mr. Everett is a wonderful instructor. He has ...


#### 5 Classification Labels: {awesom, good, average, poor, awful}
#### Train data: records
#### Test data: records

### Data Manipulation

Removing numbers from reviews

In [150]:
new_list = []
for word in train.review:
    new_list.append(re.sub("[^a-zA-Z]", " ", word))
train.review = new_list
new_list = []
for word in test.review:
    new_list.append(re.sub("[^a-zA-Z]", " ", word))
test.review = new_list

In [8]:
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(review)))

clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(review)))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


In [9]:
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii')

In [10]:
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [18]:
# dependent variable
#y = df.rating
y = train.rating

In [14]:
# Convert from reviews to features
#X = vectorizer.fit_transform(df.review)
X = vectorizer.fit_transform(clean_train_reviews)

In [24]:
test_data_features = vectorizer.transform(clean_test_reviews)

In [309]:
print(y.shape)
print(X.shape)

(3079,)
(3079, 5436)


In [15]:
vocab = vectorizer.get_feature_names()
print(vocab)



In [107]:
# To print the count of our vocab
dist = np.sum(X, axis=0)
# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print (count,tag)

[[ 0.32339125  0.26927859  0.57089978 ...,  0.30133727  0.20439133
   0.29907638]] abc


In [None]:
# Test Train split as usual
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

## Logistic Regression

In [28]:
clf = LogisticRegression(class_weight="balanced", solver="newton-cg", tol = 1e-3)
clf.fit(X, y)
predictions = clf.predict(test_data_features)
output = pd.DataFrame( data={"id":test["id"], "rating":predictions} )
output.to_csv( "regression.csv", index=False, quoting=3 )

## Random Forest
Now using random forest classifier


In [294]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 110)
forest = forest.fit( X, y )

In [296]:
##Prediction:
predictions = forest.predict(test_data_features)
output = pd.DataFrame( data={"id":test["id"], "rating":predictions} )

In [108]:
#test = pd.read_csv("test.csv", header=0, sep=",", encoding="latin_1")
print (test.shape)

(1320, 2)


In [295]:
test_data_features = vectorizer.transform(test.review)
test_data_features = test_data_features.toarray()

In [297]:
## Writing to csv
output.to_csv( "randomforest.csv", index=False, quoting=3 )

# Support Vector Machines

kernel=linear Score: 0.531

In [311]:
clf = svm.SVC(kernel='linear', C = 1.0)
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [312]:
clf.score(X, y)

0.82137057486196818

In [313]:
test_data_features = vectorizer.transform(test.review)
test_data_features = test_data_features.toarray()
predictions = clf.predict(test_data_features)

In [314]:
output = pd.DataFrame(data={"id":test["id"], "rating":predictions})
output.to_csv("svm.csv", index=False, quoting=3)

## Ensemble BOW and Word2Vec

In [None]:
print("Combing the bag of words and the w2v vectors...\n")
train_bwv = hstack([train_bow, train_w2v])
test_bwv = hstack([test_bow, test_w2v])

# Word Vectors

In [1]:
import nltk.data
#import pattern

ModuleNotFoundError: No module named 'nltk.data'

In [186]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [192]:
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    #review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [193]:
# Split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [194]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)
##
#print("Parsing sentences from unlabeled set")
#for review in unlabeled_train["review"]:
#    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


In [196]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

In [197]:
# Setting values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words


In [None]:
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

In [201]:
model_name = "300features_40count_10context"
model.save(model_name)

2017-11-25 01:27:07,872 : INFO : saving Word2Vec object under 300features_40count_10context, separately None
2017-11-25 01:27:07,872 : INFO : not storing attribute syn0norm
2017-11-25 01:27:07,888 : INFO : not storing attribute cum_table
2017-11-25 01:27:07,988 : INFO : saved 300features_40count_10context


In [202]:
logging.disabled = True

In [212]:
# Testing the most_similar
word = "best"
try:
    print(model.most_similar(word))
except:
    print(word, "not in the vocab")

[('ever', 0.9976127743721008), ('had', 0.9949758052825928), ('one', 0.9927864074707031), ("i've", 0.9892295002937317), ('professor', 0.9876281023025513), ('worst', 0.9864251017570496), ('my', 0.9834612607955933), ('teacher', 0.9754388928413391), ('i', 0.9700353741645813), ('had.', 0.9571642875671387)]


In [221]:
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40count_10context")
type(model.wv.syn0)

2017-11-25 01:46:36,083 : INFO : loading Word2Vec object from 300features_40count_10context
2017-11-25 01:46:36,136 : INFO : loading wv recursively from 300features_40count_10context.wv.* with mmap=None
2017-11-25 01:46:36,136 : INFO : setting ignored attribute syn0norm to None
2017-11-25 01:46:36,152 : INFO : setting ignored attribute cum_table to None
2017-11-25 01:46:36,152 : INFO : loaded 300features_40count_10context


numpy.ndarray

In [222]:
model.wv.syn0.shape

(381, 300)

In [None]:
model["guy"]

In [231]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [239]:
def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
        
       #
       # Print a status message every 1000th review
       #if counter%1000. == 0.:
       #    print("Review %d of %d") % (counter, len(reviews))
       # 
       # Call the function (defined above) that makes average feature vectors
        print("Counter:", counter)
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
           num_features)
       #
       # Increment the counter
        counter = counter + 1
    return reviewFeatureVecs

Now, we can call these functions to create average vectors for each paragraph

In [240]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.

clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

print("Creating average feature vecs for test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Counter: 0
Counter: 1
Counter: 2
Counter: 3
Counter: 4
Counter: 5
Counter: 6
Counter: 7
Counter: 8
Counter: 9
Counter: 10
Counter: 11
Counter: 12
Counter: 13
Counter: 14
Counter: 15
Counter: 16
Counter: 17
Counter: 18
Counter: 19
Counter: 20
Counter: 21
Counter: 22
Counter: 23
Counter: 24
Counter: 25
Counter: 26
Counter: 27
Counter: 28
Counter: 29
Counter: 30
Counter: 31
Counter: 32
Counter: 33
Counter: 34
Counter: 35




Counter: 36
Counter: 37
Counter: 38
Counter: 39
Counter: 40
Counter: 41
Counter: 42
Counter: 43
Counter: 44
Counter: 45
Counter: 46
Counter: 47
Counter: 48
Counter: 49
Counter: 50
Counter: 51
Counter: 52
Counter: 53
Counter: 54
Counter: 55
Counter: 56
Counter: 57
Counter: 58
Counter: 59
Counter: 60
Counter: 61
Counter: 62
Counter: 63
Counter: 64
Counter: 65
Counter: 66
Counter: 67
Counter: 68
Counter: 69
Counter: 70
Counter: 71
Counter: 72
Counter: 73
Counter: 74
Counter: 75
Counter: 76
Counter: 77
Counter: 78
Counter: 79
Counter: 80
Counter: 81
Counter: 82
Counter: 83
Counter: 84
Counter: 85
Counter: 86
Counter: 87
Counter: 88
Counter: 89
Counter: 90
Counter: 91
Counter: 92
Counter: 93
Counter: 94
Counter: 95
Counter: 96
Counter: 97
Counter: 98
Counter: 99
Counter: 100
Counter: 101
Counter: 102
Counter: 103
Counter: 104
Counter: 105
Counter: 106
Counter: 107
Counter: 108
Counter: 109
Counter: 110
Counter: 111
Counter: 112
Counter: 113
Counter: 114
Counter: 115
Counter: 116
Counter: 11

Counter: 1131
Counter: 1132
Counter: 1133
Counter: 1134
Counter: 1135
Counter: 1136
Counter: 1137
Counter: 1138
Counter: 1139
Counter: 1140
Counter: 1141
Counter: 1142
Counter: 1143
Counter: 1144
Counter: 1145
Counter: 1146
Counter: 1147
Counter: 1148
Counter: 1149
Counter: 1150
Counter: 1151
Counter: 1152
Counter: 1153
Counter: 1154
Counter: 1155
Counter: 1156
Counter: 1157
Counter: 1158
Counter: 1159
Counter: 1160
Counter: 1161
Counter: 1162
Counter: 1163
Counter: 1164
Counter: 1165
Counter: 1166
Counter: 1167
Counter: 1168
Counter: 1169
Counter: 1170
Counter: 1171
Counter: 1172
Counter: 1173
Counter: 1174
Counter: 1175
Counter: 1176
Counter: 1177
Counter: 1178
Counter: 1179
Counter: 1180
Counter: 1181
Counter: 1182
Counter: 1183
Counter: 1184
Counter: 1185
Counter: 1186
Counter: 1187
Counter: 1188
Counter: 1189
Counter: 1190
Counter: 1191
Counter: 1192
Counter: 1193
Counter: 1194
Counter: 1195
Counter: 1196
Counter: 1197
Counter: 1198
Counter: 1199
Counter: 1200
Counter: 1201
Counte

Counter: 2089
Counter: 2090
Counter: 2091
Counter: 2092
Counter: 2093
Counter: 2094
Counter: 2095
Counter: 2096
Counter: 2097
Counter: 2098
Counter: 2099
Counter: 2100
Counter: 2101
Counter: 2102
Counter: 2103
Counter: 2104
Counter: 2105
Counter: 2106
Counter: 2107
Counter: 2108
Counter: 2109
Counter: 2110
Counter: 2111
Counter: 2112
Counter: 2113
Counter: 2114
Counter: 2115
Counter: 2116
Counter: 2117
Counter: 2118
Counter: 2119
Counter: 2120
Counter: 2121
Counter: 2122
Counter: 2123
Counter: 2124
Counter: 2125
Counter: 2126
Counter: 2127
Counter: 2128
Counter: 2129
Counter: 2130
Counter: 2131
Counter: 2132
Counter: 2133
Counter: 2134
Counter: 2135
Counter: 2136
Counter: 2137
Counter: 2138
Counter: 2139
Counter: 2140
Counter: 2141
Counter: 2142
Counter: 2143
Counter: 2144
Counter: 2145
Counter: 2146
Counter: 2147
Counter: 2148
Counter: 2149
Counter: 2150
Counter: 2151
Counter: 2152
Counter: 2153
Counter: 2154
Counter: 2155
Counter: 2156
Counter: 2157
Counter: 2158
Counter: 2159
Counte

Counter: 2906
Counter: 2907
Counter: 2908
Counter: 2909
Counter: 2910
Counter: 2911
Counter: 2912
Counter: 2913
Counter: 2914
Counter: 2915
Counter: 2916
Counter: 2917
Counter: 2918
Counter: 2919
Counter: 2920
Counter: 2921
Counter: 2922
Counter: 2923
Counter: 2924
Counter: 2925
Counter: 2926
Counter: 2927
Counter: 2928
Counter: 2929
Counter: 2930
Counter: 2931
Counter: 2932
Counter: 2933
Counter: 2934
Counter: 2935
Counter: 2936
Counter: 2937
Counter: 2938
Counter: 2939
Counter: 2940
Counter: 2941
Counter: 2942
Counter: 2943
Counter: 2944
Counter: 2945
Counter: 2946
Counter: 2947
Counter: 2948
Counter: 2949
Counter: 2950
Counter: 2951
Counter: 2952
Counter: 2953
Counter: 2954
Counter: 2955
Counter: 2956
Counter: 2957
Counter: 2958
Counter: 2959
Counter: 2960
Counter: 2961
Counter: 2962
Counter: 2963
Counter: 2964
Counter: 2965
Counter: 2966
Counter: 2967
Counter: 2968
Counter: 2969
Counter: 2970
Counter: 2971
Counter: 2972
Counter: 2973
Counter: 2974
Counter: 2975
Counter: 2976
Counte

Counter: 556
Counter: 557
Counter: 558
Counter: 559
Counter: 560
Counter: 561
Counter: 562
Counter: 563
Counter: 564
Counter: 565
Counter: 566
Counter: 567
Counter: 568
Counter: 569
Counter: 570
Counter: 571
Counter: 572
Counter: 573
Counter: 574
Counter: 575
Counter: 576
Counter: 577
Counter: 578
Counter: 579
Counter: 580
Counter: 581
Counter: 582
Counter: 583
Counter: 584
Counter: 585
Counter: 586
Counter: 587
Counter: 588
Counter: 589
Counter: 590
Counter: 591
Counter: 592
Counter: 593
Counter: 594
Counter: 595
Counter: 596
Counter: 597
Counter: 598
Counter: 599
Counter: 600
Counter: 601
Counter: 602
Counter: 603
Counter: 604
Counter: 605
Counter: 606
Counter: 607
Counter: 608
Counter: 609
Counter: 610
Counter: 611
Counter: 612
Counter: 613
Counter: 614
Counter: 615
Counter: 616
Counter: 617
Counter: 618
Counter: 619
Counter: 620
Counter: 621
Counter: 622
Counter: 623
Counter: 624
Counter: 625
Counter: 626
Counter: 627
Counter: 628
Counter: 629
Counter: 630
Counter: 631
Counter: 632

## Running Random Forest for Word2Vec
### Score: 0.43863

In [266]:
#trainDataVecs.shape
#testDataVecs.shape
#np.any(np.isnan(trainDataVecs))
from sklearn.preprocessing import Imputer
trainDataVecs = Imputer().fit_transform(trainDataVecs)
testDataVecs = Imputer().fit_transform(testDataVecs)

In [267]:
#np.isfinite(testDataVecs).all()

True

In [268]:
print(trainDataVecs[:,0].shape)
index = 0
for i in trainDataVecs[:,0]:
    if not np.isfinite(i):
        print(index, i)
    index +=1

(3079,)


In [269]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier( n_estimators = 100 )

print("Fitting a random forest to labeled training data...")
forest = forest.fit( trainDataVecs, train["rating"] )

# Test & extract results 
result = forest.predict( testDataVecs )

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "rating":result} )
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )

Fitting a random forest to labeled training data...


## SVM with Word Vectors
Score: 0.3827

In [273]:
clf = svm.SVC(C = 1.0)
clf.fit(trainDataVecs, train["rating"])
predictions = clf.predict(testDataVecs)
output = pd.DataFrame(data={"id":test["id"], "rating":predictions})

                          

In [272]:
output.to_csv("Word2Vec_svm.csv", index=False, quoting=3)

## K-Means with Word Vectors
Score: 0.4500

In [275]:
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.wv.syn0
num_clusters = int(word_vectors.shape[0] / 5)

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  2.574972629547119 seconds.


In [276]:
# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number                                                                                            
word_centroid_map = dict(zip( model.wv.index2word, idx ))

In [281]:
# For the first 10 clusters
for cluster in range(0,10):
    #
    # Print the cluster number  
    print("\nCluster %d" % cluster)
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    for i in range(0,len(word_centroid_map.values())):
        if( list(word_centroid_map.values())[i] == cluster ):
            words.append(list(word_centroid_map.keys())[i])
    print(words)


Cluster 0
['class']

Cluster 1
['on', 'gives', 'questions', 'and']

Cluster 2
['one', 'my', 'worst']

Cluster 3
['times', 'course.', 'two', 'real', 'thing', 'why', 'learning', 'see', 'another', 'class!', 'doing', 'life', 'accounting', 'tough']

Cluster 4
['need', 'go', "you'll", 'read']

Cluster 5
['that', '-', 'we', 'any', '3', '&', 'into']

Cluster 6
['great', 'professor.']

Cluster 7
['easy,', 'super', 'everyone', '4', 'lectures.', 'lot', 'clear', 'knows', 'anything', 'teaches', 'straight', 'hard.', 'nothing', 'sometimes', 'hours']

Cluster 8
['hard,', 'difficult', 'homework', 'exams.']

Cluster 9
['comments']


In [282]:
def create_bag_of_centroids( wordlist, word_centroid_map ):
    #
    # The number of clusters is equal to the highest cluster index
    # in the word / centroid map
    num_centroids = max( word_centroid_map.values() ) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # Loop over the words in the review. If the word is in the vocabulary,
    # find which cluster it belongs to, and increment that cluster count 
    # by one
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

In [283]:
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros( (train["review"].size, num_clusters), \
    dtype="float32" )

# Transform the training set reviews into bags of centroids
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids( review, \
        word_centroid_map )
    counter += 1

# Repeat for test reviews 
test_centroids = np.zeros(( test["review"].size, num_clusters), \
    dtype="float32" )

counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids( review, \
        word_centroid_map )
    counter += 1

In [285]:
# Fit a random forest and extract predictions 
forest = RandomForestClassifier(n_estimators = 100)

# Fitting the forest may take a few minutes
print("Fitting a random forest to labeled training data...")
forest = forest.fit(train_centroids,train["rating"])
result = forest.predict(test_centroids)

# Write the test results 
output = pd.DataFrame(data={"id":test["id"], "rating":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )

Fitting a random forest to labeled training data...


# Naive Bayes
Naive bayes classifier: Score: 0.41 something

In [20]:
clf = naive_bayes.MultinomialNB()
clf.fit(X, y)
reviews = np.array(test.review)
reviews_vector = vectorizer.transform(reviews)
predictions = clf.predict(reviews_vector)

In [None]:
#ROC accuracy of the model
#roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

In [73]:
tf = pd.read_csv("test.csv", sep=',', encoding="latin_1")

In [76]:
reviews = np.array(tf.review)
reviews_vector = vectorizer.transform(reviews)
predictions = clf.predict(reviews_vector)
#print(reviews)
#print(reviews_vector)

In [22]:
# writing it to csv
output = pd.DataFrame(data={"id":test.id, "rating":predictions})
output.to_csv( "new_naive.csv", index=False, quoting=3 )
#pred = pd.DataFrame(predictions, columns=["rating"]).to_csv("24nov.csv", index=False)
#predictions.to_csv("submission.csv", columns=["id","rating"], index_lable=tf.id)

In [None]:
review_array = np.array(["Most disgusting and vile math instructor at Umass. His classes are a complete waste of time. Avoid like the plague. The worst of the worst."])
review_vector = vectorizer.transform(review_array)
print(clf.predict(review_vector))