In [1]:
import pandas as pd
import numpy as np
import tflearn 
from tflearn.data_utils import to_categorical, pad_sequences

In [2]:
# Load ign.csv (path relative to the notebook's working directory) into a
# DataFrame and preview the first rows.
df = pd.read_csv('ign.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11


In [3]:
# How many reviews carry each score phrase (most frequent first).
df['score_phrase'].value_counts()

Great          4773
Good           4741
Okay           2945
Mediocre       1959
Amazing        1804
Bad            1269
Awful           664
Painful         340
Unbearable       72
Masterpiece      55
Disaster          3
Name: score_phrase, dtype: int64

In [4]:
# Score phrases treated as a positive review when building the binary label.
positive = [
    'Amazing',
    'Great',
    'Good',
    'Okay',
    'Masterpiece',
]
# Score phrases treated as a negative review.
negative = [
    'Mediocre',
    'Bad',
    'Awful',
    'Painful',
    'Unbearable',
    'Disaster',
]

In [5]:
# Keep only the title and its score phrase, then add a binary polarity label:
# 1 when the phrase is listed in `positive`, 0 otherwise.
senti_title = df.loc[:, ['title', 'score_phrase']].copy()
senti_title['senti_polar'] = senti_title['score_phrase'].map(
    lambda phrase: int(phrase in positive))

In [6]:
# Count the number of unique words, so we can code them
import string

# Characters deleted from every title before it is split into words:
# ASCII punctuation plus '\xc2\xb0' (the UTF-8 byte pair of the degree sign
# when this literal is a Python 2 byte string).
remove = string.punctuation + '\xc2\xb0'


def _strip_chars(text, chars):
    """Return `text` with every character that occurs in `chars` deleted.

    Byte strings use the two-argument `translate(None, chars)` form. Text
    strings (Python 3 `str`, Python 2 `unicode`) reject that form with a
    TypeError, so they get a code point -> None deletion table instead.
    """
    if isinstance(text, bytes):
        return text.translate(None, chars)
    return text.translate({ord(ch): None for ch in chars})


# Vocabulary = every distinct lower-cased, punctuation-free token in any title.
words = set()
for title in senti_title['title']:
    words.update(_strip_chars(title.lower(), remove).split())
print(len(words))

8246


In [7]:
# Sort the vocabulary so each word gets a stable position, and record its
# size (passed to the embedding layers as `input_dim`).
words_list = sorted(words)
wors_num = len(words_list)

In [8]:
# Vocabulary lookup tables: word -> id (its position in the sorted word list,
# starting at 0) and the reverse id -> word mapping.
# NOTE: id 0 is therefore a real word, while the pad_sequences calls further
# down also pad with 0, so padding and that word share an id.
word2ind = dict(zip(words_list, range(len(words_list))))
ind2word = dict(enumerate(words_list))

# Score-phrase lookup tables. `sentiment` lists the positive phrases first,
# so ids 0-4 are positive and 5-10 are negative.
sentiment = positive + negative
sent2ind = dict((label, code) for code, label in enumerate(sentiment))
ind2sent = dict((code, label) for label, code in sent2ind.items())

In [9]:
def encode(string):
    """Convert a title into the list of vocabulary ids of its words.

    The title is lower-cased, every character in the module-level `remove`
    string is deleted, and the remainder is split on whitespace.

    Both byte strings and text strings are accepted. The two-argument
    `translate(None, chars)` form only exists for byte strings and raises
    TypeError on text strings (Python 3 `str`, Python 2 `unicode`), so text
    strings are cleaned with a code point -> None deletion table instead.

    Parameters
    ----------
    string : str
        Raw title text.

    Returns
    -------
    list of int
        One `word2ind` id per word, in title order. Empty for a title that
        contains no words after cleaning.

    Raises
    ------
    KeyError
        If a word of the title is not a key of `word2ind`.
    """
    lowered = string.lower()
    if isinstance(lowered, bytes):
        cleaned = lowered.translate(None, remove)
    else:
        cleaned = lowered.translate({ord(ch): None for ch in remove})
    return [word2ind[word] for word in cleaned.split()]

In [10]:
# Sanity check: encode one known title and inspect the resulting word ids.
encode('Splice: Tree of Life')

[6789, 7470, 5118, 4255]

In [11]:
# Numeric version of the frame: score phrases are replaced by their
# `sent2ind` ids and titles by their lists of word ids. `senti_title` itself
# is left unchanged.
data = senti_title.assign(
    score_phrase=senti_title['score_phrase'].replace(sent2ind),
    title=senti_title['title'].apply(encode)
)

In [12]:
# Show the id -> score phrase mapping used to decode predictions.
ind2sent

{0: 'Amazing',
 1: 'Great',
 2: 'Good',
 3: 'Okay',
 4: 'Masterpiece',
 5: 'Mediocre',
 6: 'Bad',
 7: 'Awful',
 8: 'Painful',
 9: 'Unbearable',
 10: 'Disaster'}

In [13]:
# Preview the encoded frame: titles as word-id lists, phrases as class ids.
data.head()

Unnamed: 0,title,score_phrase,senti_polar
0,"[4288, 5671, 7800]",0,1
1,"[4288, 5671, 7800, 4491, 7030, 3484, 2414]",0,1
2,"[6789, 7470, 5118, 4255]",1,1
3,"[4979, 33]",1,1
4,"[4979, 33]",1,1


In [14]:
# Get the max length of title (number of word ids in the longest encoded
# title); it is used below as the padded sequence length and input width.
max_len = data['title'].map(len).max()
print(max_len)
# NOTE(review): `maxlen` is not referenced by any cell visible here - the
# padding and network cells all pass `max_len`. Confirm before removing.
maxlen = 20

16


In [15]:
## Split Training and test dataset
from sklearn.model_selection import train_test_split
from tflearn.data_utils import to_categorical, pad_sequences

# Hold out 5% of the rows for testing. The fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.05, random_state=42)

In [16]:
# Pull the features and both label variants out of each split:
#   X_*     encoded titles (lists of word ids)
#   Y_*     score-phrase class id
#   Y_*_bl  binary polarity label
X_train, Y_train, Y_train_bl = train['title'], train['score_phrase'], train['senti_polar']
X_test, Y_test, Y_test_bl = test['title'], test['score_phrase'], test['senti_polar']

In [17]:
# Build the model inputs directly from the `train` / `test` frames rather than
# overwriting X_* / Y_* with transformed versions of themselves. The results
# are the same on a first run, and re-running the cell no longer pads padded
# arrays or one-hot encodes labels that are already one-hot.

# Pad every encoded title with 0 up to `max_len` ids.
# NOTE: word ids also start at 0, so padding shares its id with one real word.
X_train = pad_sequences(train['title'], maxlen=max_len, value=0.)
X_test = pad_sequences(test['title'], maxlen=max_len, value=0.)

# One-hot targets for the 11 score phrases.
Y_train = to_categorical(train['score_phrase'], nb_classes=11)
Y_test = to_categorical(test['score_phrase'], nb_classes=11)

# One-hot targets for the binary polarity label.
Y_train_bl = to_categorical(train['senti_polar'], nb_classes=2)
Y_test_bl = to_categorical(test['senti_polar'], nb_classes=2)

### Baseline Model
Use the game title to predict the sentiment of the review, i.e. its score phrase.
For the baseline, we only predict the binary polarity (positive vs. negative).

In [247]:
# Network Building: baseline binary (polarity) classifier.
# Word ids -> 128-d embedding -> LSTM (128 units) -> 2-way softmax, trained
# with Adam on categorical cross-entropy.
net = tflearn.input_data([None, max_len]) # one row of max_len word ids per title (16 for this data)
net = tflearn.embedding(net, input_dim=wors_num, output_dim=128)
# NOTE(review): presumably tflearn treats `dropout` here as a keep
# probability rather than a drop rate - confirm 0.8 is the intended value.
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                        loss='categorical_crossentropy')

In [248]:
# Training
# Fit the baseline network on the binary polarity labels. Neither n_epoch nor
# validation_set is passed, so tflearn's defaults apply.
# NOTE(review): the recorded run of this cell ended with
# "IndexError: list index out of range", and the execution counts of the
# surrounding cells are out of order. The cause cannot be determined from the
# notebook alone - TODO: re-run from a fresh kernel (Restart & Run All).
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, Y_train_bl, show_metric=True,
          batch_size=32)

IndexError: list index out of range

In [246]:
# Fraction of titles labelled positive, i.e. the accuracy of always
# predicting "positive" - the number the baseline model has to beat.
data['senti_polar'].mean()

0.76875167785234899

In [205]:
# Score the baseline model on the held-out split (binary polarity labels).
model.evaluate(X_test, Y_test_bl, batch_size=32)

[0.79679144799390578]

## Extra credit: multi-class prediction


In [18]:
# Network Building 
net_mc = tflearn.input_data([None, max_len]) # maxlen 16
net_mc = tflearn.embedding(net_mc, input_dim=wors_num, output_dim=128)
net_mc = tflearn.lstm(net_mc, 128, dropout=0.8)
net_mc = tflearn.fully_connected(net_mc, 11, activation='softmax')
net_mc = tflearn.regression(net_mc, optimizer='adam', learning_rate=0.001,
                        loss='categorical_crossentropy')

In [19]:
# Train the 11-class model. The held-out test split is passed as the
# validation set, so the val_loss / val_acc reported during training are
# computed on the same rows that are evaluated afterwards.
model_mc = tflearn.DNN(net_mc, tensorboard_verbose=0)
model_mc.fit(X_train, Y_train, validation_set=(X_test, Y_test),show_metric=True,
          batch_size=64)

Training Step: 2769  | total loss: [1m[32m0.98922[0m[0m | time: 7.260s
| Adam | epoch: 010 | loss: 0.98922 - acc: 0.6724 -- iter: 17664/17693
Training Step: 2770  | total loss: [1m[32m0.98678[0m[0m | time: 8.289s
| Adam | epoch: 010 | loss: 0.98678 - acc: 0.6708 | val_loss: 1.90089 - val_acc: 0.3916 -- iter: 17693/17693
--


In [20]:
# Score the 11-class model on the held-out split.
model_mc.evaluate(X_test, Y_test, batch_size=64)

[0.3916309010317397]

In [54]:
# Predict the score phrase of a single title. The title text is run through
# `encode` instead of pasting its word ids by hand (the hard-coded ids were
# those of this same title, row 502 of `data`), so the cell stays correct if
# the vocabulary - and therefore the ids - ever changes.
sample_title = 'Kileak: The DNA Imperative'
predict = model_mc.predict(
    pad_sequences([encode(sample_title)], maxlen=max_len, value=0.))[0]
# Position of the largest class probability = predicted score-phrase id.
index = np.argmax(predict, axis=0)

In [55]:
# Decode the predicted class id back into its score phrase.
ind2sent[index]

'Awful'

In [39]:
# Show what a single encoded title looks like after padding to max_len.
pad_sequences([[4053, 7242, 2188, 3699]], maxlen=max_len, value=0.)

array([[4053, 7242, 2188, 3699,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]], dtype=int32)

In [32]:
# Rows 500-504 (by position) of the text frame, for comparison with `data`.
senti_title.iloc[500:505]

Unnamed: 0,title,score_phrase,senti_polar
500,NHL FaceOff,Amazing,1
501,NFL GameDay,Great,1
502,Kileak: The DNA Imperative,Awful,0
503,Raven Project,Awful,0
504,Silverload,Okay,1


In [34]:
# The same rows (by position) in encoded form.
data.iloc[500:505]

Unnamed: 0,title,score_phrase,senti_polar
500,"[4979, 2663]",0,1
501,"[4977, 3033]",1,1
502,"[4053, 7242, 2188, 3699]",7,0
503,"[5836, 5649]",7,0
504,[6504],3,1
