# CNN in Keras with pretrained word2vec weights
Reference notebook: https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights/notebook

In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd

In [2]:
# Helper function
def writeProgress(msg, count, total):
    """Write an in-place progress line to stdout.

    The line is ``msg`` followed by ``count / total`` formatted as a
    percentage with two decimals, and ends with a carriage return so the
    next call overwrites it.

    Args:
        msg: Text printed in front of the percentage.
        count: Number of items processed so far.
        total: Total number of items; when it is 0 the progress is
            reported as 0.00% instead of raising ZeroDivisionError.
    """
    # An empty workload has no meaningful ratio, so fall back to 0.
    fraction = count / total if total else 0.0
    sys.stdout.write(msg + "{:.2%}\r".format(fraction))
    sys.stdout.flush()
    
def newPath(path):
    """Create the directory ``path`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate isdir() check, which could race
    # with another process creating the same directory.
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    Args:
        src_path: Path of the JSON file to read.

    Returns:
        The decoded Python object (dict, list, ...).
    """
    # Read as UTF-8 explicitly: the platform default encoding is not UTF-8
    # everywhere and would garble or reject non-ASCII text.
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data, dst_path):
    """Serialize ``data`` as JSON into the file at ``dst_path``.

    An existing file at ``dst_path`` is overwritten.
    """
    with open(dst_path, 'w') as sink:
        json.dump(data, sink)

In [3]:
# Ordered list of genre names; reused further down as the column order of
# the prediction and evaluation tables.
columns = read_json('../orderedListGenres.json')
print(len(columns), columns)

20 ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


# Load data

In [4]:
# Label lookup, indexed by username further down (`data[id]`).
# NOTE(review): presumably one multi-hot genre vector per username - confirm.
data = read_json('./input/mergeGenresMat.json')
print(len(data))

1631


In [5]:
# Load the training posts, drop rows with missing values and renumber the
# remaining rows from 0.
train_data_IG = (
    pd.read_csv('./stopword/train_IG.csv')
    .dropna()
    .reset_index(drop=True)
)
train_data_IG

Unnamed: 0,username,convert_text,genres
0,21bridgesmovie,chadwickboseman bringing bridges alma mater ho...,"['Action', 'Crime', 'Drama']"
1,21bridgesmovie,politics jk simmons captain mckenna bridges th...,"['Action', 'Crime', 'Drama']"
2,21bridgesmovie,thrilling score music bridges composed henry j...,"['Action', 'Crime', 'Drama']"
3,21bridgesmovie,experience manhunt century see bridges playing...,"['Action', 'Crime', 'Drama']"
4,21bridgesmovie,discover truth bridges starring chadwickbosema...,"['Action', 'Crime', 'Drama']"
...,...,...,...
30636,zombieland,got ta look real close one use zombieland skil...,"['Action', 'Comedy', 'Horror']"
30637,zombieland,actually lit zombieland artist dinotomic,"['Action', 'Comedy', 'Horror']"
30638,zombieland,horror comedy cast else could ask zombieland p...,"['Action', 'Comedy', 'Horror']"
30639,zombieland,said like actually said wrong zombieland playing,"['Action', 'Comedy', 'Horror']"


In [7]:
# Test set: one text row per username (no `genres` column in this file).
test_data=pd.read_csv('./stopword/test_imdb.csv')
test_data

Unnamed: 0,username,convert_text
0,21bridgesmovie,embattled nypd detective thrust citywide manhu...
1,47metersdown,four teen girls diving ruined underwater city ...
2,abeautifuldaymovie,based true story reallife friendship fred roge...
3,abominablemovie,three teenagers must help yeti return family a...
4,adastramovie,astronaut roy mcbride undertakes mission acros...
...,...,...
160,wrinklestheclown,florida parents hire wrinkles clown scare misb...
161,xmenmovies,jean grey begins develop incredible powers cor...
162,yardiefilm,british crime drama film directed idris elba b...
163,yesterdaymovie,struggling musician realizes person earth reme...


In [8]:
# Stack the training and test rows into one frame with a fresh 0..n-1 index;
# the test rows have no `genres` value, so that column is NaN for them.
df = pd.concat([train_data_IG, test_data],ignore_index=True)
df

Unnamed: 0,username,convert_text,genres
0,21bridgesmovie,chadwickboseman bringing bridges alma mater ho...,"['Action', 'Crime', 'Drama']"
1,21bridgesmovie,politics jk simmons captain mckenna bridges th...,"['Action', 'Crime', 'Drama']"
2,21bridgesmovie,thrilling score music bridges composed henry j...,"['Action', 'Crime', 'Drama']"
3,21bridgesmovie,experience manhunt century see bridges playing...,"['Action', 'Crime', 'Drama']"
4,21bridgesmovie,discover truth bridges starring chadwickbosema...,"['Action', 'Crime', 'Drama']"
...,...,...,...
30801,wrinklestheclown,florida parents hire wrinkles clown scare misb...,
30802,xmenmovies,jean grey begins develop incredible powers cor...,
30803,yardiefilm,british crime drama film directed idris elba b...,
30804,yesterdaymovie,struggling musician realizes person earth reme...,


In [16]:
# Merge every text row of the same username into one space-separated
# document (one row per username) and keep a copy on disk.
concat = df.groupby(['username'])['convert_text'].apply(' '.join).reset_index()
concat.to_csv('./concatUsername.csv')
concat

Unnamed: 0,username,convert_text
0,21bridgesmovie,chadwickboseman bringing bridges alma mater ho...
1,47metersdown,sharks hungry meters uncaged hits theaters aug...
2,abeautifuldaymovie,two weeks take trip back neighborhood beautifu...
3,abominablemovie,abominable movie loved one wish list get digit...
4,adastramovie,photographer stephenwilkes photo shoot natgeo ...
...,...,...
160,wrinklestheclown,tag someone deserves visit wrinkles wrinkles c...
161,xmenmovies,darkphoenix fanartfriday fameart xmen director...
162,yardiefilm,shoutout amlameenbaby sbtvonline pull yardie y...
163,yesterdaymovie,feelgood movie summer yesterday movie theaters...


In [17]:
print(concat.shape)
print(concat.isnull().sum())

(165, 2)
username        0
convert_text    0
dtype: int64


In [22]:
# Parallel lists: ids[k] is the username whose merged document is texts[k].
ids = concat['username'].tolist()
texts = concat['convert_text'].tolist()
print(len(ids), len(texts))

165 165


In [19]:
# Look up the label entry of every username, in the same order as `ids`.
labels = [data[username] for username in ids]
len(labels)

165

# Tokenize

In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [23]:
# Vocabulary cap passed to the tokenizer; the embedding cell further down
# also skips every word index >= NUM_WORDS.
NUM_WORDS=30000
# `filters` lists the characters stripped from the text before splitting;
# lower=True lower-cases the text.
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='—!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                      lower=True)

# Build the vocabulary from the merged per-username documents.
tokenizer.fit_on_texts(texts)

# word_index holds every token seen while fitting, which can exceed NUM_WORDS.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 30968 unique tokens.


In [46]:
# Row of `concat` to inspect by hand in the next cells.
idx = 127
concat.iloc[idx, :]

username                                           spidermanmovie
convert_text    wait amazing spiderman theaters today see imax...
Name: 127, dtype: object

In [47]:
from collections import Counter

# Word-frequency table for the document selected by `idx`.
wordstring = concat.iloc[idx, :].convert_text
wordlist = wordstring.split()
# Count every distinct word in a single pass. Calling wordlist.count(w) for
# each token rescans the whole list every time, which is quadratic in the
# document length.
word_counts = Counter(wordlist)
wordfreq = [word_counts[w] for w in wordlist]  # per-token frequency, aligned with wordlist
print(len(wordlist), len(wordfreq))
freq_dic = dict(word_counts)  # word -> count, keys in first-occurrence order
print(len(freq_dic))

7488 7488
1937


In [48]:
sorted(freq_dic.items(), key=lambda d: d[1], reverse=True)

[('spiderman', 330),
 ('spidermanhomecoming', 185),
 ('amazing', 169),
 ('regram', 144),
 ('link', 109),
 ('bio', 109),
 ('spidermanfarfromhome', 82),
 ('see', 75),
 ('andrew', 71),
 ('garfield', 68),
 ('today', 63),
 ('get', 59),
 ('spidey', 59),
 ('premiere', 55),
 ('emma', 47),
 ('spideysighting', 47),
 ('tomholland', 45),
 ('stone', 45),
 ('watch', 45),
 ('tix', 42),
 ('digital', 42),
 ('new', 40),
 ('iamjamiefoxx', 39),
 ('peter', 37),
 ('world', 35),
 ('theaters', 34),
 ('day', 32),
 ('cast', 30),
 ('movie', 30),
 ('amazingspiderman', 29),
 ('latergram', 29),
 ('days', 29),
 ('weekend', 29),
 ('bluray', 29),
 ('one', 28),
 ('trailer', 27),
 ('tonight', 26),
 ('make', 26),
 ('fan', 26),
 ('look', 25),
 ('us', 24),
 ('tickets', 24),
 ('beamazing', 24),
 ('tomorrow', 23),
 ('spiderfans', 23),
 ('parker', 23),
 ('happy', 23),
 ('love', 23),
 ('danedehaan', 23),
 ('check', 22),
 ('nyc', 22),
 ('time', 22),
 ('nt', 22),
 ('photo', 22),
 ('theamazingspiderman', 21),
 ('thanks', 21),
 ('

In [None]:
# Hold out part of the per-username documents for validation. `train_data`
# and `val_data` were used in this cell without being defined anywhere in the
# notebook, so it raised NameError on a fresh kernel.
VAL_FRACTION = 0.2  # share of usernames held out for validation
SPLIT_SEED = 42     # fixed seed so the split is identical on every run
val_data = concat.sample(frac=VAL_FRACTION, random_state=SPLIT_SEED)
train_data = concat.drop(val_data.index)

# Encode each split separately so the rows of X_train / X_val line up with
# the rows of y_train / y_val.
sequences_train = tokenizer.texts_to_sequences(train_data.convert_text)
sequences_valid = tokenizer.texts_to_sequences(val_data.convert_text)

# Validation sequences are padded/truncated to the training length.
X_train = pad_sequences(sequences_train)
X_val = pad_sequences(sequences_valid,maxlen=X_train.shape[1])
# Expects `labels` to still be the per-username list built from `data`
# (a later cell rebinds the same name to a dict).
labels = np.asarray(labels)

# `concat` has a 0..n-1 index and `labels` was built in the same row order,
# so the index values of each split select its matching label rows.
y_train = np.asarray(labels[train_data.index.values])
y_val = np.asarray(labels[val_data.index.values])

print('Shape of X train and X validation tensor:', X_train.shape,X_val.shape)
print('Shape of label train and validation tensor:', y_train.shape,y_val.shape)

# Embedding

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

# Vocabulary words that have no pretrained vector (written to disk later).
outOfDict = []

word_vectors = KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

EMBEDDING_DIM=300
# Dedicated seeded generator so the random vectors given to out-of-vocabulary
# words, and therefore the initial embedding matrix, are the same on every run.
OOV_SEED = 0
oov_rng = np.random.RandomState(OOV_SEED)

# One row per word index that the embedding layer can receive.
vocabulary_size=min(len(word_index)+1,NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    # Indices at or beyond NUM_WORDS have no row in the matrix.
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        # No pretrained vector for this word: draw one from N(0, 0.25)
        # (standard deviation sqrt(0.25)) and remember the word.
        embedding_matrix[i]=oov_rng.normal(0,np.sqrt(0.25),EMBEDDING_DIM)
        outOfDict.append(word)

In [None]:
# Persist the words that had no pretrained vector, for later inspection.
write_json(outOfDict, 'unstemmed_outOfDict.json')

In [None]:
# del(word_vectors)

from keras.layers import Embedding
# Embedding layer initialised from embedding_matrix; trainable=True lets
# training update these weights.
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

## 儲存路徑

In [None]:
# Run name; used as the file stem of every artifact this notebook writes.
SAVE_NAME = 'merge_stride7-starwars'
HISTORY_SAVE = './history/' + SAVE_NAME + '.csv'
WEIGHTS_SAVE = './weight/' + SAVE_NAME + '.h5'

# Create the target folders up front so the save cells further down do not
# fail on a fresh checkout, after training has already been paid for.
for out_dir in ('./history', './weight', './output'):
    newPath(out_dir)

## Model Structure

In [None]:
from keras import regularizers
from keras.callbacks import EarlyStopping
from keras.layers import (Conv1D, Conv2D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D, Input, MaxPooling1D,
                          MaxPooling2D, concatenate)
from keras.layers.core import Reshape, Flatten
from keras.models import Model
from keras.optimizers import Adam

sequence_length = X_train.shape[1]
filter_sizes = [3,4,5]   # heights (in tokens) of the three convolution kernels
num_filters = 100        # feature maps per kernel size
drop = 0.5               # dropout rate in front of the classifier
stride = 7
# NOTE(review): each pooling window below spans the full height of its
# convolution output, so the pooled height is always 1 and `stride` cannot
# change the output shape - confirm this is intended, given the "stride7"
# run name.

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
# Add a trailing channel axis so the embedded sequence can go through Conv2D.
reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

# Each kernel covers the full embedding width and `filter_sizes[k]` tokens.
conv_0 = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_1 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

# Max-over-time pooling: keep the strongest response of every filter.
maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[0] + 1, 1), strides=(stride,1))(conv_0)
maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1), strides=(stride,1))(conv_1)
maxpool_2 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1), strides=(stride,1))(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
# The extra Reshape that used to follow Flatten was removed: its output was
# never connected to the model.
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
# One sigmoid unit per genre in `columns` (20 in this dataset), so the
# prediction table built later always has matching columns.
output = Dense(units=len(columns), activation='sigmoid',kernel_regularizer=regularizers.l2(0.01))(dropout)

# Model mapping the padded token-index sequences to per-genre scores.
model = Model(inputs, output)

In [None]:
model.summary()

In [None]:
# Multi-label setup: binary cross-entropy over the per-genre sigmoid outputs.
adam = Adam(lr=1e-3)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['acc'])

# Early stopping watches the validation loss (default EarlyStopping settings).
callbacks = [EarlyStopping(monitor='val_loss')]

# Start training.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    verbose=1,
    callbacks=callbacks,
)

In [None]:
# Save the trained model to WEIGHTS_SAVE. This calls model.save (not
# save_weights); a later cell reads such a file back with load_model.
model.save(WEIGHTS_SAVE)

In [None]:
# Collect the per-epoch curves recorded by model.fit and store them as CSV.
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

his_df = pd.DataFrame(
    {
        'Epoch': epochs,
        'Loss': loss,
        'Acc': acc,
        'Val_loss': val_loss,
        'Val_acc': val_acc,
    },
    columns=['Epoch', 'Loss', 'Acc', 'Val_loss', 'Val_acc'],
)
his_df.to_csv(HISTORY_SAVE, index=False)

In [None]:
# Plot the results
import matplotlib.pyplot as plt

# One figure per metric: training curve in blue, validation curve in red.
curve_specs = [
    (acc, 'Training acc', val_acc, 'Validation acc', 'Training and validation accuracy'),
    (loss, 'Training loss', val_loss, 'Validation loss', 'Training and validation loss'),
]
for train_curve, train_label, val_curve, val_label, figure_title in curve_specs:
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(figure_title)
    plt.legend()

    plt.show()

## Testing

In [None]:
# Encode the test texts with the fitted tokenizer and pad them to the
# training sequence length.
sequences_test=tokenizer.texts_to_sequences(test_data.convert_text)
X_test = pad_sequences(sequences_test,maxlen=X_train.shape[1])
# This call used to be commented out, which left `pred` undefined for the
# following cells when the notebook is run top to bottom.
pred = model.predict(X_test)

In [None]:
pred.shape

In [None]:
# Per-genre scores for every test account, with `username` as the first
# column, written to the run's result file.
predictions = pred
ordered_cols = ["username"] + columns
results = (
    pd.DataFrame(predictions, columns=columns)
    .assign(username=test_data.username)
    [ordered_cols]
)
results.to_csv("./output/"+ SAVE_NAME + "_result.csv",index=False)

## Label the top-n genres (n set dynamically per movie)

In [None]:
# Ground-truth table; the evaluation loop reads its `username` column and
# sums the remaining values of each row to get the number of true genres.
true_df= pd.read_csv('./input/true_df.csv')
true_df

In [None]:
# Read the predictions back from the CSV written for this SAVE_NAME.
results = pd.read_csv("./output/"+ SAVE_NAME + "_result.csv")
results

In [None]:
# Rebinds `labels` (previously the array of training labels) to the mapping
# that the evaluation loop indexes by username.
labels = read_json('../genresDic.json')

In [None]:
def countAcc(predList, trueList):
    """Return the share of true labels that the predictions hit.

    The score is the number of entries of ``predList`` that occur in
    ``trueList``, divided by ``len(trueList)``.

    Args:
        predList: Predicted labels.
        trueList: Ground-truth labels.

    Returns:
        The hit ratio as a float; 0.0 when ``trueList`` is empty, since
        there is nothing to hit (this used to raise ZeroDivisionError).
    """
    denominator = len(trueList)
    if denominator == 0:
        return 0.0
    hits = sum(1 for g in predList if g in trueList)
    return hits / denominator

In [None]:
# Per-account evaluation: take as many top-scoring genres as the account has
# true genres and measure how many of them are correct.
li = []
for i, row in true_df.iterrows():
    username = row['username']
    # Only rows where both tables refer to the same account are scored;
    # mismatching rows are skipped.
    if username == results.loc[i, 'username']:
        print(i, username)
        truth = labels[username]
        # Cast to a plain int so the value is always a valid slice bound,
        # even if the CSV columns were parsed as floats.
        numTrue = int(sum(row[1:]))
        print('count:', numTrue)

        # Scores of this account without the leading username column.
        s = results.loc[i,:][1:]
        top_n = s.sort_values(ascending=False)[:numTrue]
        inList = list(top_n.index)

        # 1 for every genre among the top-n predictions, else 0.
        onehot = [int(g in inList) for g in columns]

        # Named row_acc rather than acc: `acc` holds the training-accuracy
        # history that the plotting cell reads, and must not be overwritten.
        row_acc = countAcc(inList, truth)
        print(row_acc, inList, truth)

        li.append([username, row_acc, truth, inList] + onehot)
        

In [None]:
# One row per scored account: accuracy, true genres, top-n predicted genres,
# then one 0/1 column per genre.
acc_df = pd.DataFrame(li, columns = ['username', 'acc', 'truth', 'top_n'] + columns)
acc_df

In [None]:
# Write the evaluation table without the row index.
acc_df.to_csv("./output/"+ SAVE_NAME + "_bi.csv", index=False)

In [None]:
acc_df['acc'].mean()

# Feature Extraction
## 測試reload model準確度

In [None]:
# Rebinds SAVE_NAME for the reload check below; HISTORY_SAVE and WEIGHTS_SAVE
# were built from the previous value and are not updated by this.
SAVE_NAME = 'test'

In [None]:
# NOTE(review): this loads "merge_stride8-starwars.h5", while the training
# section of this notebook saves "merge_stride7-starwars.h5" - confirm that
# the stride8 file is the intended one.
WEIGHTS_LOAD = './weight/merge_stride8-starwars.h5'

from keras.models import load_model

# Replaces the in-memory `model` with the one stored in WEIGHTS_LOAD.
model = load_model(WEIGHTS_LOAD)

In [None]:
model.summary()

In [None]:
# Re-encode the test texts and score them with the reloaded model.
sequences_test=tokenizer.texts_to_sequences(test_data.convert_text)
X_test = pad_sequences(sequences_test,maxlen=X_train.shape[1])
pred = model.predict(X_test)
pred.shape

In [None]:
# Per-genre scores of the reloaded model, with `username` as the first
# column, written to the result file of the current SAVE_NAME.
predictions = pred
ordered_cols = ["username"] + columns
results = (
    pd.DataFrame(predictions, columns=columns)
    .assign(username=test_data.username)
    [ordered_cols]
)
results.to_csv("./output/"+ SAVE_NAME + "_result.csv",index=False)

In [None]:
# Mapping that the evaluation loop below indexes by username.
labels = read_json('../genresDic.json')

In [None]:
# Per-account evaluation of the reloaded model: take as many top-scoring
# genres as the account has true genres and measure how many are correct.
li = []
for i, row in true_df.iterrows():
    username = row['username']
    # Only rows where both tables refer to the same account are scored;
    # mismatching rows are skipped.
    if username == results.loc[i, 'username']:
        print(i, username)
        truth = labels[username]
        # Cast to a plain int so the value is always a valid slice bound,
        # even if the CSV columns were parsed as floats.
        numTrue = int(sum(row[1:]))
        print('count:', numTrue)

        # Scores of this account without the leading username column.
        s = results.loc[i,:][1:]
        top_n = s.sort_values(ascending=False)[:numTrue]
        inList = list(top_n.index)

        # 1 for every genre among the top-n predictions, else 0.
        onehot = [int(g in inList) for g in columns]

        # Named row_acc rather than acc: `acc` holds the training-accuracy
        # history that the plotting cell reads, and must not be overwritten.
        row_acc = countAcc(inList, truth)
        print(row_acc, inList, truth)

        li.append([username, row_acc, truth, inList] + onehot)

In [None]:
# Mean per-account accuracy of the reloaded model.
acc_df = pd.DataFrame(li, columns = ['username', 'acc', 'truth', 'top_n'] + columns)
acc_df['acc'].mean()

## 萃取97部電影imdb的feature

In [None]:
X_test.shape

In [None]:
# Pick the layer whose output serves as the feature vector. The name
# 'flatten_4' is an auto-generated Keras layer name and only exists in models
# saved from one particular session, so fall back to the last Flatten layer
# when the loaded model does not contain it.
FEATURE_LAYER_NAME = 'flatten_4'
if FEATURE_LAYER_NAME in [layer.name for layer in model.layers]:
    feature_layer = model.get_layer(FEATURE_LAYER_NAME)
else:
    flatten_layers = [layer for layer in model.layers
                      if type(layer).__name__ == 'Flatten']
    if not flatten_layers:
        raise ValueError('model has no Flatten layer to extract features from')
    feature_layer = flatten_layers[-1]

# Sub-model that shares the loaded model's input and stops at that layer.
extract_layer = Model(inputs=model.input,
                      outputs=feature_layer.output)
# Use this sub-model's predictions as the extracted features
extract_output = extract_layer.predict(X_test)
print(extract_output.shape)

In [None]:
# Usernames of the test rows, used as row labels of the feature table.
# Rebinds `idx`, which was the integer row number inspected earlier.
idx = test_data.username.tolist()
print(len(idx), idx)

In [None]:
# One row of extracted features per test username.
res=pd.DataFrame(extract_output, index = idx)
res

In [None]:
# Write the feature table without a header row; the username index is kept
# as the first column.
res.to_csv("./output/TextFeatureVec.csv", header=False)