# CNN in Keras with pretrained word2vec weights
Reference: https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights/notebook

In [1]:
import json
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Helper function
def writeProgress(msg, count, total):
    """Write an in-place progress indicator such as ``"Loading: 42.00%"``.

    Args:
        msg: Text written in front of the percentage.
        count: Number of items processed so far.
        total: Total number of items. A total of ``0`` is reported as
            ``0.00%`` instead of raising ``ZeroDivisionError``.
    """
    # An empty workload has no meaningful ratio; show 0% rather than crash.
    fraction = count / total if total else 0.0
    # The trailing carriage return sends the cursor back to the start of the
    # line, so the next call overwrites this message instead of appending.
    sys.stdout.write(msg + "{:.2%}\r".format(fraction))
    sys.stdout.flush()
    
def newPath(path):
    """Make sure the directory ``path`` exists.

    Missing parent directories are created as well, and calling this on an
    already existing directory is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    Args:
        src_path: Path of the JSON file to read.

    Returns:
        The decoded Python object (dict, list, ...).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data, dst_path):
    """Serialize ``data`` as JSON and write it to ``dst_path``.

    Args:
        data: JSON-serializable object.
        dst_path: Destination file path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value that is not JSON-serializable.
            The destination file is left untouched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization failure afterwards would destroy the previous content.
    serialized = json.dumps(data)
    with open(dst_path, 'w', encoding='utf-8') as outfile:
        outfile.write(serialized)

In [3]:
# Load the ordered genre list and show its length followed by its contents.
columns = read_json('../orderedListGenres.json')
print(len(columns), columns)

20 ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


# Load data

In [4]:
# `data` is indexed by username further down to build the `labels` list.
# NOTE(review): the shape of each value is not visible here - confirm.
data = read_json('./input/mergeGenresMat.json')
print(len(data))

1631


In [5]:
# Read the training rows, discard any row with a missing value, and
# renumber the index so it is contiguous again after the drop.
train_data_IG = (
    pd.read_csv('./stopword/train_IG.csv')
    .dropna()
    .reset_index(drop=True)
)
train_data_IG

Unnamed: 0,username,convert_text,genres
0,21bridgesmovie,chadwickboseman bringing bridges alma mater ho...,"['Action', 'Crime', 'Drama']"
1,21bridgesmovie,politics jk simmons captain mckenna bridges th...,"['Action', 'Crime', 'Drama']"
2,21bridgesmovie,thrilling score music bridges composed henry j...,"['Action', 'Crime', 'Drama']"
3,21bridgesmovie,experience manhunt century see bridges playing...,"['Action', 'Crime', 'Drama']"
4,21bridgesmovie,discover truth bridges starring chadwickbosema...,"['Action', 'Crime', 'Drama']"
...,...,...,...
30636,zombieland,got ta look real close one use zombieland skil...,"['Action', 'Comedy', 'Horror']"
30637,zombieland,actually lit zombieland artist dinotomic,"['Action', 'Comedy', 'Horror']"
30638,zombieland,horror comedy cast else could ask zombieland p...,"['Action', 'Comedy', 'Horror']"
30639,zombieland,said like actually said wrong zombieland playing,"['Action', 'Comedy', 'Horror']"


In [6]:
# Read the test rows; unlike the training frame, the output below shows no `genres` column.
test_data=pd.read_csv('./stopword/test_imdb.csv')
test_data

Unnamed: 0,username,convert_text
0,21bridgesmovie,embattled nypd detective thrust citywide manhu...
1,47metersdown,four teen girls diving ruined underwater city ...
2,abeautifuldaymovie,based true story reallife friendship fred roge...
3,abominablemovie,three teenagers must help yeti return family a...
4,adastramovie,astronaut roy mcbride undertakes mission acros...
...,...,...
160,wrinklestheclown,florida parents hire wrinkles clown scare misb...
161,xmenmovies,jean grey begins develop incredible powers cor...
162,yardiefilm,british crime drama film directed idris elba b...
163,yesterdaymovie,struggling musician realizes person earth reme...


In [7]:
# Stack the training and test rows into one frame with a fresh 0..n-1 index.
# The test rows have no `genres` value, so that column is empty for them
# (see the tail of the output below).
df = pd.concat([train_data_IG, test_data],ignore_index=True)
df

Unnamed: 0,username,convert_text,genres
0,21bridgesmovie,chadwickboseman bringing bridges alma mater ho...,"['Action', 'Crime', 'Drama']"
1,21bridgesmovie,politics jk simmons captain mckenna bridges th...,"['Action', 'Crime', 'Drama']"
2,21bridgesmovie,thrilling score music bridges composed henry j...,"['Action', 'Crime', 'Drama']"
3,21bridgesmovie,experience manhunt century see bridges playing...,"['Action', 'Crime', 'Drama']"
4,21bridgesmovie,discover truth bridges starring chadwickbosema...,"['Action', 'Crime', 'Drama']"
...,...,...,...
30801,wrinklestheclown,florida parents hire wrinkles clown scare misb...,
30802,xmenmovies,jean grey begins develop incredible powers cor...,
30803,yardiefilm,british crime drama film directed idris elba b...,
30804,yesterdaymovie,struggling musician realizes person earth reme...,


In [8]:
# Merge all text rows that share a username into one space-separated
# document per username, then persist the result for later inspection.
concat = (
    df.groupby(['username'])['convert_text']
    .apply(' '.join)
    .reset_index()
)
concat.to_csv('./concatUsername.csv')
concat

Unnamed: 0,username,convert_text
0,21bridgesmovie,chadwickboseman bringing bridges alma mater ho...
1,47metersdown,sharks hungry meters uncaged hits theaters aug...
2,abeautifuldaymovie,two weeks take trip back neighborhood beautifu...
3,abominablemovie,abominable movie loved one wish list get digit...
4,adastramovie,photographer stephenwilkes photo shoot natgeo ...
...,...,...
160,wrinklestheclown,tag someone deserves visit wrinkles wrinkles c...
161,xmenmovies,darkphoenix fanartfriday fameart xmen director...
162,yardiefilm,shoutout amlameenbaby sbtvonline pull yardie y...
163,yesterdaymovie,feelgood movie summer yesterday movie theaters...


In [9]:
# Sanity check: frame dimensions and the number of missing values per column.
print(concat.shape)
print(concat.isna().sum())

(165, 2)
username        0
convert_text    0
dtype: int64


In [10]:
# Pull the usernames and their merged documents out as two parallel lists.
ids, texts = (
    concat[column].tolist() for column in ('username', 'convert_text')
)
print(len(ids), len(texts))

165 165


In [11]:
# Look up each username's entry in `data`, keeping the order of `ids`.
labels = [data[username] for username in ids]
len(labels)

165

# Tokenize

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [54]:
NUM_WORDS = 30000

# Lower-case the documents and strip the listed punctuation / whitespace
# characters before splitting them into tokens.
tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters='—!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower=True,
)
tokenizer.fit_on_texts(texts)

# word_index is not truncated to NUM_WORDS: the run recorded below reports
# more unique tokens than NUM_WORDS.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 30968 unique tokens.


# Embedding

In [55]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

# NOTE(review): presumably meant to collect words missing from the embedding
# vocabulary; nothing in the visible cells appends to it - confirm it is used.
outOfDict = []

# Load the pretrained word2vec vectors from a file in binary format (binary=True).
word_vectors = KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

In [118]:
# Position of the row to inspect (originally written as 166 - 2).
# NOTE(review): the reason for that derivation is not visible here - confirm.
idx = 164
concat.iloc[idx]

username                                               zombieland
convert_text    omg like totally zoeydeutch birthday today hap...
Name: 164, dtype: object

In [119]:
# Split the selected row's merged text into whitespace-separated tokens.
wordstring = concat.iloc[idx, :].convert_text
wordlist = wordstring.split()

# Count every token once up front. The previous wordlist.count(w) call inside
# the loop rescanned the whole list for each token (quadratic time).
token_counts = Counter(wordlist)

# Keep only tokens that have a pretrained vector. Repeated tokens are kept on
# purpose so `words` and `wordfreq` stay parallel, one entry per occurrence.
words = [w for w in wordlist if w in word_vectors]
wordfreq = [token_counts[w] for w in words]

print(len(words), len(wordfreq))
# Collapsing into a dict leaves one entry per distinct in-vocabulary token.
freq_dic = dict(zip(words, wordfreq))
print(len(freq_dic))

1208 1208
533


In [120]:
# Rank the (word, count) pairs from most to least frequent; pairs with equal
# counts keep the dictionary's iteration order.
freq_sort = sorted(
    freq_dic.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
freq_sort

[('double', 61),
 ('tap', 59),
 ('theaters', 23),
 ('digital', 21),
 ('get', 15),
 ('see', 14),
 ('back', 13),
 ('playing', 12),
 ('nt', 10),
 ('days', 10),
 ('rule', 10),
 ('today', 9),
 ('new', 9),
 ('october', 9),
 ('link', 9),
 ('bio', 9),
 ('us', 9),
 ('zombie', 9),
 ('like', 8),
 ('happy', 8),
 ('one', 8),
 ('tonight', 8),
 ('columbus', 7),
 ('little', 7),
 ('bluray', 7),
 ('comedy', 7),
 ('miss', 7),
 ('art', 7),
 ('movie', 7),
 ('year', 7),
 ('birthday', 6),
 ('tallahassee', 6),
 ('wichita', 6),
 ('rock', 6),
 ('cast', 6),
 ('rules', 6),
 ('love', 6),
 ('time', 5),
 ('know', 5),
 ('premiere', 5),
 ('two', 5),
 ('tomorrow', 5),
 ('everyone', 5),
 ('zombies', 5),
 ('scenes', 5),
 ('make', 5),
 ('week', 5),
 ('got', 5),
 ('coming', 4),
 ('america', 4),
 ('party', 4),
 ('alternate', 4),
 ('extended', 4),
 ('lot', 4),
 ('good', 4),
 ('director', 4),
 ('season', 4),
 ('said', 4),
 ('seconds', 4),
 ('em', 4),
 ('ready', 4),
 ('dead', 4),
 ('tag', 4),
 ('madison', 4),
 ('holiday', 4),


In [121]:
# Print only the words, one per line, in ranked order.
for word, _count in freq_sort:
    print(word)

double
tap
theaters
digital
get
see
back
playing
nt
days
rule
today
new
october
link
bio
us
zombie
like
happy
one
tonight
columbus
little
bluray
comedy
miss
art
movie
year
birthday
tallahassee
wichita
rock
cast
rules
love
time
know
premiere
two
tomorrow
everyone
zombies
scenes
make
week
got
coming
america
party
alternate
extended
lot
good
director
season
said
seconds
em
ready
dead
tag
madison
holiday
look
sure
actually
omg
join
nut
shut
special
welcome
home
fun
right
pick
copy
inspired
follow
dying
would
world
check
arrives
watch
family
life
comes
bonus
including
hilarious
reel
laughs
thursday
show
years
woody
ho
ultra
hd
dvd
artist
real
ha
long
take
friends
yesterday
filmmakers
early
screening
fans
babylon
everywhere
binge
may
else
math
adds
talent
first
pm
pt
original
download
available
want
skills
aka
hard
always
takes
let
holidays
eww
kickass
partner
hits
bad
celebrate
locked
loaded
content
nine
blooper
boom
yeah
writers
ruben
action
meet
oct
heard
caught
light
start
mall
use
side
