# Abstract

# Introduction

# Background

# Methods

### Folders
* Model folder: word2vec model and DNN model (with accuracy of 18%~19%)
* Cnnmodel folder: CNN model (with acuracy of 28%~30%)
* Rnnmodel folder: LSTM model (with accuracy of 39%~40%)


### Scripts
* *handle_json.py* - The original lines.json file contains all lines from the show "Friends", which was in JSON format, we converted the character names into numbers and write them into data/feature_raw.txt. We also pick only the 6 main characters' lines (6 main characters: Ross, Rachel, Joey, Chandler, Monica and Phoebe).


* *extract_label_and_sentence.py* - Extract labels from data/feature_raw.txt and write them into data/label.txt, also extract the segmantations into data/sentence.txt.


* *extract_feature.py* - Train word vectors using word2vec (4 dimensions) and calculate the feature vectors of each sentence (take the average of the word vectors in each sentence), then write feature vectors into data/feature.txt. This process is mainly for DNN training because CNN and LSTM use embedding which doesn't train word vector the same way.


* *main_word2vec.py* - DNN with 3 hidden layers. The neuron numbers of each layer is 40, 20 and 10, respectively. The input dimension is 4 (4 features) and the output dimension is 6 (6 characters). The first 2 layers‘ activation function is sigmoid and the last layer's is softmax. The learning rate is 0.0001 and there are 1000 iterations. Note that there is a parameter, is_train, in the model, if is_train is True, it starts to train a new model, otherwise it takes the trained model.


* *main.py* - Similar to main_word2vec.py, but it is DNN with embedding.


* *data_helpers.py* - Helps to batch process the data


* *cnn_model.py* - CNN with an embedding layer (100 dimensions word vectos), a CNN layer, a pool layer and a softmax layer to output the probability of each label.


* *textCNN.py* - It takes the cnn_model.py to train or test the lines data. It takes 90% of the lines for training and 10% of them for testing. The accuracy is 28%~30%. The learn rate is 0.0001.

* *textRNN.py* - RNN with an embedding layer, a bi-lstm layer, a concat layer, a fully connected layer and a softmax layer. It takes 90% of the lines for training and 10% of them for testing. The accuracy is 39%~40%. The learn rate is 0.0001.


### handle_json.py

In [31]:
import json

def main():
  dict = {}
  select = {}   # select the characters

  with open('lines.json', 'r', encoding='utf-8') as f:
    text = json.load(f)
  for i in range(len(text)):
    name = text[i]['character']
    if name not in dict:
      dict[name] = 1
    else:
      dict[name] += 1
  line_left = 0
  for k,v in dict.items():
    if v > 1000:    # pick only the main characters who has more than 1000 lines
      select[k] = v
      line_left += v
  print(select)

  #print(select)
  print("Now there are {}% lines left, {} people left.".format(line_left/28877*100, len(select)))

  # label with numbers
  label = {}
  now_label = 0
  for k,v in select.items():
    if k in label:
      continue
    label[k] = now_label
    now_label += 1

  # write it to feature_raw.txt file (label + sentence)
  with open('data/feature_raw.txt', 'w', encoding='utf-8') as f:
    for i in range(len(text)):
      name = text[i]['character']
      sentence = text[i]['text']
      if name in select:
        f.write(str(label[name]) + '\t')
        f.write(sentence + '\n')

if __name__ == '__main__':
  main()

{'Monica': 4213, 'Joey': 4308, 'Phoebe': 3795, 'Chandler': 4235, 'Ross': 4475, 'Rachel': 4641}
Now there are 88.88388683034941% lines left, 6 people left.


### extract_label_and_sentence.py

In [5]:
import nltk
import numpy as np
import string

from nltk.tokenize import WordPunctTokenizer  

stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

# Read the file
def read_file(file):
  with open(file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
  for i in range(len(lines)):
    lines[i] = lines[i].strip()
  return lines

def get_label_and_sentence(content):
  label, sentence = [], []
  for i in range(len(content)):
    seg = content[i].split()
    label.append(int(seg[0]))   #labels

    #take a line
    sent = ' '.join(seg[1:])
    words = WordPunctTokenizer().tokenize(sent)
    this_sentence = []
    for ele in words:
      if (ele not in punctuation) and (ele not in stopwords):
        this_sentence.append(ele)
    sentence.append(this_sentence)

  return label, sentence

def main():
  content = read_file("data/feature_raw.txt")
  label, sentence = get_label_and_sentence(content)   # Get labels and sentence list

  # Write label
  with open("data/label.txt", 'w', encoding='utf-8') as f:
    for i in range(len(label)):
      f.write(str(label[i]) + '\n')  
  # Write lines
  with open('data/sentence.txt', 'w', encoding='utf-8') as f:
    for i in range(len(sentence)):
      for j in range(len(sentence[i])):
        f.write(sentence[i][j])
        if j != len(sentence[i])-1:
          f.write('\t')
      f.write('\n')


if __name__ == '__main__':
  main()

### extract_feature.py

In [7]:
import logging
import multiprocessing
import numpy as np

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors

feature_size = 4

'''
word2vec
dimension：feature_size，Iterations：200
'''
def word_to_vec(file_in, file_out1, file_out2):
	logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
	logging.root.setLevel(level=logging.INFO)
	model = Word2Vec(LineSentence(file_in), size=feature_size, window=5, min_count=1, workers=multiprocessing.cpu_count(), iter=200)
	model.save(file_out1)
	model.wv.save_word2vec_format(file_out2, binary=False)

def read_sentence(file):
	res = []
	with open(file, 'r', encoding='utf-8') as f:
		lines = f.readlines()
	for i in range(len(lines)):
		line = lines[i].split()
		res.append(line)
	return res

def main():
	train_vector = True		# True = Train, False = Take a trained model

	if train_vector:
		word_to_vec('data/sentence.txt', 'model/word.model', 'model/word.vector')
	
	# tale trained model
	word_vectors = KeyedVectors.load_word2vec_format('model/word.vector', binary=False)

	feature = []		# Feature vector of all sentences
	sentence = read_sentence('data/sentence.txt')
	for i in range(len(sentence)):
		fea = np.zeros(feature_size)		# Feature vector of a specific sentence
		for j in range(len(sentence[i])):
			v = word_vectors[sentence[i][j]]		# word vector in the sentence
			fea += v
		fea = fea / len(sentence[i])		# feature vector of the sentence, weighted average of the word vectors
		feature.append(fea)

	# Add feature vector into the feature.txt file
	with open('data/feature.txt', 'w', encoding='utf-8') as f:
		for i in range(len(feature)):
			for j in range(feature[i].shape[0]):
				f.write(str(feature[i][j]))
				if j != feature[i].shape[0]-1:
					f.write('\t')
			f.write('\n')

if __name__ == '__main__':
	main()

2018-03-25 00:43:37,927: INFO: collecting all words and their counts
2018-03-25 00:43:37,929: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-25 00:43:38,017: INFO: PROGRESS: at sentence #10000, processed 90100 words, keeping 8189 word types
2018-03-25 00:43:38,099: INFO: PROGRESS: at sentence #20000, processed 180347 words, keeping 11504 word types
2018-03-25 00:43:38,142: INFO: collected 13319 word types from a corpus of 230239 raw words and 25667 sentences
2018-03-25 00:43:38,143: INFO: Loading a fresh vocabulary
2018-03-25 00:43:38,198: INFO: min_count=1 retains 13319 unique words (100% of original 13319, drops 0)
2018-03-25 00:43:38,205: INFO: min_count=1 leaves 230239 word corpus (100% of original 230239, drops 0)
2018-03-25 00:43:38,298: INFO: deleting the raw counts dictionary of 13319 items
2018-03-25 00:43:38,303: INFO: sample=0.001 downsamples 57 most-common words
2018-03-25 00:43:38,305: INFO: downsampling leaves estimated 185858 word corpus 

2018-03-25 00:43:43,887: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:43:43,895: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:43:43,902: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:43:43,904: INFO: EPOCH - 15 : training on 230239 raw words (185605 effective words) took 0.3s, 547825 effective words/s
2018-03-25 00:43:44,249: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:43:44,251: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:43:44,257: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:43:44,262: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:43:44,263: INFO: EPOCH - 16 : training on 230239 raw words (185814 effective words) took 0.4s, 528408 effective words/s
2018-03-25 00:43:44,520: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:43:44,

2018-03-25 00:43:49,121: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:43:49,124: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:43:49,135: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:43:49,136: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:43:49,138: INFO: EPOCH - 32 : training on 230239 raw words (185964 effective words) took 0.3s, 662376 effective words/s
2018-03-25 00:43:49,689: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:43:49,691: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:43:49,693: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:43:49,706: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:43:49,708: INFO: EPOCH - 33 : training on 230239 raw words (185828 effective words) took 0.6s, 327790 effective words/s
2018-03-25 00:43:50,

2018-03-25 00:43:55,411: INFO: EPOCH - 48 : training on 230239 raw words (185833 effective words) took 0.3s, 662521 effective words/s
2018-03-25 00:43:55,667: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:43:55,670: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:43:55,678: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:43:55,686: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:43:55,688: INFO: EPOCH - 49 : training on 230239 raw words (185859 effective words) took 0.3s, 680453 effective words/s
2018-03-25 00:43:56,094: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:43:56,096: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:43:56,107: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:43:56,114: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:43:56,

2018-03-25 00:44:00,766: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:00,768: INFO: EPOCH - 65 : training on 230239 raw words (186106 effective words) took 0.3s, 683401 effective words/s
2018-03-25 00:44:01,078: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:01,081: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:01,084: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:01,094: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:01,096: INFO: EPOCH - 66 : training on 230239 raw words (185829 effective words) took 0.3s, 572053 effective words/s
2018-03-25 00:44:01,379: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:01,381: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:01,382: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:01,

2018-03-25 00:44:06,037: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:06,043: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:06,044: INFO: EPOCH - 82 : training on 230239 raw words (185813 effective words) took 0.3s, 711179 effective words/s
2018-03-25 00:44:06,317: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:06,320: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:06,331: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:06,332: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:06,333: INFO: EPOCH - 83 : training on 230239 raw words (185864 effective words) took 0.3s, 648888 effective words/s
2018-03-25 00:44:06,595: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:06,597: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:06,

2018-03-25 00:44:11,463: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:11,472: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:11,476: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:11,477: INFO: EPOCH - 99 : training on 230239 raw words (185870 effective words) took 0.3s, 659932 effective words/s
2018-03-25 00:44:11,723: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:11,724: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:11,728: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:11,735: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:11,736: INFO: EPOCH - 100 : training on 230239 raw words (185823 effective words) took 0.3s, 728084 effective words/s
2018-03-25 00:44:12,014: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:12

2018-03-25 00:44:16,391: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:16,393: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:16,398: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:16,404: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:16,405: INFO: EPOCH - 116 : training on 230239 raw words (185759 effective words) took 0.3s, 681464 effective words/s
2018-03-25 00:44:16,657: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:16,661: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:16,663: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:16,669: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:16,670: INFO: EPOCH - 117 : training on 230239 raw words (185712 effective words) took 0.3s, 707606 effective words/s
2018-03-25 00:44:1

2018-03-25 00:44:21,038: INFO: EPOCH - 132 : training on 230239 raw words (185822 effective words) took 0.3s, 579226 effective words/s
2018-03-25 00:44:21,301: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:21,305: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:21,312: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:21,319: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:21,320: INFO: EPOCH - 133 : training on 230239 raw words (185867 effective words) took 0.3s, 663974 effective words/s
2018-03-25 00:44:21,570: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:21,580: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:21,583: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:21,590: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:2

2018-03-25 00:44:26,433: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:26,435: INFO: EPOCH - 149 : training on 230239 raw words (185846 effective words) took 0.3s, 551173 effective words/s
2018-03-25 00:44:26,690: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:26,697: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:26,704: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:26,710: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:26,711: INFO: EPOCH - 150 : training on 230239 raw words (185789 effective words) took 0.3s, 685693 effective words/s
2018-03-25 00:44:26,963: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:26,965: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:26,973: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:2

2018-03-25 00:44:31,548: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:31,557: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:31,558: INFO: EPOCH - 166 : training on 230239 raw words (186030 effective words) took 0.3s, 582848 effective words/s
2018-03-25 00:44:31,815: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:31,817: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:31,822: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:31,826: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:31,828: INFO: EPOCH - 167 : training on 230239 raw words (185864 effective words) took 0.3s, 697659 effective words/s
2018-03-25 00:44:32,067: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:32,068: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:3

2018-03-25 00:44:36,394: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:36,403: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:36,404: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:36,406: INFO: EPOCH - 183 : training on 230239 raw words (185708 effective words) took 0.3s, 553702 effective words/s
2018-03-25 00:44:36,654: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:36,658: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:36,665: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:36,667: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:36,668: INFO: EPOCH - 184 : training on 230239 raw words (185900 effective words) took 0.3s, 717047 effective words/s
2018-03-25 00:44:36,911: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:3

2018-03-25 00:44:41,229: INFO: worker thread finished; awaiting finish of 3 more threads
2018-03-25 00:44:41,230: INFO: worker thread finished; awaiting finish of 2 more threads
2018-03-25 00:44:41,234: INFO: worker thread finished; awaiting finish of 1 more threads
2018-03-25 00:44:41,244: INFO: worker thread finished; awaiting finish of 0 more threads
2018-03-25 00:44:41,246: INFO: EPOCH - 200 : training on 230239 raw words (185882 effective words) took 0.3s, 726415 effective words/s
2018-03-25 00:44:41,247: INFO: training on a 46047800 raw words (37168939 effective words) took 62.7s, 593212 effective words/s
2018-03-25 00:44:41,249: INFO: saving Word2Vec object under model/word.model, separately None
2018-03-25 00:44:41,250: INFO: not storing attribute vectors_norm
2018-03-25 00:44:41,252: INFO: not storing attribute cum_table
2018-03-25 00:44:41,297: INFO: saved model/word.model
2018-03-25 00:44:41,299: INFO: storing 13319x4 projection weights into model/word.vector
2018-03-25 00:4

### main_word2vec.py

In [14]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read features
def read_feature(file):
  print("reading feature information...\n")
  res = []
  with open(file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
  for line in lines:
    line = line.split()
    for i in range(len(line)):
      line[i] = float(line[i])
    res.append(line)
  return np.array(res)

# Read labels
def read_label(file):
  print("reading label information...\n")
  res = []
  with open(file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
  for line in lines:
    line = int(line.strip())
    res.append(line)
  return np.array(res)

def addLayer(inputData, inSize, outSize, activity_function = None):  
    Weights = tf.Variable(tf.random_normal([inSize, outSize]))   
    basis = tf.Variable(tf.random_uniform([1,outSize], -1, 1))    
    weights_plus_b = tf.matmul(inputData, Weights) + basis  
    #Wx_plus_b = tf.nn.dropout(weights_plus_b, keep_prob = 0.8)     # To prevent overfitting

    if activity_function is None:  
        ans = weights_plus_b  
    else:  
        ans = activity_function(weights_plus_b)
    return ans  

def net(x_data, y_data, x_test, y_test):
    is_train = True


    insize = x_data.shape[1]
    outsize = 8
    xs = tf.placeholder(tf.float32,[None, insize]) 
    ys = tf.placeholder(tf.float32,[None, outsize]) 
    keep_prob = tf.placeholder(tf.float32)  
      
    l1 = addLayer(xs, insize, 40,activity_function=None)  
    l2 = addLayer(l1, 40, 20,activity_function=tf.nn.sigmoid)  
    l3 = addLayer(l2, 20, 10,activity_function=tf.nn.softmax)  
    l4 = addLayer(l3, 10, outsize,activity_function=tf.nn.softmax)


    y = l4
    #loss = tf.reduce_sum(tf.reduce_sum(tf.square((ys-l4)),reduction_indices = [1]))  
    #loss = -tf.reduce_mean(ys * tf.log(l3))
    #loss = tf.reduce_sum(tf.square((ys-y)))
    loss = -tf.reduce_sum(ys * tf.log(y))
    #loss = tf.reduce_sum(-tf.reduce_sum(ys * tf.log(y),reduction_indices=[1]))  # loss  
    train =  tf.train.GradientDescentOptimizer(0.00001).minimize(loss) 

    # Turn 1 dimensional label vectors to 14 dimensional vectors which has only one element = 1
    new_ydata = []
    for i in range(y_data.shape[0]):
      new_ydata.append([0]*outsize)
      new_ydata[i][y_data[i]] = 1
      # print(new_ydata[i])
    new_ydata = np.array(new_ydata)
        
    saver=tf.train.Saver()
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        if is_train: 
            run_step = 4000
            for i in range(run_step):  
                sess.run(train,feed_dict={xs:x_data,ys:new_ydata})  
                if i%50 == 0:  
                    print(sess.run(loss,feed_dict={xs:x_data,ys:new_ydata}))
            # save the model
            saver=tf.train.Saver(max_to_keep=1)
            saver.save(sess,'model/net.ckpt')
        else:     # take a trained model
            saver.restore(sess, 'model/net.ckpt')
            print("save success!")

        # Prediction
        res = sess.run(fetches=y, feed_dict={xs: x_test})
        new_res = []
        for ele in res:
            mmax = -1111
            index = -1
            for i in range(outsize):
                if ele[i] > mmax:
                    index, mmax  = i, ele[i]
            new_res.append(index)
        #print(new_res)
        new_res = np.array(new_res)
        counter = 0
        for i in range(len(new_res)):
          if(y_test[i] == new_res[i]):
            counter += 1
        print("Accuracy: ", counter/len(new_res))
        print(classification_report(new_res, y_test))

def main():
  feature = read_feature('data/feature.txt')
  label = read_label('data/label.txt')

  x_train , x_test , y_train , y_test = train_test_split(feature, label, test_size = 0.1,random_state=0)
  net(x_train, y_train, x_test, y_test)

if __name__ == '__main__':
  main()

reading feature information...

reading label information...

52205.2
44056.9
42801.9
42327.2
42073.9
41916.0
41805.3
41722.5
41660.9
41614.4
41577.8
41548.0
41523.3
41502.3
41484.3
41468.7
41455.0
41442.9
41432.0
41422.1
41413.2
41405.0
41397.5
41390.5
41384.2
41378.2
41372.7
41367.6
41362.8
41358.3
41354.1
41350.1
41346.4
41342.8
41339.5
41336.3
41333.3
41330.4
41327.7
41325.0
41322.5
41320.0
41317.7
41315.4
41313.2
41311.0
41308.9
41306.9
41304.9
41303.0
41301.1
41299.3
41297.5
41295.9
41294.2
41292.6
41291.0
41289.5
41288.0
41286.6
41285.2
41283.8
41282.4
41281.1
41279.8
41278.5
41277.2
41276.0
41274.7
41273.5
41272.2
41271.0
41269.8
41268.6
41267.4
41266.2
41265.0
41263.9
41262.7
41261.5
Accuracy:  0.1978963770938839
             precision    recall  f1-score   support

          0       0.07      0.25      0.11       122
          1       0.09      0.17      0.12       228
          2       0.03      0.24      0.05        37
          3       0.13      0.18      0.15       309
  

### main.py

In [16]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import data_helpers
#from text_cnn import TextCNN
from tensorflow.contrib import learn
from sklearn.metrics import accuracy_score  


def read_feature(file):
  print("reading feature information...\n")
  res = []
  with open(file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
  for line in lines:
    line = line.split()
    for i in range(len(line)):
      line[i] = float(line[i])
    res.append(line)
  return np.array(res)

def read_label(file):
  print("reading label information...\n")
  res = []
  with open(file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
  for line in lines:
    line = int(line.strip())
    res.append(line)
  return np.array(res)

def addLayer(inputData, inSize, outSize, activity_function = None):  
    Weights = tf.Variable(tf.random_normal([inSize, outSize]))   
    basis = tf.Variable(tf.random_uniform([1,outSize], -1, 1))    
    weights_plus_b = tf.matmul(inputData, Weights) + basis  
    Wx_plus_b = tf.nn.dropout(weights_plus_b, keep_prob = 1)     # To prevent overfitting

    if activity_function is None:  
        ans = weights_plus_b  
    else:  
        ans = activity_function(weights_plus_b)
    return ans  

def net(x_data, y_data, x_test, y_test):
    is_train = True


    insize = x_data.shape[1]
    outsize = 6
    xs = tf.placeholder(tf.float32,[None, insize])   
    ys = tf.placeholder(tf.float32,[None, outsize])  
    keep_prob = tf.placeholder(tf.float32)  
      
    l1 = addLayer(xs, insize, 40,activity_function=tf.nn.sigmoid)  
    l2 = addLayer(l1, 40, 20,activity_function=tf.nn.sigmoid)  
    l3 = addLayer(l2, 20, 10,activity_function=tf.nn.sigmoid)  
    l4 = addLayer(l3, 10, outsize,activity_function=tf.nn.softmax)
    #l5 = addLayer(l4, 10, outsize,activity_function=tf.nn.softmax)


    y = l4
    #loss = tf.reduce_sum(tf.reduce_sum(tf.square((ys-l4)),reduction_indices = [1]))  
    #loss = -tf.reduce_mean(ys * tf.log(l3))
    #loss = tf.reduce_sum(tf.square((ys-y)))
    #oss = -tf.reduce_sum(ys * tf.log(y))
    loss = tf.reduce_mean(-tf.reduce_sum(ys * tf.log(y),reduction_indices=[1]))  # loss  
    train =  tf.train.GradientDescentOptimizer(0.0001).minimize(loss) 

    new_ydata = []
    for i in range(y_data.shape[0]):
      new_ydata.append([0]*outsize)
      new_ydata[i][y_data[i]] = 1
      # print(new_ydata[i])
    new_ydata = np.array(new_ydata)
        
    saver=tf.train.Saver()
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        if is_train: 
            run_step = 1000
            for i in range(run_step):  
                sess.run(train,feed_dict={xs:x_data,ys:new_ydata})  
                if i%50 == 0:  
                    print(sess.run(loss,feed_dict={xs:x_data,ys:new_ydata}))
            # save the model
            saver=tf.train.Saver(max_to_keep=1)
            saver.save(sess,'model/net.ckpt')
        else:     # use an existing model
            saver.restore(sess, 'model/net.ckpt')
            print("save success!")

        # Prediction
        res = sess.run(fetches=y, feed_dict={xs: x_test})
        new_res = []
        for ele in res:
            mmax = -1111
            index = -1
            for i in range(outsize):
                if ele[i] > mmax:
                    index, mmax  = i, ele[i]
            new_res.append(index) 
        #print(new_res)
        new_res = np.array(new_res)
        counter = 0
        for i in range(len(new_res)):
          if (y_test[i] == new_res[i]):
            counter += 1
        #print("Accuracy: ", counter/len(new_res))
        print("Accuracy: ", accuracy_score(y_test, new_res))

def main():
  #feature = read_feature('data/feature.txt')
  #label = read_label('data/label.txt')

  print("Loading data...")
  x_text, y = data_helpers.load_data_and_labels("data/sentence.txt", "data/label.txt")

  '''
  outsize = 8
  new_ydata = []
  for i in range(len(y)):
    new_ydata.append([0]*outsize)
    new_ydata[i][y[i]] = 1
    #print(new_ydata[i])
  new_ydata = np.array(new_ydata)
  y = new_ydata'''

  # Build vocabulary
  max_document_length = max([len(x.split(" ")) for x in x_text])
  vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
  #feature = np.array(list(vocab_processor.fit_transform(x_text)))
  #label = np.array(y)
  x = np.array(list(vocab_processor.fit_transform(x_text)))

  #print(x.shape)
  #print(max_document_length)

  # Randomly shuffle data
  np.random.seed(10)
  shuffle_indices = np.random.permutation(np.arange(len(y)))

  feature = np.array(x)[shuffle_indices]
  label = np.array(y)[shuffle_indices]

  x_train , x_test , y_train , y_test = train_test_split(feature, label, test_size = 0.1)
  net(x_train, y_train, x_test, y_test)

if __name__ == '__main__':
  main()

Loading data...
3.5121
3.49469
3.47797
3.46151
3.44524
3.42944
3.41388
3.39861
3.38354
3.36865
3.35384
3.33911
3.32435
3.30993
3.29582
3.28172
3.26782
3.2541
3.24049
3.22707
Accuracy:  0.162835995325


### cnn_model.py

In [27]:
import tensorflow as tf
import numpy as np


class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")


### textCNN.py

In [None]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
#from cnn_model import TextCNN
from tensorflow.contrib import learn

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", 0.1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("feature_file", "data/sentence.txt", "feature data (sentence).")
tf.flags.DEFINE_string("label_file", "data/label.txt", "label data (number).")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "2,3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0001, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 120, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 20, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 1, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.feature_file, FLAGS.label_file)

outsize = 6
new_ydata = []
for i in range(len(y)):
  new_ydata.append([0]*outsize)
  new_ydata[i][y[i]] = 1
  #print(new_ydata[i])
new_ydata = np.array(new_ydata)
y = new_ydata

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

print(x.shape)
print(max_document_length)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = np.array(x)[shuffle_indices]
y_shuffled = np.array(y)[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

del x, y, x_shuffled, y_shuffled


print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))


# Training
# ==================================================


os.environ['CUDA_VISIBLE_DEVICES'] = '3'
config = tf.ConfigProto(allow_soft_placement=True)          #my modification
config.gpu_options.allow_growth = True

with tf.Graph().device('/gpu:3'):
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(0.0001)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        #out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        #print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Write vocabulary
        #vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, loss, accuracy = sess.run(
                [train_op, global_step, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            #train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1
            }
            step, loss, accuracy = sess.run(
                [global_step, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            #if writer:
                #writer.add_summary(summaries, step)
            return accuracy



        is_train = False         

        if is_train:
            for kk in range(10):
                # Generate batches
                batches = data_helpers.batch_iter(
                    list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
                # Training loop. For each batch...
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\n\n\nEvaluation:")
                        test_acc = dev_step(x_dev, y_dev, writer=None)
                        print("accuracy on test data is: {}\n\n\n".format(test_acc))
                        saver.save(sess,'cnnmodel/net.ckpt')
        else:
            saver.restore(sess, 'cnnmodel/net.ckpt')
            print("reload success!")
            test_acc = dev_step(x_dev, y_dev, writer=None)            
            print("\n\nmodel accuracy on test data is: {}%\n\n".format(test_acc*100))


AttributeError: module 'tensorflow' has no attribute 'flags'