<h1> Predict the type of question with Doc2Vec and Word2vec</h1>

In [3168]:
from __future__ import division

import os
import re
# random
from random import shuffle,sample

# numpy
import numpy as np
import psycopg2
from gensim import utils
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from gensim.models import Doc2Vec
from gensim import corpora, models, similarities

# classifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


<h1> Connect to the DB </h1>

In [3169]:
# Open the PostgreSQL connection used by every query below.
# The connection string can be supplied through the AMAZON_DB_DSN environment
# variable; the literal below is only a fallback so existing runs keep working.
# NOTE(review): the fallback embeds a database password in the notebook --
# rotate it and remove the default once the environment variable is in place.
DB_DSN = os.environ.get(
    "AMAZON_DB_DSN",
    "host=localhost port=5432 dbname=amazon user=postgres password=darkmatter",
)
conn = psycopg2.connect(DB_DSN)
cur = conn.cursor()

In [3535]:
# Pull the questions whose human label is 'yes/no' (labelled by Attila or Ruth)
# together with the qestion_type and qestion_type_bow columns stored beside them.
cur.execute(
    "SELECT question,qestion_type,qestion_type_bow,qestion_type_human from training "
    "WHERE qestion_type_human  = 'yes/no' and (name='Attila' or name='Ruth' ) ;"
)
result = cur.fetchall()

# Split the rows into one list per column of interest.
Qyn, Qyn_Type_data, Qyn_Type_bow = [], [], []
for row in result:
    Qyn.append(row[0])
    Qyn_Type_data.append(row[1])
    Qyn_Type_bow.append(row[2])

In [3536]:
# Pull the questions whose human label is 'open-ended' (labelled by Attila or
# Ruth) together with the qestion_type and qestion_type_bow columns.
cur.execute(
    "SELECT question,qestion_type,qestion_type_bow,qestion_type_human from training "
    "WHERE qestion_type_human  = 'open-ended' and (name='Attila' or name='Ruth') ;"
)
result = cur.fetchall()

# Split the rows into one list per column of interest.
Qoe, Qoe_Type_data, Qoe_Type_bow = [], [], []
for row in result:
    Qoe.append(row[0])
    Qoe_Type_data.append(row[1])
    Qoe_Type_bow.append(row[2])

In [3285]:
# Fraction of the hand-labelled questions that are yes/no (class balance).
n_yes_no, n_open_ended = len(Qyn), len(Qoe)
n_yes_no / (n_yes_no + n_open_ended)

0.7176669484361792

<h1> setup training </h1>

In [3537]:
def process_line(sentence):
    """Lower-case ``sentence`` and split it into word tokens.

    A token is a maximal run of the letters a-z and the apostrophe. Every
    other character (digits, whitespace, punctuation such as '?') acts as a
    separator and is discarded.

    Parameters
    ----------
    sentence : str
        Raw question text.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty when the text contains no
        letters or apostrophes.
    """
    # findall already yields whitespace-free tokens without '?', so the former
    # join / replace('?') / split round-trip was a no-op and has been removed.
    return re.findall("[a-z']+", sentence.lower())


# Words filtered out of every tokenized question.
stoplist = set('for a of the and to in rt'.split())

<h3> turn questions into bag of words. sample 1/2 for training </h3>

In [3538]:
# Tokenize every yes/no question and drop the stop words.
qs_yn = []
for sentence in Qyn:
    qs_yn.append([token for token in process_line(sentence) if token not in stoplist])
# Shuffle in place; the first half becomes the training sample.
shuffle(qs_yn)
qs_yn_sample = qs_yn[:len(qs_yn) // 2]

# Same treatment for the open-ended questions.
qs_oe = []
for sentence in Qoe:
    qs_oe.append([token for token in process_line(sentence) if token not in stoplist])
shuffle(qs_oe)
qs_oe_sample = qs_oe[:len(qs_oe) // 2]

<h3> Set up labels </h3>

In [1040]:
# One tagged document per question, built from its first three tokens only.
# Tags are 'YN_<index>' for yes/no questions and 'OE_<index>' for open-ended
# ones, where <index> is the position in qs_yn / qs_oe.
docs = [TaggedDocument(tokens[:3], ['YN_' + str(idx)]) for idx, tokens in enumerate(qs_yn)]
docs.extend(TaggedDocument(tokens[:3], ['OE_' + str(idx)]) for idx, tokens in enumerate(qs_oe))

<h3> Train doc2vec </h3>

In [1108]:
# Create a Doc2Vec model, register the vocabulary of the tagged documents and
# train on them.
# NOTE(review): size=4 presumably sets the document-vector length (the arrays
# later filled from model.docvecs have 4 columns) -- confirm against the
# installed gensim version, whose train() signature may also differ.
model = Doc2Vec(min_count=1, window=10, size=4, sample=1e-4, negative=2, workers=2)
model.build_vocab(docs)
model.train(docs)



5673

In [1109]:
#model.docvecs['OE_7']

<h1> Set up logistic regression training </h1>

In [1110]:
# Build equally sized feature/label arrays for both classes: the number of
# rows per class is the size of the open-ended training sample.
n_per_class = len(qs_oe_sample)

all_yn_arrays = np.zeros((n_per_class, 4))
all_oe_arrays = np.zeros((n_per_class, 4))
all_yn_labels = np.ones(n_per_class)    # yes/no questions are class 1
all_oe_labels = np.zeros(n_per_class)   # open-ended questions are class 0

# Row k holds the trained document vector tagged 'YN_k' / 'OE_k'.
for row in range(n_per_class):
    all_yn_arrays[row] = model.docvecs['YN_%d' % row]
    all_oe_arrays[row] = model.docvecs['OE_%d' % row]

<h3> Set up test labels for logistic regression </h3>

In [1111]:
# Split each class at the same fraction into a training part (leading rows)
# and a test part (remaining rows), then stack the two classes.
Nyn = len(all_yn_arrays)
Noe = len(all_oe_arrays)

ratio = 0.5

# Slice bounds must be integers: N * ratio is a float, which NumPy rejects as
# an index, so truncate explicitly.
yn_cut = int(Nyn * ratio)
oe_cut = int(Noe * ratio)

train_arrays = np.vstack((all_yn_arrays[:yn_cut], all_oe_arrays[:oe_cut]))
train_labels = np.hstack((all_yn_labels[:yn_cut], all_oe_labels[:oe_cut]))

test_arrays = np.vstack((all_yn_arrays[yn_cut:], all_oe_arrays[oe_cut:]))
test_labels = np.hstack((all_yn_labels[yn_cut:], all_oe_labels[oe_cut:]))

<h1> Run Logistic Regression Fit </h1>

In [1112]:
# Fit a logistic-regression classifier with default settings on the
# doc2vec training split.
classifier = LogisticRegression()
classifier.fit(X=train_arrays, y=train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

<h3> Test Logistic Regression on training set (used to create the doc2vec model) </h3>

In [1113]:
# Mean accuracy of the fitted classifier on the held-back rows.
classifier.score(X=test_arrays, y=test_labels)

0.45555555555555555

<h1> Test on data not used to train the word2vec model </h1>

<h3> Y/N </h3>

In [1114]:
import warnings
warnings.filterwarnings('ignore')
# Infer a document vector from the first three tokens of each of the last 100
# yes/no questions and classify it. predict() expects a 2-D array of shape
# (n_samples, n_features), so the single inferred vector is reshaped to one row.
# NOTE(review): the tagged documents were built from all of qs_yn, so these
# questions may already have been seen during Doc2Vec training -- confirm.
prediction = [
    classifier.predict(model.infer_vector(line_arr[:3]).reshape(1, -1))
    for line_arr in qs_yn[-100:]
]

In [1115]:
# Share of the yes/no questions that were predicted as class 1 (yes/no).
n_predicted_yes_no = sum(val == 1 for val in prediction)
n_predicted_yes_no / len(prediction)

array([ 0.53])

In [1116]:
#[(val1,val2) for val1,val2 in zip(prediction,qs_yn[-200:])]

<h3> OE </h3>

In [1117]:
# Same check for the last 100 open-ended questions. The inferred vector is
# reshaped to a single row because predict() expects a 2-D array.
prediction_oe = [
    classifier.predict(model.infer_vector(line_arr[:3]).reshape(1, -1))
    for line_arr in qs_oe[-100:]
]
# Share predicted as class 0 (open-ended).
sum([val==0 for val in prediction_oe])/len(prediction_oe)

array([ 0.49])

<h1> Test on a sentence </h1>

In [742]:
# Classify a single hand-written question. The inferred document vector is
# reshaped to one row because predict() expects a 2-D array.
lin_arr = "can I use this with an iphone".split()
classifier.predict(model.infer_vector(lin_arr).reshape(1, -1))

array([ 1.])

In [743]:
# Classify a second hand-written question. The inferred document vector is
# reshaped to one row because predict() expects a 2-D array.
lin_arr = "how do you plug this is".split()
classifier.predict(model.infer_vector(lin_arr).reshape(1, -1))

array([ 0.])

<h1> Use bigrams and the first word of a sentence for logistic regression </h1>

In [3179]:
# Load the previously saved Word2Vec model from disk.
QMODELB_PATH = '/home/ubuntu/TallLabs/models/QmodelB'
QmodelB = models.Word2Vec.load(QMODELB_PATH)

In [3491]:
# Build a class-balanced training/test set from the word2vec vectors of the
# first three words of each question.

def _leading_word_vectors(questions, n_words, word_vectors):
    """Collect the vectors of the first ``n_words`` tokens of every question.

    A question is skipped when it has fewer than ``n_words`` tokens or when
    any of those leading tokens is missing from ``word_vectors``.

    Returns
    -------
    (per_position, lengths, skipped)
        ``per_position[k]`` is the list of vectors of token ``k`` of each kept
        question, ``lengths`` holds the token count of each kept question and
        ``skipped`` lists the questions that were left out.
    """
    per_position = [[] for _ in range(n_words)]
    lengths = []
    skipped = []
    for tokens in questions:
        head = tokens[:n_words]
        if len(head) == n_words and all(word in word_vectors for word in head):
            for bucket, word in zip(per_position, head):
                bucket.append(word_vectors[word])
            lengths.append(len(tokens))
        else:
            skipped.append(tokens)
    return per_position, lengths, skipped


# Only questions with at least three tokens can supply three word vectors.
# Shuffle, then keep the first half of each class as the training pool.
qs_yn = [val for val in qs_yn if len(val) > 2]
shuffle(qs_yn)
qs_yn_sample = qs_yn[:len(qs_yn) // 2]

qs_oe = [val for val in qs_oe if len(val) > 2]
shuffle(qs_oe)
qs_oe_sample = qs_oe[:len(qs_oe) // 2]

# Yes/no questions (label 1): use as many as there are open-ended samples to
# get a 50/50 mix, without ever indexing past the end of the yes/no sample.
n_yn = min(len(qs_yn_sample), len(qs_oe_sample))
(all_yn_arrays, all_yn_arrays2, all_yn_arrays3), len_yn, skipped_yn = \
    _leading_word_vectors(qs_yn_sample[:n_yn], 3, QmodelB)
all_yn_labels = [1] * len(len_yn)
for _ in skipped_yn:
    print('fail')

# Open-ended questions (label 0).
(all_oe_arrays, all_oe_arrays2, all_oe_arrays3), len_oe, skipped_oe = \
    _leading_word_vectors(qs_oe_sample, 3, QmodelB)
all_oe_labels = [0] * len(len_oe)
for question in skipped_oe:
    print('fail', question)

# Fourth-word features and the per-position label lists are not populated in
# this variant; they stay empty so the names below remain defined.
all_yn_arrays4 = []
all_oe_arrays4 = []
all_yn_labels2 = []
all_oe_labels2 = []
all_yn_labels3 = []
all_oe_labels3 = []
all_yn_labels4 = []
all_oe_labels4 = []

# The first N rows of each class go to training, the remainder to testing.
Noe = len(all_oe_arrays)
ratio = 0.99
N = int(Noe * ratio)

len_yn = np.array(len_yn)
len_oe = np.array(len_oe)

train_arrays = np.vstack((all_yn_arrays[:N], all_oe_arrays[:N]))
len_train = np.hstack((len_yn[:N], len_oe[:N]))
len_train = len_train.reshape(len(len_train), 1)
train_labels = np.hstack((all_yn_labels[:N], all_oe_labels[:N]))
train_arrays2 = np.vstack((all_yn_arrays2[:N], all_oe_arrays2[:N]))
train_arrays3 = np.vstack((all_yn_arrays3[:N], all_oe_arrays3[:N]))
train_arrays4 = np.vstack((all_yn_arrays4[:N], all_oe_arrays4[:N]))

test_arrays = np.vstack((all_yn_arrays[N:], all_oe_arrays[N:]))
len_test = np.hstack((len_yn[N:], len_oe[N:]))
len_test = len_test.reshape(len(len_test), 1)
test_labels = np.hstack((all_yn_labels[N:], all_oe_labels[N:]))
test_arrays2 = np.vstack((all_yn_arrays2[N:], all_oe_arrays2[N:]))
test_arrays3 = np.vstack((all_yn_arrays3[N:], all_oe_arrays3[N:]))
test_arrays4 = np.vstack((all_yn_arrays4[N:], all_oe_arrays4[N:]))


fail
fail
fail ['is', 'that', 'harcore', 'or', 'usual', 'nitro', 'tech']
fail ['is', 'this', 'shibari', 'my', 'wand', 'or', 'new', 'generation', 'hitachi', 'wand']
fail ['what', 'warrebnty', 'does', 'it', 'have']


In [3459]:
#plot length of words
#bins=range(25)
#plt.hist(len_yn,bins=bins,normed=1)
#plt.hist(len_oe,color='red',alpha=0.5,bins=bins,normed=1)

<h3> Use PCA to reduce each word vector to 1 principal component </h3>

In [3444]:
#pca_train_vec=np.vstack((train_arrays,test_arrays))
#pca_train_vec2=np.vstack((train_arrays2,test_arrays2))
#pca_train_vec3=np.vstack((train_arrays3,test_arrays3))
#pca_train_vec4=np.vstack((train_arrays4,test_arrays4))

In [3479]:
# Leading (up to three) tokens of every question, yes/no first, then open-ended.
all_first_three_words = [tokens[:3] for tokens in qs_yn + qs_oe]

# Look up the word2vec vector of each of those tokens; tokens that are not in
# the model are reported with 'fail' and skipped.
all_first_three_wordVec = []
for leading_tokens in all_first_three_words:
    for token in leading_tokens:
        try:
            vector = QmodelB[token]
        except KeyError:
            print('fail')
        else:
            all_first_three_wordVec.append(vector)

# Vectors of every word listed in the model's index2word.
all_words = [QmodelB[entry] for entry in QmodelB.index2word]

fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail


In [3492]:
# Fit a one-component, whitened PCA on the vectors of all leading words, then
# project the first-, second- and third-word matrices of the training and
# test splits through that same fit.
pca = PCA(n_components=1, whiten=True)
pcaFit = pca.fit(all_first_three_wordVec)

vectors2d_train, vectors2d_train2, vectors2d_train3 = [
    pcaFit.transform(word_block)
    for word_block in (train_arrays, train_arrays2, train_arrays3)
]
vectors2d_test, vectors2d_test2, vectors2d_test3 = [
    pcaFit.transform(word_block)
    for word_block in (test_arrays, test_arrays2, test_arrays3)
]

#vectors2d_train4 = pcaFit.transform(train_arrays4)
#vectors2d_test4 = pcaFit.transform(test_arrays4)

In [3493]:
# Stack the per-word PCA scores column-wise: one column each for the first,
# second and third word. The columns are rebuilt from the raw word-vector
# matrices so that re-running this cell yields the same three columns instead
# of stacking further columns onto an already stacked vectors2d_train.
vectors2d_train = np.hstack(
    [pcaFit.transform(word_block) for word_block in (train_arrays, train_arrays2, train_arrays3)]
)
vectors2d_test = np.hstack(
    [pcaFit.transform(word_block) for word_block in (test_arrays, test_arrays2, test_arrays3)]
)

# To add the question length as a feature, append len_train / len_test as an
# extra block to the hstack calls above.

In [3497]:
# Fit a random forest (default settings) on the three PCA features.
# LogisticRegression(penalty='l2') is the alternative classifier tried here.
classifier = RandomForestClassifier()
classifier.fit(X=vectors2d_train, y=train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [3498]:
# Mean accuracy on the rows held back from training.
classifier.score(X=vectors2d_test, y=test_labels)

1.0

In [3499]:
# Per-feature importances of the fitted classifier.
# NOTE(review): presumably one value per PCA column (first, second, third
# word), assuming the classifier was fitted on vectors2d_train -- confirm.
classifier.feature_importances_

array([ 0.43848656,  0.25901269,  0.30250075])

<h1> test on real questions </h1>

In [3500]:
import warnings
warnings.filterwarnings('ignore')

# Classify the second half of the yes/no questions from the PCA scores of
# their first three words.
prediction = []
for line_arr in qs_yn[-int(len(qs_yn)/2):]:
    try:
        # One row per word, in sentence order.
        first_three = np.vstack([QmodelB[line_arr[0]],
                                 QmodelB[line_arr[1]],
                                 QmodelB[line_arr[2]]])
    except KeyError:
        # A leading word is missing from the word2vec model: fall back to
        # label 1 (yes/no).
        # NOTE(review): on this yes/no set the fallback is counted as a
        # correct prediction by the ratio below.
        prediction.append(1)
        continue
    # transform() and predict() both expect 2-D input: PCA maps each word row
    # to one number, and the three numbers are laid out as one feature row.
    features = pcaFit.transform(first_three).reshape(1, -1)
    prediction.append(classifier.predict(features))
# Share of these yes/no questions predicted as yes/no.
sum(prediction)/len(prediction)

array([ 0.75721154])

In [3254]:
# Precision of the yes/no class (this was labelled "recall"): predictions of
# 1 on the yes/no questions divided by all predictions of 1 on either set.
# NOTE(review): assumes `prediction` holds results for yes/no questions and
# `prediction2` for open-ended ones, with 1 meaning yes/no -- confirm, since
# `prediction2` is assigned in a cell further down.
sum(prediction)/(sum(prediction)+sum(prediction2))

array([ 0.91071429])

In [2665]:
#score of their algorithm: share of the last (up to) 200 yes/no questions
#whose stored qestion_type is also 'yes/no'. The denominator is the number of
#rows actually inspected, so the ratio stays correct with fewer than 200 rows.
their_yn_types = Qyn_Type_data[-200:]
sum(val == 'yes/no' for val in their_yn_types) / max(len(their_yn_types), 1)

0.71

In [3501]:
# Classify the second half of the open-ended questions from the PCA scores of
# their first three words.
prediction2 = []
for line_arr in qs_oe[-int(len(qs_oe)/2):]:
    try:
        # One row per word, in sentence order.
        first_three = np.vstack([QmodelB[line_arr[0]],
                                 QmodelB[line_arr[1]],
                                 QmodelB[line_arr[2]]])
    except KeyError:
        # A leading word is missing from the word2vec model: fall back to
        # label 1 (yes/no), which counts as a miss on this open-ended set.
        prediction2.append(1)
        continue
    # transform() and predict() both expect 2-D input: PCA maps each word row
    # to one number, and the three numbers are laid out as one feature row.
    features = pcaFit.transform(first_three).reshape(1, -1)
    prediction2.append(classifier.predict(features))
# Share of these open-ended questions predicted as open-ended (label 0).
sum([int(val==0) for val in prediction2])/len(prediction2)

0.74846625766871167

In [3264]:
#score of their algorithm: share of the last (up to) 200 open-ended questions
#whose stored qestion_type is also 'open-ended'. The denominator is the number
#of rows actually inspected, so the ratio stays correct with fewer than 200 rows.
their_oe_types = Qoe_Type_data[-200:]
sum(val == 'open-ended' for val in their_oe_types) / max(len(their_oe_types), 1)

0.84

In [1543]:
# Number of rows in the current training matrix (both classes stacked).
len(train_arrays)

242

<h3> save classifier </h3>

In [1212]:
from sklearn.externals import joblib

# Write the current classifier to disk with pickle protocol 2.
# NOTE(review): protocol 2 is presumably chosen so the file can be loaded from
# Python 2, as the "_py2" suffix suggests -- confirm.
THREE_WORD_MODEL_FILE = 'three_word_logreg_py2.pkl'
joblib.dump(classifier, THREE_WORD_MODEL_FILE, protocol=2)

['three_word_logreg_py2.pkl',
 'three_word_logreg_py2.pkl_01.npy',
 'three_word_logreg_py2.pkl_02.npy',
 'three_word_logreg_py2.pkl_03.npy',
 'three_word_logreg_py2.pkl_04.npy']

<h1> Model on first word only </h1>

In [3539]:
# Build a class-balanced training/test set from the word2vec vector of the
# FIRST word of each question only.

# Drop empty questions, shuffle, and keep the first half of each class as the
# training pool.
qs_yn = [val for val in qs_yn if len(val) > 0]
shuffle(qs_yn)
qs_yn_sample = qs_yn[:len(qs_yn) // 2]

qs_oe = [val for val in qs_oe if len(val) > 0]
shuffle(qs_oe)
qs_oe_sample = qs_oe[:len(qs_oe) // 2]

all_yn_arrays, all_yn_labels, len_yn = [], [], []
all_oe_arrays, all_oe_labels, len_oe = [], [], []

# Yes/no questions (label 1): use as many as there are open-ended samples to
# get a 50/50 mix, without ever indexing past the end of the yes/no sample.
n_yn = min(len(qs_yn_sample), len(qs_oe_sample))
for tokens in qs_yn_sample[:n_yn]:
    if tokens[0] in QmodelB:
        all_yn_arrays.append(QmodelB[tokens[0]])
        len_yn.append(len(tokens))
        all_yn_labels.append(1)
    else:
        # First word unknown to the word2vec model: skip the question.
        print('fail')

# Open-ended questions (label 0).
for tokens in qs_oe_sample:
    if tokens[0] in QmodelB:
        all_oe_arrays.append(QmodelB[tokens[0]])
        len_oe.append(len(tokens))
        all_oe_labels.append(0)
    else:
        print('fail', tokens)

# The first N rows of each class go to training, the remainder to testing.
Noe = len(all_oe_arrays)
ratio = 0.99
N = int(Noe * ratio)

len_yn = np.array(len_yn)
len_oe = np.array(len_oe)

train_arrays = np.vstack((all_yn_arrays[:N], all_oe_arrays[:N]))
len_train = np.hstack((len_yn[:N], len_oe[:N]))
len_train = len_train.reshape(len(len_train), 1)
train_labels = np.hstack((all_yn_labels[:N], all_oe_labels[:N]))

test_arrays = np.vstack((all_yn_arrays[N:], all_oe_arrays[N:]))
len_test = np.hstack((len_yn[N:], len_oe[N:]))
len_test = len_test.reshape(len(len_test), 1)
test_labels = np.hstack((all_yn_labels[N:], all_oe_labels[N:]))

fail


In [3540]:
# Fit an L2-penalised logistic regression directly on the first-word vectors
# (no PCA step in this variant). RandomForestClassifier() is the alternative
# classifier tried here.
classifier = LogisticRegression(penalty='l2')
classifier.fit(X=train_arrays, y=train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [3541]:
# Mean accuracy on the rows held back from training.
classifier.score(X=test_arrays, y=test_labels)

0.66666666666666663

<h3> test on real questions </h3>

In [3542]:
import warnings
warnings.filterwarnings('ignore')

# Classify the second half of the yes/no questions from the word2vec vector
# of their first word.
prediction = []
for line_arr in qs_yn[-int(len(qs_yn)/2):]:
    try:
        first_word_vector = QmodelB[line_arr[0]]
    except KeyError:
        # First word missing from the word2vec model: fall back to label 1
        # (yes/no).
        # NOTE(review): on this yes/no set the fallback is counted as a
        # correct prediction by the ratio below.
        prediction.append(1)
        continue
    # predict() expects a 2-D array, so pass the vector as a single row.
    prediction.append(classifier.predict(first_word_vector.reshape(1, -1)))
# Share of these yes/no questions predicted as yes/no.
sum(prediction)/len(prediction)

0.86616161616161613

In [3543]:
# Classify the second half of the open-ended questions from the word2vec
# vector of their first word.
prediction2 = []
for line_arr in qs_oe[-int(len(qs_oe)/2):]:
    try:
        first_word_vector = QmodelB[line_arr[0]]
    except KeyError:
        # First word missing from the word2vec model: fall back to label 1
        # (yes/no), which counts as a miss on this open-ended set.
        prediction2.append(1)
        continue
    # predict() expects a 2-D array, so pass the vector as a single row.
    prediction2.append(classifier.predict(first_word_vector.reshape(1, -1)))

# Share of these open-ended questions predicted as open-ended (label 0).
sum([int(val==0) for val in prediction2])/len(prediction2)

0.78169014084507038

In [3544]:
from sklearn.externals import joblib

# Write the first-word classifier to disk with pickle protocol 2.
# NOTE(review): protocol 2 is presumably chosen so the file can be loaded from
# Python 2, as the "_py2" suffix suggests -- confirm.
FIRST_WORD_MODEL_FILE = '/home/ubuntu/TallLabs/models/first_word_logreg_py2.pkl'
joblib.dump(classifier, FIRST_WORD_MODEL_FILE, protocol=2)

['/home/ubuntu/TallLabs/models/first_word_logreg_py.pkl',
 '/home/ubuntu/TallLabs/models/first_word_logreg_py.pkl_01.npy',
 '/home/ubuntu/TallLabs/models/first_word_logreg_py.pkl_02.npy',
 '/home/ubuntu/TallLabs/models/first_word_logreg_py.pkl_03.npy',
 '/home/ubuntu/TallLabs/models/first_word_logreg_py.pkl_04.npy']

<h3> backup of model </h3>

In [None]:
# Backup of an earlier three-word variant. It keeps exactly one row per
# sampled question: when one of the first three words is missing from the
# word2vec model, the row is filled with the previous question's vectors
# instead of being dropped.

def _fill_three_word_rows(questions, targets, word_vectors, report_failures):
    """Fill row ``k`` of each array in ``targets`` from question ``k``.

    ``targets`` is a sequence of three preallocated 2-D arrays receiving the
    vector of the first, second and third word respectively. If any of the
    three words is missing from ``word_vectors`` (KeyError), the row is copied
    from the previous row; row 0 has no predecessor and is left as allocated.
    Failures are printed when ``report_failures`` is true.

    Returns the list of token counts, one per question.
    """
    lengths = []
    for row, words in enumerate(questions):
        try:
            vectors = [word_vectors[words[0]],
                       word_vectors[words[1]],
                       word_vectors[words[2]]]
        except KeyError:
            if row > 0:
                for target in targets:
                    target[row] = target[row - 1]
            if report_failures:
                print('fail', words)
        else:
            for target, vector in zip(targets, vectors):
                target[row] = vector
        lengths.append(len(words))
    return lengths


# Only look at questions with more than three tokens; shuffle and take the
# first half of each class for training.
qs_yn = [val for val in qs_yn if len(val) > 3]
shuffle(qs_yn)
qs_yn_sample = qs_yn[:int(len(qs_yn)/2)]

qs_oe = [val for val in qs_oe if len(val) > 3]
shuffle(qs_oe)
qs_oe_sample = qs_oe[:int(len(qs_oe)/2)]

# Same number of rows for both classes (50/50 mix), never more than either
# sample can provide.
n_rows = min(len(qs_yn_sample), len(qs_oe_sample))

# NOTE(review): 100 is the row width used by this cell's original (disabled)
# preallocation; it assumes QmodelB vectors have 100 components -- confirm.
VEC_DIM = 100

# Preallocate the per-word feature matrices. Indexed assignment below needs
# real arrays; the empty lists used before raised IndexError on the first row.
all_yn_arrays = np.zeros((n_rows, VEC_DIM))
all_oe_arrays = np.zeros((n_rows, VEC_DIM))
all_yn_arrays2 = np.zeros((n_rows, VEC_DIM))
all_oe_arrays2 = np.zeros((n_rows, VEC_DIM))
all_yn_arrays3 = np.zeros((n_rows, VEC_DIM))
all_oe_arrays3 = np.zeros((n_rows, VEC_DIM))
# Fourth-word features are not filled in this variant and stay all-zero.
all_yn_arrays4 = np.zeros((n_rows, VEC_DIM))
all_oe_arrays4 = np.zeros((n_rows, VEC_DIM))

all_yn_labels = np.ones(n_rows)    # yes/no questions are class 1
all_oe_labels = np.zeros(n_rows)   # open-ended questions are class 0
all_yn_labels2 = np.zeros(n_rows)
all_oe_labels2 = np.zeros(n_rows)
all_yn_labels3 = np.zeros(n_rows)
all_oe_labels3 = np.zeros(n_rows)
all_yn_labels4 = np.zeros(n_rows)
all_oe_labels4 = np.zeros(n_rows)

# Yes/no failures are silent, open-ended failures are printed.
len_yn = _fill_three_word_rows(
    qs_yn_sample[:n_rows], (all_yn_arrays, all_yn_arrays2, all_yn_arrays3), QmodelB, False)
len_oe = _fill_three_word_rows(
    qs_oe_sample[:n_rows], (all_oe_arrays, all_oe_arrays2, all_oe_arrays3), QmodelB, True)

# The first N rows of each class go to training, the remainder to testing.
# N is derived from this cell's own row count rather than a stale Nyn.
ratio = 0.7
N = int(n_rows * ratio)

len_yn = np.array(len_yn)
len_oe = np.array(len_oe)

train_arrays = np.vstack((all_yn_arrays[:N], all_oe_arrays[:N]))
len_train = np.hstack((len_yn[:N], len_oe[:N]))
len_train = len_train.reshape(len(len_train), 1)
train_labels = np.hstack((all_yn_labels[:N], all_oe_labels[:N]))
train_arrays2 = np.vstack((all_yn_arrays2[:N], all_oe_arrays2[:N]))
train_arrays3 = np.vstack((all_yn_arrays3[:N], all_oe_arrays3[:N]))
train_arrays4 = np.vstack((all_yn_arrays4[:N], all_oe_arrays4[:N]))

test_arrays = np.vstack((all_yn_arrays[N:], all_oe_arrays[N:]))
len_test = np.hstack((len_yn[N:], len_oe[N:]))
len_test = len_test.reshape(len(len_test), 1)
test_labels = np.hstack((all_yn_labels[N:], all_oe_labels[N:]))
test_arrays2 = np.vstack((all_yn_arrays2[N:], all_oe_arrays2[N:]))
test_arrays3 = np.vstack((all_yn_arrays3[N:], all_oe_arrays3[N:]))
test_arrays4 = np.vstack((all_yn_arrays4[N:], all_oe_arrays4[N:]))
