## TF-IDF LAC 2017 (concise case)

Modelling the concise case of LAC 2017, using TF-IDF as the word embedding.

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
import ast
# from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib



In [2]:
# for model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import model_from_json
from keras.callbacks import CSVLogger


Using TensorFlow backend.


In [3]:
# Filesystem locations, relative to the notebook's working directory.
# NOTE(review): DATA_PATH is rooted one level up ("../") while the other two
# paths are rooted two levels up ("../../"), and no visible cell uses DATA_PATH
# -- confirm which root is intended.
DATA_PATH = "../Data/Raw/"
# Tokenized train/test CSVs are read from here; training logs are written here.
# The stray doubled slash ("../..//Data/...") has been normalized.
DATA_PATH2 = "../../Data/Manipulation/"
# Fitted vectorizers and trained networks are saved here.
MODEL_PATH = "../../Models/"

### TF-IDF Vectorization

In [4]:
# Load the pre-tokenized train and test splits from the manipulation folder.
dta_tr, dta_te = (
    pd.read_csv(DATA_PATH2 + csv_name)
    for csv_name in (
        "data_train_tokenize_concise_only2_train.csv",
        "data_train_tokenize_concise_only2_test.csv",
    )
)

In [5]:
# token_short_des holds a stringified Python list of tokens; parse it and join
# the tokens with single spaces so the vectorizer receives plain text.
for frame in (dta_tr, dta_te):
    frame["true_text"] = frame["token_short_des"].apply(
        lambda raw: " ".join(ast.literal_eval(raw))
    )

In [6]:
# Plain Python lists of the joined documents (train first, then test).
docs, docs_te = (frame["true_text"].tolist() for frame in (dta_tr, dta_te))

In [7]:
# Peek at the first two training documents to confirm the join produced plain text.
docs[:2]

['formulated oil free hydrating botanicals remarkably improves skin texture abused hands restores soft smooth refined hands',
 '150cm mini microphone compatible iphone various smartphones also ipad apple computer macbook dual headed design allows two people using simultaneously features high sensitivity omni directional sounds output perfect audio video recording 3 5mm standard connector jack convenient clip design clip collar 3 5mm standard connector jack convenient clip design clip collar']

In [8]:
%%time
# Fit a TF-IDF vectorizer on the training documents only, with the vocabulary
# capped at 75 terms via max_features (this sets the model's input width).
tfidf_trans_75=TfidfVectorizer(max_features=75)
tfidf_trans_75.fit(docs)

Wall time: 1.3 s


In [9]:
%%time
# Same as the 75-term vectorizer, but with the vocabulary capped at 150 terms.
tfidf_trans_150=TfidfVectorizer(max_features=150)
tfidf_trans_150.fit(docs)

Wall time: 1.38 s


In [10]:
%%time
# Same as the 75-term vectorizer, but with the vocabulary capped at 225 terms.
tfidf_trans_225=TfidfVectorizer(max_features=225)
tfidf_trans_225.fit(docs)

Wall time: 1.41 s


In [11]:
# Sanity check: vectorize only the first training document with the 75-term vectorizer.
A=tfidf_trans_75.transform([docs[0]])

In [12]:
# Convert the single transformed row to a dense array so the weights can be read directly.
A.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57384093, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.607389  , 0.        , 0.54934979,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [13]:
# Persist the three fitted vectorizers under MODEL_PATH for later reuse.
# The cell's displayed value is the return value of the last dump call.
joblib.dump(tfidf_trans_75,MODEL_PATH+"tfidf75_transform.pkl")
joblib.dump(tfidf_trans_150,MODEL_PATH+"tfidf150_transform.pkl")
joblib.dump(tfidf_trans_225,MODEL_PATH+"tfidf225_transform.pkl")

['../../Models/tfidf225_transform.pkl']

### Modeling Part

In [28]:
# Training hyper-parameters shared by all three models.
# Passed to fit() as `epochs`.
EPOCH=8
# Passed to fit() as `batch_size`.
# NOTE(review): the reason for the specific value 590 is not recorded -- document it if it matters.
BATCH=590

In [29]:
def cur_model(input_size, dropout_rate=0.8):
    """Build and compile the dense binary classifier fed with TF-IDF vectors.

    Architecture: Dense(200) x2 -> Dropout -> Dense(70) x2 -> Dropout ->
    Dense(15) x2 -> Dense(1, sigmoid), with ReLU on every hidden layer.
    The layer summary is printed as a side effect.

    Parameters
    ----------
    input_size : int
        Width of the input vectors (the vectorizer's ``max_features``).
    dropout_rate : float, optional
        Rate passed to both Dropout layers. Defaults to 0.8, the value that
        was previously hard-coded.

    Returns
    -------
    keras.models.Sequential
        The model compiled with binary cross-entropy loss, the Adam optimizer
        and accuracy as the reported metric.
    """
    model = Sequential()
    # Only the first layer needs the input dimension; later layers infer
    # their input shape from the previous layer's output.
    model.add(Dense(200, input_dim=input_size, activation='relu'))
    model.add(Dense(200, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(70, activation='relu'))
    model.add(Dense(70, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [30]:
# Training labels: the "conci" column of the training frame.
# NOTE(review): presumably a 0/1 flag, since the model ends in a sigmoid trained
# with binary cross-entropy -- verify against the data.
Y_true=dta_tr["conci"]

In [31]:
# Training feature matrices, one per vocabulary size (75 / 150 / 225 terms).
X75, X150, X225 = (
    vectorizer.transform(dta_tr["true_text"])
    for vectorizer in (tfidf_trans_75, tfidf_trans_150, tfidf_trans_225)
)

In [None]:
# Build the network for the 75-feature TF-IDF input
# (cur_model also prints the layer summary shown below).
model75=cur_model(75)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 200)               15200     
_________________________________________________________________
dense_23 (Dense)             (None, 200)               40200     
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 70)                14070     
_________________________________________________________________
dense_25 (Dense)             (None, 70)                4970      
_________________________________________________________________
dropout_8 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 15)                1065      
__________

In [None]:
%%time
# Train the 75-feature model; the CSVLogger callback writes to log75.csv under DATA_PATH2.
# NOTE(review): append=True adds rows to an existing log75.csv, so re-running
# this cell mixes several runs in one file -- confirm that is intended.
csv_logger = CSVLogger(DATA_PATH2+'log75.csv', append=True, separator=',')
# verbose=0 silences the progress output, leaving the CSV file as the training record.
model75.fit(X75, Y_true, epochs=EPOCH, batch_size=BATCH,verbose=0,callbacks=[csv_logger])

In [None]:
# Build the network for the 150-feature TF-IDF input
# (cur_model also prints its layer summary).
model150=cur_model(150)

In [None]:
%%time
# Train the 150-feature model; the CSVLogger callback writes to log150.csv under DATA_PATH2.
# NOTE(review): append=True adds rows to an existing log150.csv, so re-running
# this cell mixes several runs in one file -- confirm that is intended.
csv_logger = CSVLogger(DATA_PATH2+'log150.csv', append=True, separator=',')
# verbose=0 silences the progress output, leaving the CSV file as the training record.
model150.fit(X150, Y_true, epochs=EPOCH, batch_size=BATCH,verbose=0,callbacks=[csv_logger])

In [None]:
# Build the network for the 225-feature TF-IDF input
# (cur_model also prints its layer summary).
model225=cur_model(225)

In [None]:
%%time
# Train the 225-feature model; the CSVLogger callback writes to log225.csv under DATA_PATH2.
# NOTE(review): append=True adds rows to an existing log225.csv, so re-running
# this cell mixes several runs in one file -- confirm that is intended.
csv_logger = CSVLogger(DATA_PATH2+'log225.csv', append=True, separator=',')
# verbose=0 silences the progress output, leaving the CSV file as the training record.
model225.fit(X225, Y_true, epochs=EPOCH, batch_size=BATCH,verbose=0,callbacks=[csv_logger])

In [None]:
# Persist every trained network: architecture as JSON, weights as HDF5.
trained_nets = (model75, model150, model225)
file_stems = ("tfidf_model75", "tfidf_model150", "tfidf_model225")

for net, stem in zip(trained_nets, file_stems):
    target = MODEL_PATH + stem
    # Architecture only; the weights are stored separately below.
    with open(target + ".json", "w") as handle:
        handle.write(net.to_json())
    net.save_weights(target + "_weight.h5")
    print("Saved model to disk")

### Evaluate

In [None]:
# Accuracy of each model on the data it was trained on (order: 75, 150, 225).
train_matrices = (X75, X150, X225)
for net, features in zip((model75, model150, model225), train_matrices):
    # evaluate() returns [loss, accuracy]; only the accuracy is reported.
    train_acc = net.evaluate(features, Y_true, verbose=0)[1]
    print('Train Accuracy: %f' % (train_acc*100))

In [None]:
# Test feature matrices, produced by the vectorizers fitted on the training data.
X75_te, X150_te, X225_te = (
    vectorizer.transform(dta_te["true_text"])
    for vectorizer in (tfidf_trans_75, tfidf_trans_150, tfidf_trans_225)
)

# Test labels, taken from the same column as the training labels.
Y_test = dta_te["conci"]

In [None]:
# Accuracy of each model on the held-out test split (order: 75, 150, 225).
test_matrices = (X75_te, X150_te, X225_te)
for net, features in zip((model75, model150, model225), test_matrices):
    # evaluate() returns [loss, accuracy]; only the accuracy is reported.
    test_acc = net.evaluate(features, Y_test, verbose=0)[1]
    print('Test Accuracy: %f' % (test_acc*100))

In [27]:
# 6 630