**Notebook Objective:**

The objective of this notebook is to look at the different pretrained embeddings provided with the dataset and to see how useful they are in the model-building process. 

First, let us import the necessary modules and read the input data.

In [None]:
#############CNN##############
import numpy as np
import pandas as pd
import gc
import keras 
from keras.models import Model
from sklearn.utils.class_weight import compute_class_weight 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, CuDNNLSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
from keras.engine.topology import Layer
from sklearn.model_selection import train_test_split

In [None]:
# Read the tab-separated file, which has no header row, and then name its five columns.
data = pd.read_csv('../input/ai-data/data.tsv', sep='\t', header=None)
data.columns = ['query_id', 'query', 'passage', 'label', 'passage_id']

In [None]:
# Load train.csv from the movie-title-analysis input folder.
# NOTE(review): this rebinds `data`, so the frame read from data.tsv in the previous
# cell is no longer reachable under that name — confirm that is intended.
data=pd.read_csv("../input/movie-title-analysis/train.csv")

In [None]:

# Hold out 10% of the rows for validation. The target is the 'Categories' column and
# every other column is kept as a feature. A fixed random_state makes the partition
# identical on every Restart & Run All, so scores are comparable between runs.
train_X, val_X, train_y, val_y = train_test_split(
    data.drop('Categories', axis=1),
    data['Categories'],
    test_size=0.10,
    random_state=42,
)


In [None]:
## Load the GloVe vectors into a dictionary: every word is a key and its embedding
## (a float32 NumPy array) is the value.
glove_embeddings={}
# The context manager closes the handle even if parsing a line raises, and the explicit
# encoding keeps the read independent of the platform's default codec.
with open("../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt", encoding="utf-8") as file:
    for line in file:
        # Each line is "<word> <v1> <v2> ...", separated by single spaces.
        tokens= line.split(" ")
        word = tokens[0]
        vec = tokens[1:]
        glove_embeddings[word]=np.asarray(vec, dtype='float32')

In [None]:
###### Movie overview analysis
## Turn every overview into a fixed-length sequence of token ids
embed_size = 300      # dimensionality of each word vector
max_features = 10000  # vocabulary cap (also the number of rows in the embedding matrix)
maxlen = 30           # number of tokens kept per overview

## Replace missing overviews with a placeholder string
train_X = train_X["overview"].fillna("_na_").values
val_X = val_X["overview"].fillna("_na_").values

## The vocabulary is fitted on the training texts only; both splits are then
## encoded with it and padded / truncated to `maxlen`
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
val_X = pad_sequences(tokenizer.texts_to_sequences(val_X), maxlen=maxlen)

## Row i of the matrix receives the GloVe vector of the word whose tokenizer index is i.
## Words without a GloVe entry, and unused rows, stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in tokenizer.word_index.items():
    if i < max_features:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector



In [None]:
# The embedding matrix has already been filled from the GloVe dictionary, so drop the
# dictionary and run a garbage-collection pass.
del glove_embeddings
gc.collect()

In [None]:
# Multiply the learning rate by 0.2 whenever val_loss fails to improve for one epoch.
# The floor is 1e-5: the previous floor of 0.001 equalled the Adam learning rate passed
# at compile time, so the callback had no room to lower it.
# NOTE(review): the callback only takes effect when passed to fit(..., callbacks=[reduce_lr]).
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                              patience=1, min_lr=1e-5)

In [None]:
def _lstm_tower(seq_len, vocab_size, pretrained_weights):
    """Build one text branch and return its (input tensor, output tensor) pair.

    The branch is: frozen pretrained embedding -> two stacked bidirectional
    CuDNN LSTMs (64 and 32 units, both returning sequences) -> global max
    pooling -> dropout(0.1) -> 64-unit ReLU dense layer.
    """
    tower_in = keras.layers.Input(shape=(seq_len,), dtype='float32')
    hidden = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size,
                                    weights=[pretrained_weights], trainable=False)(tower_in)
    hidden = keras.layers.Bidirectional(CuDNNLSTM(64, return_sequences=True))(hidden)
    hidden = keras.layers.Bidirectional(CuDNNLSTM(32, return_sequences=True))(hidden)
    hidden = keras.layers.GlobalMaxPool1D()(hidden)
    hidden = Dropout(0.1)(hidden)
    hidden = keras.layers.Dense(64, activation='relu')(hidden)
    return tower_in, hidden


# NOTE(review): the *_ques / *_sent names used below are not defined in any visible
# cell — confirm where they come from before running this cell on a fresh kernel.
inp1, x = _lstm_tower(maxlen_ques, max_features_ques, embedding_matrix_ques)
inp2, y = _lstm_tower(maxlen_sent, max_features_sent, embedding_matrix_sent)

# Combine the two branches by element-wise product, then classify into 2 classes.
merge = keras.layers.multiply([x, y])
dense = keras.layers.Dense(32, activation='relu')(merge)
out = Dense(2, activation='softmax')(dense)
model = Model(inputs=[inp1, inp2], outputs=out)

In [None]:
# Single-input classifier over the padded token-id sequences:
# GloVe-initialised embedding -> BiLSTM(64, sequences) -> BiLSTM(32, last state)
# -> Dense(64, relu) -> 4-way softmax.
inp1 = keras.layers.Input(shape=(maxlen,), dtype='float32')
x = keras.layers.Embedding(
    input_dim=max_features,
    output_dim=embed_size,
    weights=[embedding_matrix],
)(inp1)
# Only the first recurrent layer hands the full sequence on to the next one.
for units, keep_sequence in ((64, True), (32, False)):
    x = keras.layers.Bidirectional(CuDNNLSTM(units, return_sequences=keep_sequence))(x)
x = keras.layers.Dense(64, activation='relu')(x)
out = Dense(4, activation='softmax')(x)
model = Model(inputs=inp1, outputs=out)

In [None]:
# Print the layer-by-layer summary of the model currently bound to `model`.
model.summary()

In [None]:
# One-hot encode both splits against the SAME ordered class list. Calling get_dummies on
# each split independently yields fewer (or misaligned) columns whenever a class is
# absent from the small validation split, and the targets then no longer match the
# softmax output.
class_labels = sorted(pd.concat([pd.Series(train_y), pd.Series(val_y)]).dropna().unique())
train_targets = pd.get_dummies(pd.Categorical(train_y, categories=class_labels))
val_targets = pd.get_dummies(pd.Categorical(val_y, categories=class_labels))

model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): `reduce_lr` is defined in an earlier cell but is not passed here — confirm
# whether it should be supplied through `callbacks=`.
model.fit(x=train_X, y=train_targets, batch_size=32, epochs=100,
          validation_data=(val_X, val_targets))

In [None]:
# Mean reciprocal rank of the positive passages, ranked within each query by the
# predicted probability of class 1.
# NOTE(review): val_X_ques, val_X_sent and val are not defined in any visible cell, and
# the most recently built `model` takes a single input — confirm this cell's prerequisites.
pred = model.predict([val_X_ques, val_X_sent], batch_size=512, verbose=1)[:, 1]

# Explicit copies keep the column assignments below from writing into a slice of `val`
# (which raises SettingWithCopyWarning and may silently modify a temporary).
test_data1 = val.loc[:, ['query_id', 'label']].copy()
test_data1['pred'] = pred
# Rank 1 is the highest score within the query; ties share their average rank.
test_data1['rank1'] = test_data1.groupby('query_id')['pred'].rank(ascending=False)

eval_data = test_data1.loc[test_data1['label'] == 1, :].copy()
eval_data['score'] = 1 / eval_data['rank1']
eval_data['score'].mean()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,MultinomialNB
import lightgbm as lgbm
import xgboost as xgb

In [None]:
# Extract the label column of each split as a raw array.
# NOTE(review): `train` and `val` are not created in any visible cell — confirm their origin.
train_y, val_y = train['label'].values, val['label'].values

In [None]:
# TF-IDF features of the passages. English stop words are removed, along with terms
# that occur in more than 90% of the documents or in fewer than 2 documents.
tfidf_vect = TfidfVectorizer(
    stop_words='english',
    max_df=0.9,
    min_df=2,
)
# The vocabulary and IDF weights are learned on the training passages only and then
# reused to transform the validation passages.
tfidf_train = tfidf_vect.fit_transform(train['passage'])
tfidf_val = tfidf_vect.transform(val['passage'])

In [None]:
# Gradient-boosted trees on the TF-IDF features: 1000 estimators, each limited to depth 5.
lgbm1 = lgbm.LGBMClassifier(
    silent=False,
    max_depth=5,
    n_estimators=1000,
)
lgbm1.fit(tfidf_train, train_y)

In [None]:
# Mean reciprocal rank of the positive passages, ranked within each query by the
# LightGBM probability of class 1.
pred = lgbm1.predict_proba(tfidf_val)[:, 1]

# Explicit copies keep the column assignments below from writing into a slice of `val`
# (which raises SettingWithCopyWarning and may silently modify a temporary).
test_data1 = val.loc[:, ['query_id', 'label']].copy()
test_data1['pred'] = pred
# Rank 1 is the highest score within the query; ties share their average rank.
test_data1['rank1'] = test_data1.groupby('query_id')['pred'].rank(ascending=False)

eval_data = test_data1.loc[test_data1['label'] == 1, :].copy()
eval_data['score'] = 1 / eval_data['rank1']
eval_data['score'].mean()