In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import pickle
import xgboost as xgb

# Part 1: Using the Quora Question Pairs Dataset to Train a Semantic Similarity Model

In [2]:
# Load the Quora question-pair train and test tables straight from the zipped CSVs
# (pandas infers the compression from the .zip extension).
train_data = pd.read_csv('../input/quora-question-pairs/train.csv.zip')
test_data = pd.read_csv('../input/quora-question-pairs/test.csv.zip')

In [3]:
# (rows, columns) of the training frame as loaded, before any subsampling.
train_data.shape

In [4]:
# Keep a random 40% of the training pairs.
# random_state pins the draw so a fresh Restart & Run All selects the same rows
# (same seed value the train/test split further down uses).
# drop=True throws the shuffled row labels away; without it reset_index would
# insert them as an extra 'index' column that no later step uses.
train_data = train_data.sample(n = int(0.4*len(train_data)), random_state = 61221)
train_data.reset_index(drop = True, inplace = True)

In [5]:
# (rows, columns) after the 40% subsample.
train_data.shape

In [6]:
# First rows of the subsampled training frame.
train_data.head()

In [7]:
# Sum of the is_duplicate column (the number of positive pairs, assuming 0/1 labels -- TODO confirm).
train_data.is_duplicate.sum()

In [8]:
# Remove, in place, every row that has a missing value in any column.
# NOTE(review): row labels keep their gaps after this; the training frame is only
# re-indexed again after the zero-token rows are dropped further down.
train_data.dropna(axis = 0, inplace = True)
test_data.dropna(axis = 0, inplace = True)

In [9]:
# Per-column count of remaining missing values.
train_data.isnull().sum()

In [10]:
# Character class matching each punctuation mark listed in the literal below plus
# carriage return, tab and newline; preprocess_data replaces every match with a space.
regular_expression = re.compile('[' + re.escape('!@#$^&*\'()+=-_,./:;<>?"[\\]^_`{|}~')+'\\r\\t\\n]')
# NLTK's English stopwords, held in a frozenset: the only use is the per-token
# membership test in preprocess_data, which is a hash lookup on a set but a
# linear scan on the list that stopwords.words() returns.
stopwords_eng = frozenset(stopwords.words('english'))

In [11]:
def preprocess_data(data, remove_characters = regular_expression, stopwords_list = None):
    """Lower-case a document, blank out unwanted characters and tokenise it.

    Parameters
    ----------
    data : str
        The raw document.
    remove_characters : re.Pattern
        Compiled pattern; every match is replaced by a single space before the
        text is split on whitespace.
    stopwords_list : collection of str, optional
        Tokens to discard. ``None`` (the default) disables stopword filtering.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.

    Raises
    ------
    AssertionError
        If ``data`` is not a string or ``remove_characters`` is not a compiled
        regex pattern.
    """
    assert isinstance(data, str), "Data is not a string"
    assert isinstance(remove_characters, re.Pattern), "Characters to remove are not a regex Pattern object."

    # The declared default (None) previously reached `word not in None` and
    # raised TypeError; treat it as "no stopwords" instead.
    if stopwords_list is None:
        stopwords_list = ()

    tokens = remove_characters.sub(' ', data.lower()).split()
    return [word for word in tokens if word not in stopwords_list]
    
    
    
    

In [12]:
def find_common_words(record):
    """Return the set of distinct tokens present in both preprocessed questions.

    ``record`` is any mapping (e.g. a DataFrame row) exposing the token
    sequences under 'preprocessed_q1' and 'preprocessed_q2'.
    """
    first_tokens, second_tokens = (
        set(record[column]) for column in ('preprocessed_q1', 'preprocessed_q2')
    )
    return first_tokens.intersection(second_tokens)

In [13]:
def find_total_words(record):
    """Return the number of distinct tokens in question 1 plus that in question 2.

    Tokens shared by both questions are counted once per question, i.e. twice
    in the total.
    """
    return sum(
        len(set(record[column]))
        for column in ('preprocessed_q1', 'preprocessed_q2')
    )
    

In [14]:
def find_shared_ratio(record):
    """Return the share of tokens the two questions have in common.

    Computed as ``len(record['shared_words']) / record['total_words']``.

    Parameters
    ----------
    record : mapping
        Must expose 'shared_words' (a sized collection) and 'total_words'
        (a number).

    Returns
    -------
    float
        The ratio, or 0.0 when ``total_words`` is 0 (both questions empty
        after preprocessing), where the plain division would raise
        ZeroDivisionError or produce NaN.
    """
    total_words = record['total_words']
    if total_words == 0:
        return 0.0
    return len(record['shared_words']) / total_words

In [15]:
# How often each question id occurs across both id columns of the current frame.
counts = pd.concat(
    [train_data['qid1'], train_data['qid2']],
    ignore_index = True,
).value_counts()

In [16]:
# Show the raw second question of the row labelled 0.
train_data.loc[0, 'question2']

In [17]:
# Tokenise both question columns with the English stopword list. The keyword
# argument is forwarded to preprocess_data by Series.apply.
train_data['preprocessed_q1'] = train_data['question1'].apply(
    preprocess_data, stopwords_list = stopwords_eng
)
print("Q1 done")

train_data['preprocessed_q2'] = train_data['question2'].apply(
    preprocess_data, stopwords_list = stopwords_eng
)
print("Q2 done")

In [18]:
from gensim.models import Word2Vec

In [19]:
# Second text source; its 'preprocessed_q1' column is later added to the embedding corpus.
db = pd.read_csv('../input/stackexchange/database.csv')

In [20]:
# First rows of the StackExchange table.
db.head()

In [21]:
# Turn each cell's text back into a list of tokens: trim the surrounding
# brackets, delete quotes and spaces, split on commas.
# (Presumably the CSV holds the printed form of Python lists -- TODO confirm.)
# Empty pieces are dropped so that "[]" becomes [] rather than [''], which
# would otherwise feed an empty-string token into the embedding vocabulary.
# Bracket assignment is used because attribute-style assignment only works for
# columns that already exist.
db['preprocessed_q1'] = db['preprocessed_q1'].apply(
    lambda text: [
        token
        for token in re.sub("[' ]", '', text.strip('][')).strip().split(',')
        if token
    ]
)

In [22]:
# Parsed token list of the row labelled 0, as a sanity check of the conversion above.
db.preprocessed_q1[0]

In [23]:
# Training frame, now including the two token-list columns.
train_data.head()

In [24]:
from gensim.models import FastText

In [25]:
# Sentences for the embedding model: every tokenised Quora question plus the
# StackExchange token lists. They are kept under their own name; binding this
# list to `train_data` replaced the DataFrame that every later feature cell
# indexes by column, so the notebook could not run top to bottom.
embedding_corpus = list(train_data['preprocessed_q1']) + list(train_data['preprocessed_q2']) + db['preprocessed_q1'].tolist()

# 32-dimensional FastText vectors, context window of 5, and min_count=1 so
# that no token is dropped from the vocabulary for being rare.
model = FastText(vector_size=32, window=5, min_count=1)
model.build_vocab(embedding_corpus)
model.train(embedding_corpus, total_examples=len(embedding_corpus), epochs=10)

In [30]:
# model = Word2Vec(list(train_data['preprocessed_q1']) + list(train_data['preprocessed_q2']) + list(db['preprocessed_q1']), window = 5, vector_size = 32, epochs = 10, min_count = 1)

In [31]:
# Inspect the StackExchange token list at position 300. This is the element the
# original expression selected (offset len(q1) + len(q2) + 300 in the
# concatenated corpus), read directly instead of rebuilding the whole corpus,
# and without depending on what `train_data` is bound to at this point.
db['preprocessed_q1'].iloc[300]

In [32]:
# Look up the vector of a token that begins with a zero-width space (U+200B).
# NOTE(review): presumably such tokens come from the raw text and are not
# stripped by the punctuation regex -- confirm whether they should be.
model.wv['\u200brename']

In [34]:
#model.save('word2vecmodel.model')
# Persist the trained FastText model to the working directory.
model.save('fasttextmodel.model')

In [43]:
# Three random rows as a spot check (unseeded, so the rows differ per run).
# This requires train_data to still be a DataFrame at this point.
train_data.sample(3)

In [44]:
# Row-wise overlap features: the set of tokens common to both questions, and
# the sum of the two questions' distinct-token counts.
train_data['shared_words'] = train_data.apply(find_common_words, axis = 1)
train_data['total_words'] =  train_data.apply(find_total_words, axis = 1)


In [45]:
# Row labels of pairs where both questions have no tokens left after preprocessing.
to_drop = train_data[train_data['total_words'] == 0].index

In [46]:
# Remove the token-less pairs, then renumber the rows 0..n-1 so the frame lines
# up positionally with the per-document vector tables built later.
# drop=True discards the old labels; without it reset_index inserts them as an
# extra 'index' (or 'level_0') column every time it runs.
train_data = train_data.drop(to_drop, axis = 0)
train_data.reset_index(drop = True, inplace = True)

In [47]:
# Shared-token ratio per pair; rows with total_words == 0 were removed beforehand.
train_data['shared_ratio'] = train_data.apply(find_shared_ratio, axis = 1)

In [48]:
# Frequency feature: how many times each question id occurs, looked up by
# label in `counts` (an unknown id raises KeyError).
train_data['countq1'] = counts.loc[train_data['qid1']].to_numpy()
train_data['countq2'] = counts.loc[train_data['qid2']].to_numpy()

TF-IDF Vectorizer (disabled; averaged FastText word vectors are used as the document features instead)

In [49]:
# from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
# vectorizer = TfidfVectorizer()

In [51]:
# tf_idf_tokens_q1 = vectorizer.fit_transform(train_data['preprocessed_q1'])
# tf_idf_tokens_q2 = vectorizer.fit_transform(train_data['preprocessed_q2'])

In [53]:
def _mean_document_vectors(documents, keyed_vectors):
    """Average the word embeddings of every tokenised document.

    Parameters
    ----------
    documents : iterable of sequences of str
        One token sequence per document.
    keyed_vectors : mapping-like
        Supports ``keyed_vectors[token]`` returning a 1-D vector and exposes
        the vector length as ``vector_size``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one integer-labelled column per embedding
        dimension. A document without tokens yields a row of NaN.
    """
    documents = list(documents)
    dimension = keyed_vectors.vector_size
    # Fill one pre-allocated matrix instead of growing DataFrames row by row:
    # DataFrame.append is quadratic here and no longer exists in pandas >= 2.0.
    matrix = np.full((len(documents), dimension), np.nan)
    for row, document in enumerate(documents):
        if len(document) > 0:
            matrix[row] = np.mean([keyed_vectors[word] for word in document], axis=0)
    return pd.DataFrame(matrix)


document_vectors_q1 = _mean_document_vectors(train_data['preprocessed_q1'], model.wv)
print("Done,", document_vectors_q1.shape)

document_vectors_q2 = _mean_document_vectors(train_data['preprocessed_q2'], model.wv)
    
    

In [None]:
# tf_idf_tokens_q1.todense()

In [None]:
# Wrap the per-document vectors for each question side in DataFrames named q1 / q2.
q1 = pd.DataFrame(document_vectors_q1)
q2 = pd.DataFrame(document_vectors_q2)

In [None]:
# Hand-crafted columns that go into the model next to the embedding columns.
# NOTE(review): qid1 / qid2 are included as features here; they look like
# identifiers, not measurements -- confirm this is intended.
numeric_features = train_data.loc[:, ['shared_ratio','countq1', 'countq2', 'qid1', 'qid2', 'total_words']]
numeric_features.shape

In [None]:
# Replace missing embedding values (e.g. from questions with no tokens) with the sentinel -9999.
# NOTE(review): this sentinel is also fed to the GaussianNB model below, which
# will treat it as a real value -- confirm this is acceptable.
q1.fillna(-9999, inplace = True)
q2.fillna(-9999, inplace = True)

In [None]:
# Prefix the embedding columns with the question side so that q1 and q2 stay
# distinguishable once concatenated. Each frame is renamed from its OWN column
# labels: deriving q2's names from the already-renamed q1.columns produced
# 'q2_q1_0', 'q2_q1_1', ... instead of 'q2_0', 'q2_1', ...
q1.columns = ['q1_'+ str(colname) for colname in q1.columns]
q2.columns = ['q2_'+ str(colname) for colname in q2.columns]

In [None]:
# Feature matrix: q1 embedding columns, q2 embedding columns and the numeric
# features side by side. concat(axis=1) aligns on row labels, so all three
# frames must share the same index.
X = pd.concat((q1, q2, numeric_features), axis = 1)
# Target: the duplicate flag of each pair.
Y = train_data['is_duplicate']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the pairs for evaluation; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size = 0.25, random_state = 61221)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Bagged ensemble of 15 Gaussian naive Bayes models, seeded for repeatability.
# The base model is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name removed
# in 1.4, while the positional form works on both old and new releases.
classifier = BaggingClassifier(GaussianNB(), n_estimators = 15, random_state = 61221).fit(xtrain, ytrain)

# Testing

In [None]:
# Class predictions of the bagged classifier on the held-out pairs.
ypred = classifier.predict(xtest)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-class precision / recall / F1 on the held-out pairs.
print(classification_report(ytest, ypred))

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
confusion_matrix(ytest, ypred)

In [None]:
# Serialise the fitted classifier. The context manager closes (and flushes)
# the file deterministically; the bare open() call left the handle dangling.
with open('model_semantic_similarity.pickle', 'wb') as model_file:
    pickle.dump(classifier, model_file)


XGBoost

In [None]:
# First rows of the training features.
xtrain.head()

In [None]:
# First training labels.
ytrain.head()

In [None]:
# First rows of the held-out features.
xtest.head()

In [None]:
# XGBoost's own container for the training features and labels.
training_data = xgb.DMatrix(xtrain, label = ytrain)

# Booster settings: depth-2 trees, learning rate 1, logistic objective for the
# binary target, AUC as the evaluation metric.
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}

In [None]:
# Number of boosting rounds.
num_round = 10
# Fit the booster on the training matrix with the parameters defined above.
best = xgb.train(param, training_data, num_round)

In [None]:
# Write the trained booster to the working directory.
best.save_model('xgbooster.model')

In [None]:
# Wrap the held-out features for XGBoost and score them.
# NOTE(review): this rebinds `test_data`, which held the Kaggle test DataFrame
# loaded at the top; after this cell the early cells using it no longer
# re-run -- consider a distinct name.
test_data = xgb.DMatrix(xtest)
ypred = best.predict(test_data)

In [None]:
# Round the booster's scores to whole numbers so they can be compared with the labels.
ypred = np.round(ypred)

In [None]:
# Display the held-out labels for a visual comparison with the predictions.
ytest

In [None]:
# Per-class precision / recall / F1 of the rounded XGBoost predictions.
print(classification_report(ytest, ypred))