In [None]:

import sys
import os
import time
import numpy as np
import pandas as pd
import pickle
import re
from collections import defaultdict
import random
import random
import csv
import sqlite3 as lite
import unicodedata
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw TSV into a SQLite table in chunks so the whole file never has
# to fit in memory. Rows are appended, so re-running this cell against an
# existing database adds the same rows again.
start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

chunk_size = 1000000
tsv_path = "../res/data.tsv"
# The columns in the TSV source (the file itself has no header row).
header_names = ["q_id","question","answer","label","a_id"]

# Will be created if it does not exist.
db_path  = "data.db"
table_name = "dataset"

con = None

try:

    file_read = 0
    con = lite.connect(db_path)

    for chunk in pd.read_csv(tsv_path,sep='\t',chunksize=chunk_size, header=None, names=header_names, encoding='utf8'):
        chunk.to_sql(name=table_name, con=con, if_exists='append', index=False)
        # Report progress only once the chunk has actually been written.
        file_read+=chunk.shape[0]
        print(file_read, "\t Done")

except lite.Error as e:
    print("SQLite Error")
    # Show the underlying message, as the later cells do.
    print(e)
    sys.exit(1)

finally:
    if con:
        con.close()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# BM25 is only a point to start with.
# Ideally the samples should be selected in a more sophisticated manner.
start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

docIDFDict_file    = "../res/docIDFDict.pickle"
avgDocLength_file  = "../res/avgDocLength.pickle"

# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that were produced by a trusted run of this project.
# `with` guarantees the files are closed even if unpickling fails.
with open(docIDFDict_file, 'rb') as fileObject:
    docIDFDict = pickle.load(fileObject)

with open(avgDocLength_file, 'rb') as fileObject:
    avgDocLength = pickle.load(fileObject)

def GetBM25Score(Query, Passage, k1=1.5, b=0.75, delimiter=' ', idf=None, avg_doc_length=None) :
    """Return the BM25 score of Passage with respect to Query.

    Both texts are reduced to lower-cased alphanumeric tokens (every run of
    other characters becomes a single space) and then split on `delimiter`.
    Only query terms that occur in the passage and have an entry in the IDF
    table contribute to the score.

    Parameters
    ----------
    Query, Passage : str
        Raw texts. A value that is not a string (e.g. None or NaN coming
        from a missing field) is treated as empty text.
    k1, b : float
        BM25 term-frequency saturation and length-normalisation parameters.
    delimiter : str
        Token separator applied after cleaning.
    idf : mapping of str to float, optional
        Term -> IDF weight. Defaults to the module-level `docIDFDict`.
    avg_doc_length : float, optional
        Average passage length in tokens. Defaults to the module-level
        `avgDocLength`.

    Returns
    -------
    The summed BM25 contribution of the matching terms; 0 when no query
    term matches.
    """
    # Fall back to the notebook-level statistics when none are passed in.
    if idf is None:
        idf = docIDFDict
    if avg_doc_length is None:
        avg_doc_length = avgDocLength

    def _tokenize(text):
        # Missing values have no tokens instead of crashing re.sub.
        if not isinstance(text, str):
            return []
        cleaned = re.sub(r"[^a-zA-Z0-9]+", ' ', text)
        # Drop empty tokens: ''.split(' ') yields [''], which would otherwise
        # count as a word shared by an empty query and an empty passage.
        return [token for token in cleaned.strip().lower().split(delimiter) if token]

    query_terms = set(_tokenize(Query))
    passage_words = _tokenize(Passage)
    passageLen = len(passage_words)

    # Count every passage token once instead of calling list.count per term.
    passage_tf = {}
    for token in passage_words:
        passage_tf[token] = passage_tf.get(token, 0) + 1

    commonWords = query_terms & set(passage_tf)
    if not commonWords:
        return 0

    # The length normalisation is the same for every term of this passage.
    length_norm = k1*(1 - b + b*passageLen/avg_doc_length)

    tmp_score = []
    for word in commonWords :
        if word in idf :
            numer = passage_tf[word] * (k1+1)    # numerator of the BM25 formula
            denom = passage_tf[word] + length_norm    # denominator of the BM25 formula
            tmp_score.append(idf[word] * numer / denom)
    return sum(tmp_score)


print("BM25 Initialized")

# Score every (question, answer) row of the source table with BM25, write the
# scored rows to a new table, then drop the source table.
chunk_size = 1000000
header_names = ["q_id","question","answer","label","a_id"]
bm25_header_names = ["q_id","question","answer","label","a_id","bm25_score"]
# Will be created if it does not exist.
db_path  = "data.db"
table_name = "dataset"
bm25_table_name = "dataset_bm25"

sql_read_query = "SELECT * FROM "+table_name
# Built from table_name so the read and the drop always target the same table.
sql_drop_query = "DROP TABLE " + table_name

# time.clock() was removed in Python 3.8. This timer covers the scoring
# pass of this block only.
scoring_start = time.perf_counter()

con = None
try:
    file_read = 0
    con = lite.connect(db_path)
    print("SQLite connection establshed")

    for chunk in pd.read_sql_query(sql=sql_read_query, con=con,chunksize=chunk_size):
        # reindex adds the (initially empty) bm25_score column in a fixed order.
        chunk = chunk.reindex(columns=bm25_header_names)
        chunk["bm25_score"] = chunk.apply(lambda x: GetBM25Score(x.question, x.answer), axis=1)
        chunk.to_sql(name=bm25_table_name, con=con, if_exists='append', index=False)
        file_read += chunk.shape[0]
        print(file_read, "\t Done")

    # The unscored table is only dropped after every chunk has been written.
    cursor = con.cursor()
    cursor.execute(sql_drop_query)
    con.commit()

except lite.Error as e:
    print("SQLite Error")
    print(e)
    sys.exit(1)

finally:
    if con:
        con.close()

print("--- %s seconds ---" % (time.perf_counter() - scoring_start))

In [None]:

# Copy the scored table into a new table ordered by question and, within a
# question, by ascending BM25 score; the unordered table is dropped afterwards.
start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
curr_time = time.perf_counter()

chunk_size = 100000 #because 10 answers for a question
bm25_header_names = ["q_id","question","answer","label","a_id","bm25_score"]
db_path  = "data.db"
rearranged_table_name = "dataset_rearranged"
table_name = "dataset_bm25"

# Only used for the ETA estimate printed after each chunk.
total_rows = 5241880

sql_read_query = "SELECT * FROM "+ table_name +" ORDER BY question , bm25_score"
sql_drop_query = "DROP TABLE " + table_name


con = None
try:
    file_read = 0
    con = lite.connect(db_path)
    print("SQLite connection established")

    for chunk in pd.read_sql_query(sql=sql_read_query, con=con,chunksize=chunk_size):
        file_read += chunk.shape[0]
        chunk.to_sql(name=rearranged_table_name, con=con, if_exists='append', index=False)
        # ETA = remaining chunks * duration of the chunk that just finished.
        time_taken = time.perf_counter() - curr_time
        print(file_read," Done\t Time taken : ",(time_taken), "\t ETA : ", (((total_rows-file_read)/chunk_size)*time_taken))
        curr_time = time.perf_counter()

    # Drop the source table only after the ordered copy is complete.
    cursor = con.cursor()
    cursor.execute(sql_drop_query)
    con.commit()

except lite.Error as e:
    print("SQLite Error")
    print(e)
    sys.exit(1)

finally:
    if con:
        con.close()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Pick the training rows for every group of 10 consecutive rows of the
# rearranged table: each row labelled 1 is taken twice, plus the first and
# the last of the remaining rows of the group. The selection is appended to
# a new table holding only question, answer and label.
start_time = time.perf_counter()  # was misspelled "tart_time", leaving a stale start_time
curr_time = time.perf_counter()   # time.clock() was removed in Python 3.8

chunk_size = 100000 #because 10 answers for a question
answers_per_question = 10  # rows per group; chunk_size is a multiple of it
bm25_header_names = ["q_id","question","answer","label","a_id","bm25_score"]
trimmed_header_names = ["question","answer","label"]
db_path  = "data.db"
rearranged_table_name = "dataset_rearranged"
selected_table_name = "dataset_selected2"

# Only used for the ETA estimate printed after each chunk.
total_rows = 5241880

sql_read_query = "SELECT * FROM "+ rearranged_table_name


con = None
try:
    file_read = 0
    con = lite.connect(db_path)
    print("SQLite connection established")

    for chunk in pd.read_sql_query(sql=sql_read_query, con=con,chunksize=chunk_size):
        file_read += chunk.shape[0]
        datalist = []

        # Plain positional slices: the same groups the previous
        # np.array_split call produced whenever the chunk length is a
        # multiple of 10, and still valid for a shorter final chunk.
        for group_start in range(0, chunk.shape[0], answers_per_question):
            chunklet = chunk.iloc[group_start:group_start + answers_per_question].reset_index(drop=True)
            is_positive = chunklet['label'] == 1

            # Look positives up by label on the untouched group. The previous
            # code used iloc after dropping rows, which points at the wrong
            # row as soon as a group holds more than one positive.
            for true_id in chunklet.index[is_positive]:
                # Each positive row is appended twice, as before.
                datalist.append(chunklet.loc[true_id])
                datalist.append(chunklet.loc[true_id])

            # First and last of the rows that are not labelled 1.
            negatives = chunklet[~is_positive]
            if not negatives.empty:
                datalist.append(negatives.iloc[0])
                datalist.append(negatives.iloc[-1])

        selected_chunk = pd.DataFrame(datalist, columns=bm25_header_names)[trimmed_header_names]
        selected_chunk.to_sql(name=selected_table_name, con=con, if_exists='append', index=False)

        # ETA = remaining chunks * duration of the chunk that just finished.
        time_taken = time.perf_counter() - curr_time
        print(file_read," Done\t Time taken : ",(time_taken), "\t ETA : ", (((total_rows-file_read)/chunk_size)*time_taken))
        curr_time = time.perf_counter()

    # The rearranged table is not dropped by this cell.

except lite.Error as e:
    print("SQLite Error")
    print(e)
    sys.exit(1)

finally:
    if con:
        con.close()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Export the selected rows to a headerless TSV, reducing question and answer
# text to plain ASCII. Rows are appended, so an existing output file grows on
# every run of this cell.
start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
curr_time = time.perf_counter()

chunk_size = 100000
trimmed_header_names = ["question","answer","label"]
db_path  = "data.db"
selected_table_name = "dataset_selected2"
tsv_op = "../res/dataset.tsv"

# Only used for the ETA estimate.
# NOTE(review): this is the same count the earlier cells use for the full
# table; if the selected table is smaller the ETA is overestimated - confirm.
total_rows = 5241880

sql_read_query = "SELECT * FROM "+ selected_table_name
encoding = "utf-8"


def _ascii_fold(text):
    """Return text NFD-normalised with every non-ASCII character removed.

    Values that are not strings (e.g. None/NaN from an empty field) are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode(encoding)


con = None
try:
    file_read = 0
    con = lite.connect(db_path)
    print("SQLite connection established")

    for chunk in pd.read_sql_query(sql=sql_read_query, con=con,chunksize=chunk_size):
        file_read += chunk.shape[0]

        # Work column-wise on a fresh frame instead of rebuilding it row by row.
        selected_chunk = chunk.reindex(columns=trimmed_header_names)
        for text_column in ("question", "answer"):
            selected_chunk[text_column] = selected_chunk[text_column].map(_ascii_fold)

        # Passing the path lets pandas open the file with the requested
        # encoding and its own newline handling.
        selected_chunk.to_csv(tsv_op, mode='a', sep='\t', encoding='utf-8', index=False, header = False)

        # ETA = remaining chunks * duration of the chunk that just finished.
        time_taken = time.perf_counter() - curr_time
        print(file_read," Done\t Time taken : ",(time_taken), "\t ETA : ", (((total_rows-file_read)/chunk_size)*time_taken))
        curr_time = time.perf_counter()

except lite.Error as e:
    print("SQLite Error")
    print(e)
    sys.exit(1)

finally:
    if con:
        con.close()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Draw a random fraction of every chunk of the exported TSV and append it to
# a smaller subset file (an existing subset file grows on every run).
fraction = 0.01
sample_seed = 42  # fixed seed so the subset is reproducible
start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
curr_time = time.perf_counter()

total_rows = 2096752 #approximately, if 4 answers for every question
chunk_size = 100000
tsv_path = "../res/dataset.tsv"
subset = "../res/datasubset.tsv"
header_names = ["question","answer","label"]
file_read = 0

# One generator shared by all chunks: reproducible, yet each chunk gets a
# different draw.
sample_rng = np.random.RandomState(sample_seed)

for chunk in pd.read_csv(tsv_path,sep='\t',chunksize=chunk_size, header=None, names=header_names, encoding='utf8'):
    file_read += chunk.shape[0]
    new_chunk = chunk.sample(frac=fraction, random_state=sample_rng).reset_index(drop=True)

    # Passing the path lets pandas open the file with the requested encoding.
    new_chunk.to_csv(subset, mode='a', sep='\t', encoding='utf-8', index=False, header = False)

    # ETA = remaining chunks * duration of the chunk that just finished.
    time_taken = time.perf_counter() - curr_time
    print(file_read," Done\t Time taken : ",(time_taken), "\t ETA : ", (((total_rows-file_read)/chunk_size)*time_taken))
    curr_time = time.perf_counter()


print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
def get_imbalance_report(df):
    """Print the label balance of df.

    Prints the distinct values found in the 'label' column, then the share of
    rows labelled 1 ("true") and the share labelled 0 ("false"). Both shares
    are relative to the number of rows labelled 0 or 1. Returns nothing.
    """
    labels = df['label']
    positives = labels.eq(1).sum()
    negatives = labels.eq(0).sum()

    print(labels.unique())

    # Rows with any other label are left out of the denominator.
    denominator = positives + negatives
    for tag, count in (("true : ", positives), ("false : ", negatives)):
        print(tag, (count / denominator))

In [None]:
# Split every chunk of the subset into train / test / validation parts and
# append each part to its own TSV file (existing files grow on every run).
start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
curr_time = time.perf_counter()

test_frac = 0.2 # out of total
valid_frac = 0.25 # out of the rows left after the test split, i.e. 0.25 * 0.8 = 0.2 of the total
split_seed = 42 # fixed seed so the split is reproducible

total_rows = 2096752 #approximately, if 4 answers for every question
chunk_size = 100000
subset = "../res/datasubset.tsv"

train_set = "../res/train_set.tsv"
test_set = "../res/test_set.tsv"
valid_set = "../res/valid_set.tsv"


header_names = ["question","answer","label"]
file_read = 0

# One generator shared by all splits: reproducible without repeating the
# same shuffle for every chunk.
split_rng = np.random.RandomState(split_seed)

for chunk in pd.read_csv(subset, sep='\t', chunksize=chunk_size, header=None, names=header_names, encoding='utf8'):
    file_read += chunk.shape[0]
    get_imbalance_report(chunk)
    train_chunk, test_chunk = train_test_split(chunk, test_size=test_frac, random_state=split_rng)

    # The validation part is carved out of what remains after the test split.
    train_chunk, valid_chunk = train_test_split(train_chunk, test_size=valid_frac, random_state=split_rng)

    # Passing the path lets pandas open each file with the requested encoding.
    train_chunk.to_csv(train_set, mode='a', sep='\t', encoding='utf-8', index=False, header = False)
    test_chunk.to_csv(test_set, mode='a', sep='\t', encoding='utf-8', index=False, header = False)
    valid_chunk.to_csv(valid_set, mode='a', sep='\t', encoding='utf-8', index=False, header = False)

    # ETA = remaining chunks * duration of the chunk that just finished.
    time_taken = time.perf_counter() - curr_time
    print(file_read," Done\t Time taken : ",(time_taken), "\t ETA : ", (((total_rows-file_read)/chunk_size)*time_taken))
    curr_time = time.perf_counter()


print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Read each of the three split files in full and print its label balance.
# No timing is done here: the previous time.clock() calls were unused and
# time.clock() no longer exists on Python 3.8+.
train_set = "../res/train_set.tsv"
test_set = "../res/test_set.tsv"
valid_set = "../res/valid_set.tsv"

header_names = ["question","answer","label"]

split_frames = []
for title, split_path in (("Train Chunk", train_set), ("Test Chunk", test_set), ("Valid Chunk", valid_set)):
    print("\n\n\n" + title + "\n")
    split_frame = pd.read_csv(split_path, sep='\t', header=None, names=header_names, encoding='utf8')
    get_imbalance_report(split_frame)
    split_frames.append(split_frame)

# Keep the frames available under their previous names.
train_chunk, test_chunk, valid_chunk = split_frames