In [17]:
import os
import pandas as pd
import random
from collections import defaultdict
from sklearn.utils import shuffle

In [18]:
def select_n_words(str, n):
    """Return the first ``n`` space-separated words of ``str`` as one string.

    The text is split on single space characters only, so consecutive spaces
    produce empty "words" that count towards ``n``, and newlines or tabs stay
    attached to their neighbouring word. The kept pieces are re-joined with a
    single space.

    Args:
        str: Text to truncate. (The name shadows the built-in ``str`` inside
            this function; it is kept so keyword callers continue to work.)
        n: Number of leading words to keep; it is used directly as a slice
            bound.

    Returns:
        The truncated text.
    """
    words = str.split(' ')
    kept = words[:n]
    return " ".join(kept)

def train_test_split(data):
    """Randomly split ``data`` into a 70% train list and a 30% test list.

    The split is made on positions rather than on values, so the two lists
    always partition the input: duplicated items are kept, items do not need
    to be hashable, and the test list keeps the input order.

    Args:
        data: Iterable of items to split.

    Returns:
        A ``(train, test)`` tuple of lists. ``train`` holds
        ``int(len(data) * 0.7)`` randomly chosen items and ``test`` holds the
        remaining ones.
    """
    items = list(data)
    train_size = int(len(items) * 0.7)

    # Sample positions instead of values: a set difference on the values would
    # collapse duplicates and discard every copy of a value as soon as one
    # copy lands in the train half.
    train_positions = random.sample(range(len(items)), train_size)
    chosen = set(train_positions)

    train = [items[pos] for pos in train_positions]
    test = [item for pos, item in enumerate(items) if pos not in chosen]
    return train, test

In [19]:
def get_data(q, s, res, len1, len2):
    """Build one labelled (query, statute) record.

    Args:
        q: Query text.
        s: Statute text.
        res: Relevance label; converted with ``int``.
        len1: Number of leading words of ``q`` to keep.
        len2: Number of leading words of ``s`` to keep.

    Returns:
        A dict with keys ``'Query'`` and ``'Statute'`` (the texts truncated by
        ``select_n_words``) and ``'Result'`` (the integer label).
    """
    return {
        'Query': select_n_words(q, len1),
        'Statute': select_n_words(s, len2),
        'Result': int(res),
    }

In [20]:
def get_data_frame(seed=None):
    """Build shuffled train/test DataFrames of labelled (query, statute) pairs.

    Reads three inputs from ``<cwd>/statute_detection``:

    * ``Query_doc.txt`` - one query per line as ``<query_id>||<query text>``.
    * ``Object_statutes/`` - one file per statute; the statute id is the file
      name up to its first ``.`` and the text is the file's lines joined with
      a space.
    * ``relevance_judgments_statutes.txt`` - one judgment per line with four
      space-separated fields ``<query_id> <unused> <statute_id> <label>``.
      A label of ``1`` marks a positive pair, any other integer a negative.

    Statutes with fewer than 3 positive queries are dropped. For every
    remaining statute the positive and the negative query ids are each split
    with ``train_test_split``, and every positive row is repeated
    ``len(neg) // len(pos) + 1`` times, in the train frame and in the test
    frame alike, to roughly balance the two labels.

    Args:
        seed: Optional. When not ``None`` it seeds Python's ``random`` module
            (used by ``train_test_split``) and is passed as ``random_state``
            to the final shuffles. ``None`` keeps the unseeded behaviour.

    Returns:
        ``(train_df, test_df)``: two shuffled DataFrames with the columns
        ``Query``, ``Statute`` and ``Result``.

    Raises:
        KeyError: if a judged query id does not occur in ``Query_doc.txt``.
        ValueError: if a non-blank input line does not have the expected
            fields.
    """
    columns = ['Query', 'Statute', 'Result']
    if seed is not None:
        random.seed(seed)

    main_loc = os.path.join(os.getcwd(), "statute_detection")

    # --- queries -----------------------------------------------------------
    query_list = {}
    len_q = {}
    with open(os.path.join(main_loc, "Query_doc.txt"), "r") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            # Split on the first separator only so that a '||' inside the
            # query text does not break the unpacking.
            query_id, query = line.split('||', 1)
            query_list[query_id] = query
            len_q[query_id] = len(query.split(' '))

    # --- statutes ----------------------------------------------------------
    statutes_loc = os.path.join(main_loc, "Object_statutes")
    statutes_list = {}
    len_s = {}
    # Sorted so the statute order does not depend on the filesystem.
    for file in sorted(os.listdir(statutes_loc)):
        # Text before the first '.'; unlike slicing with str.find this keeps
        # the whole name when it contains no dot at all.
        statute_name = file.split('.', 1)[0]
        with open(os.path.join(statutes_loc, file), "r", encoding='utf-8') as f:
            statute = " ".join(f.readlines())
        statutes_list[statute_name] = statute
        len_s[statute_name] = len(statute.split(' '))

    # --- relevance judgments -------------------------------------------------
    pos_statute_dict = {s_id: [] for s_id in statutes_list}
    neg_statute_dict = {s_id: [] for s_id in statutes_list}
    final_doc_loc = os.path.join(main_loc, "relevance_judgments_statutes.txt")
    with open(final_doc_loc, "r") as f:
        for line in f:
            if not line.strip():
                continue
            query_id, _, statute_id, res = line.split(' ')
            if statute_id not in statutes_list:
                continue
            if int(res) == 1:
                pos_statute_dict[statute_id].append(query_id)
            else:
                neg_statute_dict[statute_id].append(query_id)

    # Keep only statutes with at least 3 positive queries. This also
    # guarantees a non-zero divisor for the up-sampling factor below.
    statutes_list = {
        s_id: text
        for s_id, text in statutes_list.items()
        if len(pos_statute_dict[s_id]) >= 3
    }

    # --- assemble rows -------------------------------------------------------
    # Rows are gathered in plain lists and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    train_rows = []
    test_rows = []
    for s_id, statute in statutes_list.items():
        pos = pos_statute_dict[s_id]
        neg = neg_statute_dict[s_id]
        time_of_append = len(neg) // len(pos) + 1

        pos_train, pos_test = train_test_split(pos)
        neg_train, neg_test = train_test_split(neg)

        # len_q / len_s hold the full word counts, so the truncation done by
        # get_data currently keeps each text whole.
        shared = (query_list, len_q, statute, len_s[s_id])
        train_rows += _pair_rows(pos_train, *shared, 1, time_of_append)
        train_rows += _pair_rows(neg_train, *shared, 0, 1)
        test_rows += _pair_rows(pos_test, *shared, 1, time_of_append)
        test_rows += _pair_rows(neg_test, *shared, 0, 1)

    train_df = shuffle(pd.DataFrame(train_rows, columns=columns), random_state=seed)
    test_df = shuffle(pd.DataFrame(test_rows, columns=columns), random_state=seed)
    return train_df, test_df


def _pair_rows(query_ids, query_list, len_q, statute, statute_len, label, copies):
    """Return one ``get_data`` record per query id, each repeated ``copies`` times."""
    rows = []
    for q_id in query_ids:
        row = get_data(query_list[q_id], statute, label, len_q[q_id], statute_len)
        rows.extend([row] * copies)
    return rows

In [21]:

# Build the shuffled frames from the files on disk: df1 is the train set, df2 the test set.
df1, df2 =get_data_frame()


In [22]:
# Preview the first rows of the train frame (the index shows the pre-shuffle row position).
df1.head()

Unnamed: 0,Query,Statute,Result
766,"The detenu P1, a French national, at the relev...",Title: Power to make orders detaining certain ...,1
1380,This appeal is preferred against the judgment ...,Title: Punishment for voluntarily causing hurt...,1
927,The appellants were tried for offences on the ...,Title: Punishment for wrongful restraint\n Des...,0
1189,These appeals involving common questions of la...,Title: Enforcement of decrees and orders of Su...,1
1260,The appellant P1 is convicted by the Additiona...,Title: Voluntarily causing hurt by dangerous w...,1


In [23]:
# Preview the first rows of the test frame (the index shows the pre-shuffle row position).
df2.head()

Unnamed: 0,Query,Statute,Result
138,These writ petitions are filed as Public Inter...,Title: Special leave to appeal by the Supreme ...,0
882,The appellant before us was examined as prime ...,Title: Cheating and dishonestly inducing deliv...,0
545,These appeals are directed against the judgmen...,Title: Murder\n Desc: Except in the cases here...,1
285,Having been selected by the Public Service Com...,Title: Power to examine the accused\n Desc: (1...,0
470,This appeal is preferred against the judgment ...,Title: Abetment of suicide\n Desc: If any pers...,0


In [24]:
# Persist both splits to the working directory; index=False leaves the shuffled
# row labels out of the files.
df1.to_csv('train1.csv', index=False)
df2.to_csv('test1.csv', index=False)