In [1]:
import lightgbm
from scipy import ndimage
import pandas as pd

import boto3
import boto
import s3fs
import multiprocessing as mp
import os
import pyarrow.parquet as pq
import random
from sklearn.utils import shuffle
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
import gzip
def get_data(file_loc, nsamples=100000):
    """Parse a gzipped, whitespace-separated ranking file into a 2-D array.

    Each non-empty line is expected to look like::

        <label> <key>:<query_id> <key>:<value> <key>:<value> ...

    The keys are ignored; only the text after the first ``:`` of each token is
    used, so feature values are stored in the order they appear on the line.

    Parameters
    ----------
    file_loc : str
        Path to the gzip-compressed text file.
    nsamples : int or None, default 100000
        Maximum number of rows to read. ``None`` (or a negative number) reads
        the whole file; ``0`` returns an empty array.

    Returns
    -------
    numpy.ndarray
        One row per parsed line: ``[label, query_id, feature_0, feature_1, ...]``.
        Label and query id are parsed as ints, features as floats (NumPy
        promotes the combined row to float). Rows are assumed to contain the
        same number of features -- TODO confirm for other input files.
    """
    data = []
    # The context manager closes the file; no explicit close() is needed.
    with gzip.open(file_loc, 'rt') as f:
        for line in f:
            if nsamples is not None and len(data) == nsamples:
                break
            # split() with no argument tolerates repeated/trailing whitespace
            # and tabs, and yields [] for blank lines.
            tokens = line.split()
            if not tokens:
                continue
            row = [int(tokens[0]), int(tokens[1].split(':')[1])]
            # Extract each feature value from its "<key>:<value>" token.
            row.extend(float(tok.split(':')[1]) for tok in tokens[2:])
            data.append(row)
    return np.array(data)

In [3]:
def add_new_columns(df, ws, sc, tp, rng=None):
    """Add synthetic Gaussian columns and a derived integer ``label2`` to ``df``.

    One vector of normal draws (mean 0, scale ``sc``) is generated per weight
    in ``ws``. Every draw except the last is stored as a column ``d_1`` ...
    ``d_{len(ws)-1}``; the last draw only contributes to the label and is not
    stored. ``label2`` is the absolute value of the weighted sum of all draws,
    truncated to an integer and capped at ``tp``.

    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    ws : sequence of float
        One weight per draw.
    sc : float
        Scale (standard deviation) of the normal draws.
    tp : int
        Upper cap applied to ``label2``.
    rng : optional
        Object exposing ``normal(loc=, scale=, size=)`` (e.g. a
        ``numpy.random.Generator``). Defaults to the global ``numpy.random``
        state, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``label2`` and the ``d_*`` columns added.
    """
    nrows = df.shape[0]
    sampler = np.random if rng is None else rng

    # Create label2 first so it keeps its position ahead of the d_* columns.
    df['label2'] = 0.0

    combined = np.zeros(nrows)
    for i, weight in enumerate(ws, start=1):
        v = sampler.normal(loc=0, scale=sc, size=nrows)
        if i < len(ws):
            # The final draw is deliberately not stored as a column.
            df["d_{}".format(i)] = v
        combined = combined + v * weight

    # astype(int) truncates toward zero; np.minimum applies the cap in one
    # vectorised step instead of a per-row Python lambda.
    df['label2'] = np.minimum(np.abs(combined).astype(int), tp)
    return df

In [4]:
# Parse the three gzipped splits into arrays: up to 1,000,000 rows for train
# and up to 100,000 rows each for test and validation.
training_a, test_a, valid_a = [
    get_data(split_path, nsamples=row_limit)
    for split_path, row_limit in (
        ("./data/train.txt.gz", 1000000),
        ("./data/test.txt.gz", 100000),
        ("./data/vali.txt.gz", 100000),
    )
]

In [10]:
# Generate the synthetic datasets at 50%, 30% and 10% noise.
#
# Each entry maps a noise percentage to the weights handed to add_new_columns:
# the leading weights apply to the d_* draws that are kept as columns, the
# last weight applies to the draw that is not stored (the noise share).
l = {
    50: np.array([0.2, 0.3, 0.5]),
    30: np.array([0.3, 0.4, 0.3]),
    10: np.array([0.4, 0.5, 0.1]),
}
sc = 4  # scale of the normal draws passed to add_new_columns
tp = 3  # upper cap for label2 passed to add_new_columns

# Seed the global NumPy RNG so re-running this cell regenerates the same files.
SEED = 42
np.random.seed(SEED)

# Column names of the raw arrays; identical for every noise level, so built once.
_cols = ['label', 'query_id'] + ["c_{}".format(x) for x in range(training_a.shape[1]-2)]

for pn, ws in l.items():
    # Build the three frames the same way, in train -> test -> valid order.
    frames = []
    for raw in (training_a, test_a, valid_a):
        frame = add_new_columns(pd.DataFrame(raw, columns=_cols), ws, sc, tp)
        # The raw array is all-float; restore integer label and query id.
        for col in ('label', 'query_id'):
            frame[col] = frame[col].astype(int)
        frames.append(frame)
    # Keep the original variable names available to later cells.
    training_data, test_data, valid_data = frames

    # Output column order: labels, query id, synthetic d_* columns, original features.
    _cols2 = ['label', 'label2', 'query_id'] + \
        ["d_{}".format(x) for x in range(1, len(ws))] + \
        ["c_{}".format(x) for x in range(training_a.shape[1]-2)]

    for prefix, frame in zip(('train', 'test', 'vali'), frames):
        frame[_cols2].to_csv('./data/{}{}.txt'.format(prefix, pn), index=False)

In [76]:
# Preview the first 100 rows of training_data to eyeball the generated columns.
# NOTE(review): this shows whatever the generation loop last left in
# training_data (its final iteration); the execution counts are out of order,
# so confirm with a fresh Restart & Run All.
training_data.head(100)

Unnamed: 0,label,query_id,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,...,c_129,c_130,c_131,c_132,c_133,c_134,c_135,label2,d_1,d_2
0,2,1,3.0,3.0,0.0,0.0,3.0,1.000000,1.000000,0.000000,...,116.0,64034.0,13.0,3.0,0.0,0.0,0.0,2,-0.062433,3.830544
1,2,1,3.0,0.0,3.0,0.0,3.0,1.000000,0.000000,1.000000,...,124.0,64034.0,1.0,2.0,0.0,0.0,0.0,1,-2.949107,0.016994
2,0,1,3.0,0.0,2.0,0.0,3.0,1.000000,0.000000,0.666667,...,124.0,3344.0,14.0,67.0,0.0,0.0,0.0,3,-2.607316,-3.266606
3,2,1,3.0,0.0,3.0,0.0,3.0,1.000000,0.000000,1.000000,...,123.0,63933.0,1.0,3.0,0.0,0.0,0.0,1,4.868679,-0.077604
4,1,1,3.0,0.0,3.0,0.0,3.0,1.000000,0.000000,1.000000,...,256.0,49697.0,1.0,13.0,0.0,0.0,0.0,3,6.350008,1.942858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,16,5.0,0.0,0.0,2.0,6.0,0.714286,0.000000,0.000000,...,5338.0,39632.0,2.0,5.0,0.0,2.0,57.0,0,-1.319587,1.160848
96,0,16,7.0,0.0,2.0,0.0,7.0,1.000000,0.000000,0.285714,...,5885.0,51991.0,1.0,1.0,0.0,0.0,0.0,1,1.021165,-3.119088
97,0,16,6.0,2.0,2.0,0.0,6.0,0.857143,0.285714,0.285714,...,50712.0,51991.0,1.0,1.0,0.0,0.0,0.0,1,7.994489,-2.816388
98,0,16,3.0,0.0,3.0,3.0,3.0,0.428571,0.000000,0.428571,...,1284.0,65535.0,7.0,1.0,0.0,0.0,0.0,3,-1.705455,9.865841
