In [104]:
import lightgbm
from scipy import ndimage
import pandas as pd

import boto3
import boto
import s3fs
import multiprocessing as mp
import os
import pyarrow.parquet as pq
import random
from sklearn.utils import shuffle
import numpy as np

from sklearn.model_selection import train_test_split

In [105]:
def get_data(file_loc, nsamples=100000):
    """Parse a whitespace-separated ``score key:value key:value ...`` text file.

    Each non-blank line is turned into one numeric row:

    * token 0 is parsed as an integer score,
    * token 1 is a ``key:value`` pair whose value is parsed as an integer
      (stored under the name ``q_id`` below),
    * every remaining token is a ``key:value`` pair whose value is parsed
      as a float.

    Parameters
    ----------
    file_loc : str
        Path of the text file to read.
    nsamples : int or None, default 100000
        Maximum number of rows to parse. ``0`` yields no rows; ``None`` or a
        negative number reads the whole file.

    Returns
    -------
    numpy.ndarray
        One row per parsed line, laid out as ``[score, q_id, feature...]``.
        When no rows are parsed the result is an empty 1-D array.

    Raises
    ------
    ValueError, IndexError
        If a non-blank line does not follow the layout described above.
    """
    rows = []
    # The context manager closes the file even when a malformed line raises.
    with open(file_loc, 'r') as handle:
        for line in handle:
            # Stop once the requested number of rows has been collected.
            # None / negative limits never match, so the whole file is read.
            if len(rows) == nsamples:
                break
            # split() without an argument tolerates repeated or trailing
            # whitespace, which split(' ') would turn into empty tokens.
            tokens = line.split()
            if not tokens:
                continue  # blank line: nothing to parse
            score = int(tokens[0])
            q_id = int(tokens[1].split(':')[1])
            # Extract each feature value from the remaining key:value tokens.
            features = [float(tok.split(':')[1]) for tok in tokens[2:]]
            rows.append([score, q_id] + features)
    return np.array(rows)

In [254]:

def add_new_columns(df, random_state=None):
    """Append two random feature columns and a derived integer label to ``df``.

    The frame is modified in place and also returned. Three columns are
    written:

    * ``d_1`` and ``d_2``: independent draws from a normal distribution with
      mean 0 and standard deviation 4;
    * ``label2``: ``|0.3*d_1 + 0.2*d_2 + 0.5*noise|`` truncated to an integer
      and capped at 3, where ``noise`` is a third draw from the same
      distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; mutated in place.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        draws come from numpy's global random state, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    # np.random (module) and RandomState expose the same .normal() signature,
    # so the unseeded path keeps consuming the global stream.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    nrows = df.shape[0]
    for n in range(1, 3):
        df["d_{}".format(n)] = rng.normal(loc=0, scale=4, size=nrows)
    noise = rng.normal(loc=0, scale=4, size=nrows)
    magnitude = np.abs(df['d_1'] * 0.3 + df['d_2'] * 0.2 + noise * 0.5)
    # Truncate toward zero, then cap at 3 (vectorised instead of a per-row lambda).
    df['label2'] = magnitude.astype(int).clip(upper=3)
    return df
    

In [189]:
# Parse the train / test / validation text files into numeric arrays,
# each with its own row limit.
training_a, test_a, valid_a = (
    get_data(path, nsamples=limit)
    for path, limit in (
        ("./data/train.txt", 1000000),
        ("./data/test.txt", 100000),
        ("./data/vali.txt", 100000),
    )
)

In [246]:
# Column names for the parsed arrays: the two leading columns, then one
# "c_<i>" name per remaining column of training_a.
_cols = ['label', 'query_id']
_cols.extend(map("c_{}".format, range(training_a.shape[1] - 2)))

In [247]:
# Wrap each parsed array in a DataFrame that shares the same column names.
training_data, test_data, valid_data = (
    pd.DataFrame(values, columns=_cols)
    for values in (training_a, test_a, valid_a)
)

In [255]:
# Add the random d_1 / d_2 columns and the derived label2 column to every
# split, in train -> test -> validation order.
training_data, test_data, valid_data = (
    add_new_columns(split)
    for split in (training_data, test_data, valid_data)
)

In [256]:
# Class balance of the derived label in the training split.
training_data['label2'].value_counts()

0    227476
1    193717
3    162381
2    139838
Name: label2, dtype: int64

In [257]:
# Cast the label and query-id columns of every split to integers.
for _frame in (training_data, test_data, valid_data):
    for _name in ('label', 'query_id'):
        _frame[_name] = _frame[_name].astype(int)

In [258]:
# Export column order: both labels, the query id, the two random columns,
# then one "c_<i>" name per original feature column of training_a.
_cols = ['label', 'label2', 'query_id', 'd_1', 'd_2']
_cols.extend(map("c_{}".format, range(training_a.shape[1] - 2)))

In [259]:
# Write each split as CSV (no index column) using the _cols ordering.
for _frame, _target in (
    (training_data, './data/train2.txt'),
    (test_data, './data/test2.txt'),
    (valid_data, './data/vali2.txt'),
):
    _frame[_cols].to_csv(_target, index=False)

In [253]:
# Preview the first five rows of the training split.
training_data.head(n=5)

Unnamed: 0,label,query_id,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,...,c_129,c_130,c_131,c_132,c_133,c_134,c_135,d_1,d_2,label2
0,2,1,3.0,3.0,0.0,0.0,3.0,1.0,1.0,0.0,...,116.0,64034.0,13.0,3.0,0.0,0.0,0.0,-5.957317,-0.982826,3
1,2,1,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,124.0,64034.0,1.0,2.0,0.0,0.0,0.0,2.00118,4.419589,2
2,0,1,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,124.0,3344.0,14.0,67.0,0.0,0.0,0.0,-0.035982,-0.782006,0
3,2,1,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,123.0,63933.0,1.0,3.0,0.0,0.0,0.0,2.567808,1.970488,0
4,1,1,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,256.0,49697.0,1.0,13.0,0.0,0.0,0.0,-0.180856,-4.272283,0
