In [2]:
import numpy as np
import pandas as pd
import random
import re
import itertools

In [12]:
# Additional helper functions required for the code to run:
# This function defines the column headers for the data frames
def getColumnHeaders():
    return pd.Series(data=['Index','label','integer_1','integer_2','integer_3',
                                 'integer_4','integer_5','integer_6','integer_7','integer_8','integer_9',
                                 'integer_10','integer_11','integer_12','integer_13','categorical_1',
                                 'categorical_2','categorical_3','categorical_4','categorical_5','categorical_6',
                                 'categorical_7','categorical_8','categorical_9','categorical_10','categorical_11',
                                 'categorical_12','categorical_13','categorical_14','categorical_15','categorical_16',
                                 'categorical_17','categorical_18','categorical_19','categorical_20','categorical_21',
                                 'categorical_22','categorical_23','categorical_24','categorical_25','categorical_26'])

# this function generateNIndecesFrom takes a range of Indeces and randomly takes n of them, returns a pandas Series object
def generateNIndecesFrom(n, rangeOfIndeces):
    # print("Generating " + str(n) + " indeces from range")
    allIndeces = random.sample(rangeOfIndeces, n)
    allIndeces = pd.Series(data = allIndeces)
    allIndeces = allIndeces.sort_values().reset_index().drop(['index'],axis=1)
    allIndeces.columns = ['Index'];
    return allIndeces

# the generateSubSet function takes in a file, a dataFrame to put the data in as well as index values which should be extracted,
# a number of rows per itteration (this is used to not overload the memory) , total number of rows in the file, the column headers for the new dataframe
def generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers):
    totalNumIterations = int(totalNumRows/numRowsPerItteration)
    # print("Number of itterations = " + str(totalNumIterations))
    totalNumRowsTraversed = 0
    prevsize = 0
    for i in range(totalNumIterations + 1):
#         
#         print("Itteration number: " + str(i))
#         print("skipRows: " + str(i * numRowsPerItteration))
#         print("Read in : " + str(numRowsPerItteration))
        curData = pd.read_table(file,skiprows = i * numRowsPerItteration, nrows = numRowsPerItteration,header=None)
        curData.index = [i for i in range(i*numRowsPerItteration,i*numRowsPerItteration + curData.shape[0])]
        totalNumRowsTraversed = totalNumRowsTraversed + curData.shape[0]

#         print(curData.shape)
#         print(curData.index.shape)

        curData['Index'] = curData.index
        curData.columns = column_headers
        
        curIndexRange = indexValues['Index'][(indexValues['Index'] < (i*numRowsPerItteration + numRowsPerItteration)) & (indexValues['Index'] > (i*numRowsPerItteration-1))]
        curData = curData[curData['Index'].isin(curIndexRange)]

        dataFrame = pd.concat([dataFrame,curData])
        
        # clear_output()
        # print("Extraction Stats: " + str(dataFrame.shape[0]) + " percent: " + str(dataFrame.shape[0] / indexValues.shape[0] * 100) + "%")
        # print("Document Stats: " + str(totalNumRowsTraversed) + " percent: " + str(totalNumRowsTraversed/totalNumRows*100) + "%")
        if (dataFrame.shape[0] - prevsize) > 500000:
            prevsize = dataFrame.shape[0]
#             dataFrame.to_csv(frameSaveName)
#         elif dataFrame.shape[0] == indexValues.shape[0]:
#             print("Finished with the data collection")
# #             dataFrame.to_csv(frameSaveName)
#             break
    # print("Extraction is Done, now saving frame")        
    return dataFrame

# This method generates is a wrapper around the generateSubset to generate the subset and save the dataframe to a csv file (for being able to make use of it after)
def generateAndSaveSubset(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers,frameSaveName):
    dataFrame = generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers)
    dataFrame.to_csv(frameSaveName)
    return dataFrame

def read_data(data_path, train_path, validation_path, test_path):

    print(data_path)
    print(train_path)
    print(validation_path)
    print(test_path)
    
    #get the ids
    try:
        trainIndeces = pd.read_csv(train_path,squeeze = True)
        validationIndeces = pd.read_csv(validation_path,squeeze = True)
        testingIndeces = pd.read_csv(test_path,squeeze = True)
    except:
        print("There were not 1000000 data points")
        trainIndeces = generateNIndecesFrom(1000000,list(twoMIndeces['Index']))
        trainIndeces.to_csv('train_ids.txt',index=False,header=False)

        remainingIndeces = twoMIndeces['Index'][~twoMIndeces['Index'].isin(trainIndeces.values)]
        validationIndeces = generateNIndecesFrom(250000,list(remainingIndeces))
        validationIndeces.to_csv('validation_ids.txt',index=False,header=False)

        testingIndeces = twoMIndeces['Index'][~(twoMIndeces['Index'].isin(trainIndeces.values) | twoMIndeces['Index'].isin(validationIndeces.values))]
        testingIndeces = generateNIndecesFrom(750000,list(testingIndeces))
        testingIndeces.to_csv('test_ids.txt',index=False,header=False)
    
    print(trainIndeces.head())
    print(validationIndeces.head())
    print(testingIndeces.head())
    
    # Generate the actual data files
    column_headers = getColumnHeaders()
    # print("No 1M collection")
    train1M = pd.DataFrame()
    train1M = generateSubSet(data_path,train1M,trainIndeces,4000000,46000000,column_headers)


    # print("No 250k collection")
    validation250k = pd.DataFrame()
    validation250k = generateAndSaveSubset(data_path,validation250k,validationIndeces,4000000,46000000,column_headers)


    # print("No 750k collection")
    test750k = pd.DataFrame()
    test750k = generateAndSaveSubset(data_path,test750k,validationIndeces,4000000,46000000,column_headers)

    # train_data, train_target = np.zeros((1000000, 39)), np.zeros((1000000,))
    # validation_data, validation_target = np.zeros((250000, 39)), np.zeros((250000,))
    # test_data, test_target = np.zeros((750000, 39)), np.zeros((750000,))


    # return train1M.to_array(), trainIndeces.to_array(), validation250k.to_array(), validation_target, test_data, test_target
    return

def preprocess_int_data(data, features):
    n = len([f for f in features if f < 13])
    return np.zeros((data.shape[0], n))


def preprocess_cat_data(data, features, preprocess):
    return None


In [14]:
def check_output_features(fs):
    assert len(fs) < 39


def check_output_read_data(data, target, n):
    assert data.shape[0] == n
    assert target.shape[0] == n


def check_output_preprocess(preprocess):
    assert (preprocess == 'onehot') or (preprocess == 'rate') or (preprocess == 'tfidf')


def check_output_preprocess_int_data(data, fs):
    n = len([f for f in fs if f < 13])
    assert data.shape[1] == n


def check_output_preprocess_cat_data(data, fs, preprocess):
    pass


def read_features(path):
    features = []
    with open(path) as f:
        for line in f:
            features.append(int(line.strip()))
    return features


def read_preprocess(path):
    with open(path) as f:
        for line in f:
            return line.strip()


if __name__ == '__main__':
    data = './dac/train.txt'
    train = 'train_ids.txt'
    validation = 'validation_ids.txt'
    test = 'test_ids.txt'
    features = 'features.txt'
    preprocess = 'preprocess.txt'

    train_data, train_target, validation_data, validation_target, test_data, test_target = \
        read_data(data, train, validation, test)

    check_output_read_data(train_data, train_target, 1000000)
    check_output_read_data(validation_data, validation_target, 250000)
    check_output_read_data(test_data, test_target, 750000)

    features = read_features(options.features)

    check_output_features(features)

    preprocess = read_preprocess(options.preprocess)

    check_output_preprocess(preprocess)

    train_int_data = preprocess_int_data(train_data, features)
    validation_int_data = preprocess_int_data(validation_data, features)
    test_int_data = preprocess_int_data(test_data, features)

    check_output_preprocess_int_data(train_int_data, features)
    check_output_preprocess_int_data(validation_int_data, features)
    check_output_preprocess_int_data(test_int_data, features)

    train_cat_data = preprocess_cat_data(train_data, features, preprocess)
    validation_cat_data = preprocess_cat_data(validation_data, features, preprocess)
    test_cat_data = preprocess_cat_data(test_data, features, preprocess)

    check_output_preprocess_cat_data(train_cat_data, features, preprocess)
    check_output_preprocess_cat_data(validation_cat_data, features, preprocess)
    check_output_preprocess_cat_data(test_cat_data, features, preprocess)


./dac/train.txt
train_ids.txt
validation_ids.txt
test_ids.txt
0    135
1    177
2    250
3    329
4    425
Name: 19, dtype: int64
0     716
1     773
2     986
3     997
4    1073
Name: 601, dtype: int64
0     86
1    108
2    116
3    195
4    215
Name: 38, dtype: int64


KeyboardInterrupt: 