In [1]:
import numpy as np
import pandas as pd
import random
import re
import itertools
from IPython.display import clear_output

In [14]:
# Additional helper functions required for the code to run:
# This function defines the column headers for the data frames
def getColumnHeaders():
    return pd.Series(data=['label','integer_1','integer_2','integer_3',
                                 'integer_4','integer_5','integer_6','integer_7','integer_8','integer_9',
                                 'integer_10','integer_11','integer_12','integer_13','categorical_1',
                                 'categorical_2','categorical_3','categorical_4','categorical_5','categorical_6',
                                 'categorical_7','categorical_8','categorical_9','categorical_10','categorical_11',
                                 'categorical_12','categorical_13','categorical_14','categorical_15','categorical_16',
                                 'categorical_17','categorical_18','categorical_19','categorical_20','categorical_21',
                                 'categorical_22','categorical_23','categorical_24','categorical_25','categorical_26','Index'])

# this function generateNIndecesFrom takes a range of Indeces and randomly takes n of them, returns a pandas Series object
def generateNIndecesFrom(n, rangeOfIndeces):
    # print("Generating " + str(n) + " indeces from range")
    allIndeces = random.sample(rangeOfIndeces, n)
    allIndeces = pd.Series(data = allIndeces)
    allIndeces = allIndeces.sort_values().reset_index().drop(['index'],axis=1)
    allIndeces.columns = ['Index'];
    return allIndeces

# This function takes in a dataframe and computes statistics and histograms for all columns that are not 'label', 'Index' or 'Unnamed: 0'
# It was used in part 2.2 to generate the histograms and statistics.
# It can for example be placed at the end of the read_data method and be passed one of the datasets to compute its statistics and histograms
def generateSummaryStatsAndHists(train1M):
    SummaryStats = pd.DataFrame()
    for col in train1M.columns:
        if (col != 'label' and col != 'Index' and col != 'Unnamed: 0'):

            train1M[col].value_counts().plot(kind='hist',title=col, bins=100)
            plt.savefig(col)
            plt.show()
            plt.gcf().clear()
            if (train1M[col].dtype != 'O'):
                SummaryStats[col] = train1M[col].describe()   
    # SummaryStats.head()
    SummaryStats.to_csv('integerStats.csv')
    return SummaryStats


# the generateSubSet function takes in a file, a dataFrame to put the data in as well as index values which should be extracted,
# a number of rows per itteration (this is used to not overload the memory) , total number of rows in the file, the column headers for the new dataframe
def generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers):
    totalNumIterations = int(totalNumRows/numRowsPerItteration)
    # print("Number of itterations = " + str(totalNumIterations))
    totalNumRowsTraversed = 0
    prevsize = 0
    for i in range(totalNumIterations + 1):

        curData = pd.read_table(file,skiprows = i * numRowsPerItteration, nrows = numRowsPerItteration,header=None)
        curData.index = [i for i in range(i*numRowsPerItteration,i*numRowsPerItteration + curData.shape[0])]
        totalNumRowsTraversed = totalNumRowsTraversed + curData.shape[0]
        

        curData['Index'] = curData.index
        curData.columns = column_headers
        
        curIndexRange = indexValues['Index'][(indexValues['Index'] < (i*numRowsPerItteration + numRowsPerItteration)) & (indexValues['Index'] > (i*numRowsPerItteration-1))]
        curData = curData[curData['Index'].isin(list(curIndexRange))]
        
        dataFrame = pd.concat([dataFrame,curData])
        
        clear_output()
        print("Extraction Stats: " + str(dataFrame.shape[0]) + " percent: " + str(dataFrame.shape[0] / indexValues.shape[0] * 100) + "%")
        print("Document Stats: " + str(totalNumRowsTraversed) + " percent: " + str(totalNumRowsTraversed/totalNumRows*100) + "%")
        if (dataFrame.shape[0] - prevsize) > 500000:
            prevsize = dataFrame.shape[0]
      
    return dataFrame

# This method generates is a wrapper around the generateSubset to generate the subset and save the dataframe to a csv file (for being able to make use of it after)
def generateAndSaveSubset(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers,frameSaveName):
    dataFrame = generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers)
    dataFrame.to_csv(frameSaveName)
    return dataFrame

def read_data(data_path, train_path, validation_path, test_path):

    print(data_path)
    print(train_path)
    print(validation_path)
    print(test_path)
    
    #get the ids
    try:
        trainIndeces = pd.read_csv(train_path, header = None)
        validationIndeces = pd.read_csv(validation_path, header = None)
        testingIndeces = pd.read_csv(test_path, header = None)
    except:
        print("There were not 1000000 data points")
        trainIndeces = generateNIndecesFrom(1000000,list(twoMIndeces['Index']))
        trainIndeces.to_csv('train_ids.txt',index=False,header=False)

        remainingIndeces = twoMIndeces['Index'][~twoMIndeces['Index'].isin(trainIndeces.values)]
        validationIndeces = generateNIndecesFrom(250000,list(remainingIndeces))
        validationIndeces.to_csv('validation_ids.txt',index=False,header=False)

        testingIndeces = twoMIndeces['Index'][~(twoMIndeces['Index'].isin(trainIndeces.values) | twoMIndeces['Index'].isin(validationIndeces.values))]
        testingIndeces = generateNIndecesFrom(750000,list(testingIndeces))
        testingIndeces.to_csv('test_ids.txt',index=False,header=False)
    
    trainIndeces.columns = ['Index']
    validationIndeces.columns = ['Index']
    testingIndeces.columns = ['Index']

    # Generate the actual data files
    column_headers = getColumnHeaders()
    train1M = pd.DataFrame()
    train1M = generateSubSet(data_path,train1M,trainIndeces,4000000,46000000,column_headers)

    validation250k = pd.DataFrame()
    validation250k = generateSubSet(data_path,validation250k,validationIndeces,4000000,46000000,column_headers)

    test750k = pd.DataFrame()
    test750k = generateSubSet(data_path,test750k,testingIndeces,4000000,46000000,column_headers)
    
    
#     print(train1M.shape)
#     print(validation250k.shape)
#     print(test750k.shape)

    return train1M[train1M.columns[1:40]].values, train1M['label'].values, validation250k[validation250k.columns[1:40]].values, validation250k['label'].values, test750k[test750k.columns[1:40]].values, test750k['label'].values

def preprocess_int_data(data, features):
    n = len([f for f in features if f < 13])
#     print(n)
#     print(data)
    dataFrame = pd.DataFrame(data)
#     print(dataFrame.head())
    dataFrame[dataFrame.columns[0:13]] = dataFrame[dataFrame.columns[0:13]].fillna(0)
    dataFrame[dataFrame.columns[0:13]] = dataFrame[dataFrame.columns[0:13]].replace(-1,0)
    dataFrame[dataFrame.columns[0:13]] = dataFrame[dataFrame.columns[0:13]].replace(-2,0)
#     print(dataFrame[dataFrame.columns[0:13]])
    return np.zeros((data.shape[0], n))


def preprocess_cat_data(data, features, preprocess):
    print(data)
    return None


In [3]:
def check_output_features(fs):
    assert len(fs) < 39


def check_output_read_data(data, target, n):
    assert data.shape[0] == n
    assert target.shape[0] == n


def check_output_preprocess(preprocess):
    assert (preprocess == 'onehot') or (preprocess == 'rate') or (preprocess == 'tfidf')


def check_output_preprocess_int_data(data, fs):
    n = len([f for f in fs if f < 13])
    assert data.shape[1] == n


def check_output_preprocess_cat_data(data, fs, preprocess):
    pass


def read_features(path):
    features = []
    with open(path) as f:
        for line in f:
            features.append(int(line.strip()))
    return features


def read_preprocess(path):
    with open(path) as f:
        for line in f:
            return line.strip()


if __name__ == '__main__':
    data = './dac/train.txt'
    train = 'train_ids.txt'
    validation = 'validation_ids.txt'
    test = 'test_ids.txt'
    features = 'features.txt'
    preprocess = 'preprocess.txt'

    train_data, train_target, validation_data, validation_target, test_data, test_target = \
        read_data(data, train, validation, test)

    check_output_read_data(train_data, train_target, 1000000)
    check_output_read_data(validation_data, validation_target, 250000)
    check_output_read_data(test_data, test_target, 750000)

    

Extraction Stats: 750000 percent: 100.0%
Document Stats: 45840617 percent: 99.6535152173913%
(1000000, 41)
(250000, 41)
(750000, 41)


NameError: name 'options' is not defined

In [12]:
features = read_features(features)

check_output_features(features)

preprocess = read_preprocess(preprocess)

check_output_preprocess(preprocess)

TypeError: expected str, bytes or os.PathLike object, not list

In [15]:
train_int_data = preprocess_int_data(train_data, features)
validation_int_data = preprocess_int_data(validation_data, features)
test_int_data = preprocess_int_data(test_data, features)

check_output_preprocess_int_data(train_int_data, features)
check_output_preprocess_int_data(validation_int_data, features)
check_output_preprocess_int_data(test_int_data, features)

[[7.0 102 nan ..., '49045073' nan nan]
 [nan 0 17.0 ..., '43f13e8b' 'ea9a246c' '731c3655']
 [nan 1 5.0 ..., '3b183c5c' nan nan]
 ..., 
 [0.0 0 1.0 ..., '2e814ddc' 'e8b83407' '7bf15447']
 [0.0 218 7.0 ..., '8fc66e78' nan nan]
 [nan 839 2.0 ..., '3b183c5c' nan nan]]
    0    1    2  3      4    5  6   7   8    9     ...           29        30  \
0    7  102  NaN  3    780   15  7  15  15    1    ...     e5ba7672  5edd90de   
1  NaN    0   17  3  19811  NaN  0   3  54  NaN    ...     1e88c74f  b04e4670   
2  NaN    1    5  4   2931   36  2   6  62  NaN    ...     07c540c4  0f4a15b0   
3  NaN    0  NaN  0  40698  963  0   2  23  NaN    ...     d4bb7bd8  281769c2   
4  NaN   23   83  2    NaN  NaN  0   2   2  NaN    ...     2005abd1  f54016b9   

         31        32        33        34        35        36        37  \
0       NaN       NaN  e12ce348       NaN  c3dc6cef  49045073       NaN   
1  21ddcdc9  5840adea  60f6221e       NaN  32c7478e  43f13e8b  ea9a246c   
2       NaN       NaN  

          0     1     2     3         4       5     6     7       8    9   \
0       10.0     1  11.0  14.0     133.0    16.0  12.0  18.0   244.0  1.0   
1        0.0    54  13.0   9.0    3411.0    75.0   3.0  21.0   113.0  0.0   
2       13.0     1  13.0  13.0      48.0    19.0  13.0   8.0    13.0  2.0   
3        2.0   322   1.0   3.0      37.0     4.0   2.0   3.0     3.0  1.0   
4        0.0     1   0.0   0.0    2969.0   126.0   3.0  27.0    77.0  0.0   
5        0.0   598   1.0   1.0    6492.0   130.0   7.0   1.0    81.0  0.0   
6        0.0    22   5.0   3.0   33228.0    95.0   2.0   4.0    38.0  0.0   
7        0.0     0   2.0   2.0    9720.0     0.0   0.0   7.0     2.0  0.0   
8        0.0     1   1.0   0.0     158.0     0.0   0.0   0.0    19.0  0.0   
9        1.0     0  15.0  21.0     161.0   122.0   1.0  36.0   430.0  1.0   
10       0.0     2   5.0   2.0    1406.0     5.0  10.0   8.0     8.0  0.0   
11       0.0     2   3.0   3.0  265024.0     0.0   0.0   3.0    18.0  0.0   

In [None]:
train_cat_data = preprocess_cat_data(train_data, features, preprocess)
validation_cat_data = preprocess_cat_data(validation_data, features, preprocess)
test_cat_data = preprocess_cat_data(test_data, features, preprocess)

check_output_preprocess_cat_data(train_cat_data, features, preprocess)
check_output_preprocess_cat_data(validation_cat_data, features, preprocess)
check_output_preprocess_cat_data(test_cat_data, features, preprocess)