In [1]:
import numpy as np
import pandas as pd
import random
import re
import itertools
from IPython.display import clear_output
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

In [23]:
# Additional helper functions required for the code to run:
# This function defines the column headers for the data frames
def getColumnHeaders():
    return pd.Series(data=['label','integer_1','integer_2','integer_3',
                                 'integer_4','integer_5','integer_6','integer_7','integer_8','integer_9',
                                 'integer_10','integer_11','integer_12','integer_13','categorical_1',
                                 'categorical_2','categorical_3','categorical_4','categorical_5','categorical_6',
                                 'categorical_7','categorical_8','categorical_9','categorical_10','categorical_11',
                                 'categorical_12','categorical_13','categorical_14','categorical_15','categorical_16',
                                 'categorical_17','categorical_18','categorical_19','categorical_20','categorical_21',
                                 'categorical_22','categorical_23','categorical_24','categorical_25','categorical_26','Index'])

# this function generateNIndecesFrom takes a range of Indeces and randomly takes n of them, returns a pandas Series object
def generateNIndecesFrom(n, rangeOfIndeces):
    # print("Generating " + str(n) + " indeces from range")
    allIndeces = random.sample(rangeOfIndeces, n)
    allIndeces = pd.Series(data = allIndeces)
    allIndeces = allIndeces.sort_values().reset_index().drop(['index'],axis=1)
    allIndeces.columns = ['Index'];
    return allIndeces

# This function takes in a dataframe and computes statistics and histograms for all columns that are not 'label', 'Index' or 'Unnamed: 0'
# It was used in part 2.2 to generate the histograms and statistics.
# It can for example be placed at the end of the read_data method and be passed one of the datasets to compute its statistics and histograms
def generateSummaryStatsAndHists(train1M):
    SummaryStats = pd.DataFrame()
    for col in train1M.columns:
        if (col != 'label' and col != 'Index' and col != 'Unnamed: 0'):

            train1M[col].value_counts().plot(kind='hist',title=col, bins=100)
            plt.savefig(col)
            plt.show()
            plt.gcf().clear()
            if (train1M[col].dtype != 'O'):
                SummaryStats[col] = train1M[col].describe()   
    # SummaryStats.head()
    SummaryStats.to_csv('integerStats.csv')
    return SummaryStats


# the generateSubSet function takes in a file, a dataFrame to put the data in as well as index values which should be extracted,
# a number of rows per itteration (this is used to not overload the memory) , total number of rows in the file, the column headers for the new dataframe
def generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers):
    totalNumIterations = int(totalNumRows/numRowsPerItteration)
    # print("Number of itterations = " + str(totalNumIterations))
    totalNumRowsTraversed = 0
    prevsize = 0
    for i in range(totalNumIterations + 1):

        curData = pd.read_table(file,skiprows = i * numRowsPerItteration, nrows = numRowsPerItteration,header=None)
        curData.index = [i for i in range(i*numRowsPerItteration,i*numRowsPerItteration + curData.shape[0])]
        totalNumRowsTraversed = totalNumRowsTraversed + curData.shape[0]
        

        curData['Index'] = curData.index
        curData.columns = column_headers
        
        curIndexRange = indexValues['Index'][(indexValues['Index'] < (i*numRowsPerItteration + numRowsPerItteration)) & (indexValues['Index'] > (i*numRowsPerItteration-1))]
        curData = curData[curData['Index'].isin(list(curIndexRange))]
        
        dataFrame = pd.concat([dataFrame,curData])
        
        clear_output()
        print("Extraction Stats: " + str(dataFrame.shape[0]) + " percent: " + str(dataFrame.shape[0] / indexValues.shape[0] * 100) + "%")
        print("Document Stats: " + str(totalNumRowsTraversed) + " percent: " + str(totalNumRowsTraversed/totalNumRows*100) + "%")
        if (dataFrame.shape[0] - prevsize) > 500000:
            prevsize = dataFrame.shape[0]
      
    return dataFrame

# This method generates is a wrapper around the generateSubset to generate the subset and save the dataframe to a csv file (for being able to make use of it after)
def generateAndSaveSubset(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers,frameSaveName):
    dataFrame = generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers)
    dataFrame.to_csv(frameSaveName)
    return dataFrame

# This method generates the categorical data required to then apply one hot encoding on the entire dataset
def generateCategoricalData(train1M):
    #change to categorical
    for col in train1M.columns[14:40]:
        train1M[col] = train1M[col].astype('category')
        # add the dummy category
        train1M[col].cat.add_categories(new_categories = 'Dummy',inplace = True)
        categories = pd.Series(train1M[col].cat.categories)
        categories.to_csv(str(col)+'_features.csv',header = False)
        #save the categories for each column
        #then we can set the categegories for each column
        # and when we get dummies from pandas we have a one hot encoding that is consistent accross
        # -> get_dummies() method does one hot encoding

def scale
        
def read_data(data_path, train_path, validation_path, test_path):

    print(data_path)
    print(train_path)
    print(validation_path)
    print(test_path)
    
    #get the ids
    try:
        trainIndeces = pd.read_csv(train_path, header = None)
        validationIndeces = pd.read_csv(validation_path, header = None)
        testingIndeces = pd.read_csv(test_path, header = None)
    except:
        print("There were not 1000000 data points")
        trainIndeces = generateNIndecesFrom(1000000,list(twoMIndeces['Index']))
        trainIndeces.to_csv('train_ids.txt',index=False,header=False)

        remainingIndeces = twoMIndeces['Index'][~twoMIndeces['Index'].isin(trainIndeces.values)]
        validationIndeces = generateNIndecesFrom(250000,list(remainingIndeces))
        validationIndeces.to_csv('validation_ids.txt',index=False,header=False)

        testingIndeces = twoMIndeces['Index'][~(twoMIndeces['Index'].isin(trainIndeces.values) | twoMIndeces['Index'].isin(validationIndeces.values))]
        testingIndeces = generateNIndecesFrom(750000,list(testingIndeces))
        testingIndeces.to_csv('test_ids.txt',index=False,header=False)
    
    trainIndeces.columns = ['Index']
    validationIndeces.columns = ['Index']
    testingIndeces.columns = ['Index']

    # Generate the actual data files
    column_headers = getColumnHeaders()
    train1M = pd.DataFrame()
    train1M = generateSubSet(data_path,train1M,trainIndeces,4000000,46000000,column_headers)
    
    generateCategoricalData(train1M)
    
    validation250k = pd.DataFrame()
    validation250k = generateSubSet(data_path,validation250k,validationIndeces,4000000,46000000,column_headers)

    test750k = pd.DataFrame()
    test750k = generateSubSet(data_path,test750k,testingIndeces,4000000,46000000,column_headers)
    
    
#     print(train1M.shape)
#     print(validation250k.shape)
#     print(test750k.shape)

    return train1M[train1M.columns[1:40]].values, train1M['label'].values, validation250k[validation250k.columns[1:40]].values, validation250k['label'].values, test750k[test750k.columns[1:40]].values, test750k['label'].values

def preprocess_int_data(data, features):
    n = len([f for f in features if f < 13])
#     print(features)
#     print(data)
    dataFrame = pd.DataFrame(data)
#     print(dataFrame.head())
    dataFrame[dataFrame.columns[0:13]] = dataFrame[dataFrame.columns[0:13]].fillna(0)
    dataFrame[dataFrame.columns[0:13]] = dataFrame[dataFrame.columns[0:13]].replace(-1,0)
    dataFrame[dataFrame.columns[0:13]] = dataFrame[dataFrame.columns[0:13]].replace(-2,0)
    dataFrame[dataFrame.columns[0:13]] = scale(dataFrame[dataFrame.columns[0:13]])
#     print(dataFrame[dataFrame.columns[0:13]])
    return np.zeros((data.shape[0], n))


def preprocess_cat_data(data, features, preprocess):
    print(data)
    print(features)
    dataFrame = pd.DataFrame(data)
    print(dataFrame[dataFrame.columns[13:39]])
    # Change each column in the 13-39 into categorical
    for col in dataFrame.columns[14:40]:
        dataFrame[col] = dataFrame[col].astype('category')
        #reset the categories to the ones for that column
        curFeatures = pd.read_csv(str(col) + "_features.csv",header = None,index_col = 0)
        print(curFeatures.values)
#     dataFrame[dataFrame.columns[13:39]] = dataFrame[dataFrame.columns[13:39]].fillna('Dummy')
    return None


In [3]:
def check_output_features(fs):
    assert len(fs) < 39


def check_output_read_data(data, target, n):
    assert data.shape[0] == n
    assert target.shape[0] == n


def check_output_preprocess(preprocess):
    assert (preprocess == 'onehot') or (preprocess == 'rate') or (preprocess == 'tfidf')


def check_output_preprocess_int_data(data, fs):
    n = len([f for f in fs if f < 13])
    assert data.shape[1] == n


def check_output_preprocess_cat_data(data, fs, preprocess):
    pass


def read_features(path):
    features = []
    with open(path) as f:
        for line in f:
            features.append(int(line.strip()))
    return features


def read_preprocess(path):
    with open(path) as f:
        for line in f:
            return line.strip()    

In [4]:
data = './dac/train.txt'
train = 'train_ids.txt'
validation = 'validation_ids.txt'
test = 'test_ids.txt'
features = 'features.txt'
preprocess = 'preprocess.txt'

In [5]:


train_data, train_target, validation_data, validation_target, test_data, test_target = \
    read_data(data, train, validation, test)

check_output_read_data(train_data, train_target, 1000000)
check_output_read_data(validation_data, validation_target, 250000)
check_output_read_data(test_data, test_target, 750000)

Extraction Stats: 750000 percent: 100.0%
Document Stats: 45840617 percent: 99.6535152173913%


In [6]:
features = read_features(features)

check_output_features(features)

preprocess = read_preprocess(preprocess)

check_output_preprocess(preprocess)

In [9]:
train_int_data = preprocess_int_data(train_data, features)
validation_int_data = preprocess_int_data(validation_data, features)
test_int_data = preprocess_int_data(test_data, features)

check_output_preprocess_int_data(train_int_data, features)
check_output_preprocess_int_data(validation_int_data, features)
check_output_preprocess_int_data(test_int_data, features)

In [24]:
train_cat_data = preprocess_cat_data(train_data, features, preprocess)
validation_cat_data = preprocess_cat_data(validation_data, features, preprocess)
test_cat_data = preprocess_cat_data(test_data, features, preprocess)

check_output_preprocess_cat_data(train_cat_data, features, preprocess)
check_output_preprocess_cat_data(validation_cat_data, features, preprocess)
check_output_preprocess_cat_data(test_cat_data, features, preprocess)

[[7.0 102 nan ..., '49045073' nan nan]
 [nan 0 17.0 ..., '43f13e8b' 'ea9a246c' '731c3655']
 [nan 1 5.0 ..., '3b183c5c' nan nan]
 ..., 
 [0.0 0 1.0 ..., '2e814ddc' 'e8b83407' '7bf15447']
 [0.0 218 7.0 ..., '8fc66e78' nan nan]
 [nan 839 2.0 ..., '3b183c5c' nan nan]]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 5, 6, 7, 9, 11, 13, 14, 15, 17, 18, 20, 22, 23, 25]
              13        14        15        16        17        18        19  \
0       3c9d8785  b0660259  3a960356  15c92ddb  4cf72387  13718bbd  00c46cd1   
1       05db9164  f0cf0024  6f67f7e5  41274cd7  25c83c98  fbad5c96  9b6a4cc9   
2       05db9164  791f3f76  d032c263  c18be181  384874ce  7e0ccccf  fe4dce68   
3       05db9164  d833535f  ad4b77ff  d16679b9  25c83c98  7e0ccccf  41e6f3d3   
4       8cf07265  80e26c9b  0f8b497f  8d0c7214  25c83c98  7e0ccccf  bfa1a33f   
5       05db9164  3e4b7926  7442ec70  bb8645c3  25c83c98  fe6b92e5  a4ca48a1   
6       05db9164  80e26c9b  e399972d  b2e9811a  25c83c98  fe6b92e5  6772d022

FileNotFoundError: File b'14_features.csv' does not exist