In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import random
import re
import itertools
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib 


from sys import getsizeof
from IPython.display import clear_output

In [1]:
# Additional helper functions required for the code to run:
# This function defines the column headers for the data frames
def getColumnHeaders():
    """Return the column headers for the data frames as a pandas Series.

    Order: 'label', 'integer_1'..'integer_13', 'categorical_1'..'categorical_26',
    and finally 'Index' (41 names in total).
    """
    integerNames = ['integer_{}'.format(k) for k in range(1, 14)]
    categoricalNames = ['categorical_{}'.format(k) for k in range(1, 27)]
    return pd.Series(data=['label'] + integerNames + categoricalNames + ['Index'])

def getDataHeaders():
    """Return the feature column headers (no 'label', no 'Index') as a pandas Series.

    Order: 'integer_1'..'integer_13' followed by 'categorical_1'..'categorical_26'
    (39 names in total).
    """
    names = []
    for prefix, count in (('integer_', 13), ('categorical_', 26)):
        names.extend(prefix + str(k) for k in range(1, count + 1))
    return pd.Series(data=names)

# generateNIndecesFrom randomly samples n values (without replacement) from rangeOfIndeces and returns them, sorted ascending, as a one-column pandas DataFrame whose column is named 'Index'
def generateNIndecesFrom(n, rangeOfIndeces):
    """Draw n values from rangeOfIndeces with random.sample and return them sorted.

    Returns a DataFrame with a single column 'Index' holding the sampled values in
    ascending order, indexed 0..n-1.
    """
    sampled = pd.Series(data=random.sample(rangeOfIndeces, n))
    ordered = sampled.sort_values().reset_index(drop=True)
    return ordered.to_frame(name='Index')

# This function takes in a dataframe and computes statistics and histograms for all columns that are not 'label', 'Index' or 'Unnamed: 0'
# It was used in part 2.2 to generate the histograms and statistics.
# It can for example be placed at the end of the read_data method and be passed one of the datasets to compute its statistics and histograms
def generateSummaryStatsAndHists(train1M):
    """Save per-column histograms and write describe() statistics to 'integerStats.csv'.

    For every column other than 'label', 'Index' and 'Unnamed: 0', the value counts of
    the rows with label 0 and with label 1 are each plotted as a 100-bin histogram on
    the same figure, which is saved under the column's name and then cleared.
    Columns whose dtype is not object also contribute their describe() output to the
    returned DataFrame.
    """
    excluded = ('label', 'Index', 'Unnamed: 0')
    SummaryStats = pd.DataFrame()
    for col in train1M.columns:
        if col in excluded:
            continue

        # One histogram per label value, drawn onto the current figure.
        for labelValue, legendText in ((0, '0s'), (1, '1s')):
            counts = train1M[col][train1M['label'] == labelValue].value_counts()
            counts.plot(kind='hist', title=col, bins=100, label=legendText)
        plt.legend(loc='upper right')
        plt.savefig(col)
        plt.gcf().clear()

        column = train1M[col]
        if column.dtype != 'O':
            SummaryStats[col] = column.describe()

    SummaryStats.to_csv('integerStats.csv')
    return SummaryStats


# generateSubSet reads `file` in chunks of numRowsPerItteration rows (so the whole file never has to fit in memory) and appends to dataFrame
# the rows whose row number appears in indexValues['Index']; totalNumRows is the total number of rows in the file and column_headers names the columns of the new dataframe
def generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers):
    """Extract the rows of a tab-separated file whose row number is listed in indexValues.

    The file is streamed once in chunks of numRowsPerItteration rows. (The previous
    implementation re-opened the file and skipped all earlier rows on every iteration,
    and raised EmptyDataError when totalNumRows was an exact multiple of the chunk size.)

    Parameters
    ----------
    file : path passed to pd.read_table (read with header=None).
    dataFrame : DataFrame the extracted rows are appended to (usually empty).
    indexValues : DataFrame with an 'Index' column of 0-based row numbers to keep.
    numRowsPerItteration : number of rows per chunk.
    totalNumRows : number of rows to consider; at most
        int(totalNumRows / numRowsPerItteration) + 1 chunks are read.
    column_headers : names assigned to each chunk's columns after an 'Index'
        column (the row number) has been appended.

    Returns
    -------
    DataFrame made of dataFrame followed by the selected rows, indexed by row number.
    A progress line is printed after every chunk.
    """
    totalNumIterations = int(totalNumRows/numRowsPerItteration)
    wantedIndeces = indexValues['Index']
    numWanted = indexValues.shape[0]

    # Collect the pieces and concatenate once at the end instead of growing the frame
    # inside the loop. A completely empty starting frame contributes nothing, so skip it.
    pieces = []
    if dataFrame.shape[0] > 0 or dataFrame.shape[1] > 0:
        pieces.append(dataFrame)
    numExtracted = dataFrame.shape[0]

    reader = pd.read_table(file, chunksize=numRowsPerItteration, header=None)
    try:
        # zip() stops at whichever ends first: the chunk budget or the end of the file.
        for i, curData in zip(range(totalNumIterations + 1), reader):
            firstRow = i * numRowsPerItteration
            curData.index = range(firstRow, firstRow + curData.shape[0])

            curData['Index'] = curData.index
            curData.columns = column_headers

            # Only look up the wanted ids that can fall inside this chunk.
            inChunk = (wantedIndeces >= firstRow) & (wantedIndeces < firstRow + numRowsPerItteration)
            curData = curData[curData['Index'].isin(wantedIndeces[inChunk])]

            pieces.append(curData)
            numExtracted += curData.shape[0]

            # Guard the percentage against an empty id list.
            percent = numExtracted / numWanted * 100 if numWanted else 100.0
            print("Extraction Stats: " + str(numExtracted) + " percent: " + str(percent) + "%")
    finally:
        reader.close()

    if not pieces:
        return dataFrame
    return pd.concat(pieces)

# This method is a wrapper around generateSubSet: it generates the subset and saves the resulting dataframe to a csv file (so it can be reused later)
def generateAndSaveSubset(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers,frameSaveName):
    """Run generateSubSet with the given arguments, write the result to frameSaveName as CSV, and return it."""
    extracted = generateSubSet(
        file,
        dataFrame,
        indexValues,
        numRowsPerItteration,
        totalNumRows,
        column_headers,
    )
    extracted.to_csv(frameSaveName)
    return extracted

# This method generates the categorical data required to then apply one hot encoding on the entire dataset
def generateCategoricalData(train1M):
    """Convert columns 14..39 of train1M to categoricals and save each column's category list.

    For every column in train1M.columns[14:40] (modified in place on train1M):
      * the column is cast to 'category';
      * the mean of its value counts is printed;
      * an extra 'Dummy' category is appended (skipped if it already exists);
      * the resulting categories are written to '<column>_features.csv'
        (index + value per line, no header).

    The saved lists can later be used to give other frames the same categories so
    that a one-hot encoding is consistent across them.
    """
    for col in train1M.columns[14:40]:
        asCategory = train1M[col].astype('category')

        print(asCategory.value_counts().mean())

        # add the dummy category; assign the result instead of using inplace=True,
        # which is no longer accepted by current pandas
        if 'Dummy' not in asCategory.cat.categories:
            asCategory = asCategory.cat.add_categories('Dummy')
        train1M[col] = asCategory

        # save the categories for each column
        categories = pd.Series(asCategory.cat.categories)
        categories.to_csv(str(col)+'_features.csv',header = False)

# This method takes the training set, fits a scaler to its integer columns, and then saves
# the fitted scaler to a file for future retrieval.
def preProcessIntsAndSave(dataFrame,fileName):
    """Clean columns 1..13 of dataFrame in place, fit a StandardScaler on them and dump it.

    Missing values and the values -1 and -2 in dataFrame.columns[1:14] are replaced
    by 0 (the caller's frame is modified). The fitted scaler is written to fileName
    with joblib.dump. Returns None.
    """
    intCols = dataFrame.columns[1:14]

    cleaned = dataFrame[intCols].fillna(0)
    for sentinel in (-1, -2):
        cleaned = cleaned.replace(sentinel, 0)
    dataFrame[intCols] = cleaned

    curScaler = StandardScaler().fit(dataFrame[intCols])
    joblib.dump(curScaler, fileName)
    return
        
def read_data(data_path, train_path, validation_path, test_path):
    """Load the train / validation / test splits of the raw data file.

    Parameters
    ----------
    data_path : tab-separated raw data file, read through generateSubSet.
    train_path, validation_path, test_path : header-less CSV files holding one row
        number per line. If they cannot be read they are regenerated (1000000 /
        250000 / 750000 ids) and written back to these same paths.

    Side effects on the training split: generateSummaryStatsAndHists,
    generateCategoricalData and preProcessIntsAndSave(..., 'scalerPickle.pkl') are run
    on it before the other splits are read.
    NOTE(review): the last two modify the training frame in place, so the returned
    training features have presumably been cleaned/converted while validation and test
    have not — confirm this is intended.

    Returns
    -------
    (train_data, train_target, validation_data, validation_target, test_data, test_target)
    where each *_data is the .values of columns 1..39 and each *_target the 'label' values.
    """
    print(data_path)
    print(train_path)
    print(validation_path)
    print(test_path)

    #get the ids
    try:
        trainIndeces = pd.read_csv(train_path, header = None)
        validationIndeces = pd.read_csv(validation_path, header = None)
        testingIndeces = pd.read_csv(test_path, header = None)
    except (OSError, ValueError) as readError:
        # OSError covers missing/unreadable files; pandas' empty-file and parser errors
        # derive from ValueError. Anything else is a real bug and should surface.
        print("There were not 1000000 data points")
        print("Could not load the id files (" + str(readError) + "); regenerating them")

        # NOTE(review): the candidate pool comes from a global named twoMIndeces that
        # this notebook never defines; fail with an explicit message if it is missing.
        try:
            candidateIndeces = twoMIndeces['Index']
        except NameError:
            raise NameError("read_data cannot regenerate the id files: the global "
                            "'twoMIndeces' (frame with an 'Index' column of candidate "
                            "row numbers) is not defined")

        trainIndeces = generateNIndecesFrom(1000000,list(candidateIndeces))
        trainIndeces.to_csv(train_path,index=False,header=False)

        remainingIndeces = candidateIndeces[~candidateIndeces.isin(trainIndeces['Index'])]
        validationIndeces = generateNIndecesFrom(250000,list(remainingIndeces))
        validationIndeces.to_csv(validation_path,index=False,header=False)

        alreadyUsed = candidateIndeces.isin(trainIndeces['Index']) | candidateIndeces.isin(validationIndeces['Index'])
        testingIndeces = generateNIndecesFrom(750000,list(candidateIndeces[~alreadyUsed]))
        testingIndeces.to_csv(test_path,index=False,header=False)

    trainIndeces.columns = ['Index']
    validationIndeces.columns = ['Index']
    testingIndeces.columns = ['Index']

    # Generate the actual data files
    column_headers = getColumnHeaders()
    rowsPerChunk = 4000000
    totalRows = 46000000

    train1M = generateSubSet(data_path,pd.DataFrame(),trainIndeces,rowsPerChunk,totalRows,column_headers)

    generateSummaryStatsAndHists(train1M)
    generateCategoricalData(train1M)
    preProcessIntsAndSave(train1M,'scalerPickle.pkl')

    validation250k = generateSubSet(data_path,pd.DataFrame(),validationIndeces,rowsPerChunk,totalRows,column_headers)
    test750k = generateSubSet(data_path,pd.DataFrame(),testingIndeces,rowsPerChunk,totalRows,column_headers)

    # Columns 1..39 are the features; column 0 is 'label' and column 40 is 'Index'.
    return (train1M[train1M.columns[1:40]].values, train1M['label'].values,
            validation250k[validation250k.columns[1:40]].values, validation250k['label'].values,
            test750k[test750k.columns[1:40]].values, test750k['label'].values)

def preprocess_int_data(data, features, scaler=None):
    """Clean and standardise the integer feature columns selected by `features`.

    Parameters
    ----------
    data : 2-D numpy array whose columns 0..12 are the integer features.
    features : iterable of column indices; only those < 13 are used, in the given order.
    scaler : optional fitted scaler exposing mean_, scale_ and transform(). When
        omitted, the scaler saved in 'scalerPickle.pkl' is loaded with joblib.

    Returns
    -------
    2-D float array with one column per selected integer feature. Missing values and
    the values -1 and -2 are set to 0 before scaling.

    Fixes over the previous version: columns are addressed by position in the
    selection (not by feature id, which raised IndexError or picked the wrong column
    for selections such as [2, 5]), and when only a subset of the 13 integer columns
    is selected the matching entries of the scaler's mean_/scale_ are applied instead
    of calling transform() on a frame with the wrong number of columns.
    """
    intFeatures = [f for f in features if f < 13]

    frame = pd.DataFrame(data[:, intFeatures]).astype(float)
    # Same names the original headers give these columns ('integer_1' is column 0).
    frame.columns = ['integer_{}'.format(f + 1) for f in intFeatures]
    frame = frame.fillna(0).replace([-1, -2], 0)

    if scaler is None:
        # NOTE(review): joblib.load unpickles the file; only load pickles you created.
        scaler = joblib.load('scalerPickle.pkl')

    # All fitted columns, in fitted order: use the scaler as-is.
    if intFeatures == list(range(len(scaler.mean_))):
        return scaler.transform(frame)

    # Subset (or reordered) selection: standardise with the matching statistics.
    picked = np.asarray(intFeatures, dtype=int)
    return (frame.values - scaler.mean_[picked]) / scaler.scale_[picked]


def preprocess_cat_data(data, features, preprocess):
    """Keep the columns of `data` listed in `features` and prepare the categorical ones.

    data is wrapped in a DataFrame (so columns are labelled by position). Columns not
    present in `features` are dropped. For kept columns with label > 12 the column is
    cast to 'category', a 'Dummy' category is added, missing values are filled with
    'Dummy', and a sparse one-hot frame is computed and appended to returnFrame.

    Returns dataFrame.values, i.e. the kept columns after the steps above.

    NOTE(review): `preprocess` is never read in this function.
    NOTE(review): returnFrame (the one-hot encoding) is built but not returned —
    confirm whether the return value was meant to be the encoded frame.
    """
#     print(features)
#     print(preprocess)
#     print(data)
    dataFrame = pd.DataFrame(data)
#     print(dataFrame[dataFrame.columns[13:39]])
    # Change each column in the 13-39 into categorical
#     dataFrame.columns = getDataHeaders()
    
    # Accumulates the one-hot columns of every processed categorical feature.
    returnFrame = pd.DataFrame()
    # drop the cols that are not in the features vector
    for col in dataFrame.columns:
        # Linear search of features for this column label.
        foundCol = False
        for f in features:
            if (f == col):
                foundCol = True
                
        if foundCol == False:
            dataFrame.drop(col,inplace=True,axis=1)
        else:    
            # Column labels above 12 are treated as categorical; label 13 maps to
            # "categorical_1_features.csv", hence the col-12 below.
            if (col > 12):
                print(col)
#                 print(dataFrame[col].dtype)
                dataFrame[col] = dataFrame[col].astype('category')
                # Category list previously saved for this column (first CSV column is the index).
                curFeatures = pd.read_csv("categorical_" + str(col-12) + "_features.csv",header = None,index_col = 0)
#                 print(curFeatures.values)
                # NOTE(review): the result of set_categories is discarded and no inplace
                # flag is passed, so this line looks like a no-op — confirm intent.
                dataFrame[col].cat.set_categories(curFeatures.values)
                # NOTE(review): relies on the inplace= argument of add_categories, which
                # newer pandas releases no longer accept — verify against the installed version.
                dataFrame[col].cat.add_categories(new_categories = 'Dummy',inplace = True)
                dataFrame[col] = dataFrame[col].fillna('Dummy')
#                 print(dataFrame[col].dtype)
# #                 print(dataFrame[col].cat.categories)
                onehotVals = pd.get_dummies(dataFrame[col],prefix='encoded',sparse=True)
#                 print(onehotVals.info())
                returnFrame = pd.concat([returnFrame, onehotVals],axis=1)
    
                # Drop the local reference to this column's one-hot frame.
                del onehotVals
        
                print("Got 1hot for " + str(col))
#                 print(returnFrame.info())
#                 return
    
#     print(dataFrame.head())
#     print(dataFrame.info())
#     print(returnFrame.head())
    
#     for col in dataFrame.columns[13:39]:
#         dataFrame[col] = dataFrame[col].astype('category')
#         #reset the categories to the ones for that column
#         
        
#         dataFrame[col].cat.set_categories(curFeatures.values)
#         pd.get_dummies(train1M[col],prefix=['encoded'],sparse=True)
#     dataFrame[dataFrame.columns[13:39]] = dataFrame[dataFrame.columns[13:39]].fillna('Dummy')
    return dataFrame.values


In [7]:
def check_output_features(fs):
    """Assert that fewer than 39 features were supplied."""
    featureCount = len(fs)
    assert featureCount < 39


def check_output_read_data(data, target, n):
    """Assert that data and target both have exactly n rows (first shape entry)."""
    for loaded in (data, target):
        assert loaded.shape[0] == n


def check_output_preprocess(preprocess):
    """Assert that preprocess is one of 'onehot', 'rate' or 'tfidf'."""
    assert preprocess in ('onehot', 'rate', 'tfidf')


def check_output_preprocess_int_data(data, fs):
    """Assert that data has one column per feature id in fs that is below 13."""
    expectedWidth = sum(1 for f in fs if f < 13)
    assert data.shape[1] == expectedWidth


def check_output_preprocess_cat_data(data, fs, preprocess):
    """Placeholder check: performs no validation and returns None."""
    return None


def read_features(path):
    """Read one integer feature id per line from the text file at `path`.

    Lines that are empty or contain only whitespace (for example a trailing blank
    line) are skipped instead of raising ValueError. Returns the ids as a list of
    ints in file order.
    """
    with open(path) as f:
        return [int(line) for line in f if line.strip()]


def read_preprocess(path):
    """Return the first line of the file at `path` with surrounding whitespace removed.

    Returns None when the file has no lines at all.
    """
    with open(path) as f:
        firstLine = next(f, None)
    if firstLine is None:
        return None
    return firstLine.strip()

In [8]:
# File locations used by the cells below.
# Raw data file handed to read_data.
data = './dac/train.txt'
# Row-id files for the three splits.
train, validation, test = 'train_ids.txt', 'validation_ids.txt', 'test_ids.txt'
# Feature list and preprocessing mode, parsed later by read_features / read_preprocess.
features, preprocess = 'features.txt', 'preprocess.txt'

In [None]:


# Load the three splits, then verify each one has the expected number of rows.
(train_data, train_target,
 validation_data, validation_target,
 test_data, test_target) = read_data(data, train, validation, test)

check_output_read_data(train_data, train_target, n=1000000)
check_output_read_data(validation_data, validation_target, n=250000)
check_output_read_data(test_data, test_target, n=750000)

./dac/train.txt
train_ids.txt
validation_ids.txt
test_ids.txt
Extraction Stats: 87443 percent: 8.7443%
Extraction Stats: 174728 percent: 17.4728%
Extraction Stats: 262233 percent: 26.2233%
Extraction Stats: 349388 percent: 34.9388%
Extraction Stats: 436039 percent: 43.6039%
Extraction Stats: 523389 percent: 52.3389%
Extraction Stats: 610528 percent: 61.0528%
Extraction Stats: 698062 percent: 69.80619999999999%
Extraction Stats: 785627 percent: 78.56269999999999%
Extraction Stats: 872569 percent: 87.2569%
Extraction Stats: 960224 percent: 96.02239999999999%
Extraction Stats: 1000000 percent: 100.0%
05db9164    500035
68fd1e64    167225
5a9ed9b0     83500
8cf07265     49364
be589b51     33431
5bfa8ab5     24142
87552397     17745
f473b8dc     13945
39af2607     11168
ae82ea21      8825
9a89b36c      7753
17f69355      6441
241546e0      5320
09ca0b81      4780
75ac2fe6      4115
41edac3d      3555
439a44a4      3459
7e5c2ff4      2984
b455c6d7      2508
1464facd      2438
fbc55dae      2

a73ee510    898969
7cc72ec2    100869
a18233ea       162
Name: categorical_9, dtype: int64
3b08e48b    220715
efea433b     14690
fbbf2c95      7682
fa7d0797      5690
03e48276      5280
451bd4e4      5021
5ba575e7      4557
49d1ad89      4149
5162b19c      3969
935a36f0      3857
f6f942d1      3752
015ac893      3752
305a0646      3671
711ec2bc      3532
299aecf1      3396
31990058      3393
000e2f4b      2909
67eea4ef      2816
474773a7      2675
6c47047a      2663
e7ba2569      2591
547c0ffe      2568
d2f1c80c      2299
7259dc52      2267
0e9ead52      2215
acccca1c      2170
2462946f      2123
f9065d00      2115
dcbc7c2b      2061
a1ee64a6      1988
             ...  
f23673ed         1
f224ce53         1
f22ce80d         1
793c49f1         1
79382ffe         1
7928dfca         1
79215586         1
79161733         1
7915899a         1
7912b76d         1
f2345683         1
78ff230a         1
78fe044b         1
78f2e9a9         1
78ed0c14         1
78b194bb         1
78e9bdd2        

21ddcdc9    344666
55dd3565     19569
5b885066      8007
9437f62f      7320
712d530c      5815
cf99e5de      5815
1d1eb838      5106
efa3470f      4028
1d04f4a4      2972
4764bf77      2900
3014a4b1      2742
a153cea2      2478
6f3756eb      2403
04de9d96      2260
9653bb65      2172
338f20de      2007
f30f7842      1933
064f1f80      1865
2b558521      1780
a34d2cf6      1712
e3b5ceb7      1673
083e89d9      1670
26e97973      1566
7a45f7f2      1483
315ba0e1      1409
2f4b9dd2      1367
a18beb4f      1344
0053530c      1337
b87498e2      1238
92524a76      1235
             ...  
a6b21933         1
a4558105         1
5aae8e01         1
ec8f6ac4         1
ec51e9ca         1
7ee7a9c5         1
193b7677         1
4595d922         1
b7c877e8         1
b7c5e2e5         1
b7172f2f         1
465ce4be         1
e8339ce4         1
47e4f12b         1
e8ecdd1e         1
1a6e3a12         1
ead57433         1
b4a40568         1
171b30d4         1
eb0a0811         1
eb14c5ef         1
18581340    

In [None]:
# # backup
# train_data2 = train_data
# train_target2 = train_target
# validation_data2 = validation_data
# validation_target2 = validation_target
# test_data2 = test_data
# test_target2 = test_target

In [None]:
# Drop the "*2" backup copies if they exist. They are only created by the optional
# (commented-out) backup cell, so a plain `del` raises NameError on a fresh kernel;
# popping from the namespace makes this cell safe to run either way.
for _backupName in ('train_data2', 'train_target2',
                    'validation_data2', 'validation_target2',
                    'test_data2', 'test_target2'):
    globals().pop(_backupName, None)
del _backupName

In [None]:
# joblib.dump(train_data, 'train_data')
# joblib.dump(train_target, 'train_target')
# joblib.dump(validation_data, 'validation_data')
# joblib.dump(validation_target, 'validation_target')
# joblib.dump(test_data, 'test_data')
# joblib.dump(test_target, 'test_target')

In [None]:
# #restore
# train_data = train_data2
# train_target = train_target2
# validation_data = validation_data2
# validation_target = validation_target2
# test_data = test_data2
# test_target = test_target2

In [None]:
# Reload the six data/target arrays from joblib files named after the variables.
# Each assignment replaces whatever the read_data() cell left in that name.
# NOTE(review): the matching joblib.dump(...) calls sit in a commented-out cell of this
# notebook, so these files presumably have to exist on disk already — confirm before
# running the notebook top to bottom.
# NOTE(review): joblib.load unpickles its input; only load files you produced yourself.
train_data = joblib.load('train_data')
train_target = joblib.load('train_target')
validation_data = joblib.load('validation_data')
validation_target = joblib.load('validation_target')
test_data = joblib.load('test_data')
test_target = joblib.load('test_target')

In [None]:
# `features` and `preprocess` start out as file paths and are replaced here by the
# parsed contents of those files. Parse only while they still hold a path, so that
# re-running this cell does not try to open the parsed value as a file.
if isinstance(features, str):
    features = read_features(features)

check_output_features(features)

# The accepted modes are the ones check_output_preprocess allows; a value that is
# already one of them has been parsed by an earlier run of this cell.
if preprocess not in ('onehot', 'rate', 'tfidf'):
    preprocess = read_preprocess(preprocess)

check_output_preprocess(preprocess)

In [None]:
# Scale the integer features of every split, then check the resulting widths.
train_int_data = preprocess_int_data(train_data, features)
validation_int_data = preprocess_int_data(validation_data, features)
test_int_data = preprocess_int_data(test_data, features)

for _scaled in (train_int_data, validation_int_data, test_int_data):
    check_output_preprocess_int_data(_scaled, features)
del _scaled

In [None]:
# Total of sys.getsizeof over all arrays currently held for the three splits.
# NOTE(review): getsizeof may under-report arrays that do not own their buffer;
# consider summing .nbytes if an accurate figure is needed.
sum(getsizeof(held) for held in (
    train_data, train_target,
    validation_data, validation_target,
    test_data, test_target,
    train_int_data, validation_int_data, test_int_data,
))

In [None]:
# Prepare the categorical features of every split, then run the (placeholder) checks.
train_cat_data = preprocess_cat_data(train_data, features, preprocess)
validation_cat_data = preprocess_cat_data(validation_data, features, preprocess)
test_cat_data = preprocess_cat_data(test_data, features, preprocess)

for _encoded in (train_cat_data, validation_cat_data, test_cat_data):
    check_output_preprocess_cat_data(_encoded, features, preprocess)
del _encoded