In [23]:
import pandas as pd
import random
import re
from IPython.display import clear_output
import itertools
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#create an index file for the huge data file
def createIndex(filePath):
    """Write a line-number -> byte-offset index for a large text file.

    The index is written to ``<filePath>_index.csv``. Each row has the form
    ``lineNumber,offset`` where ``offset`` is the position of the first byte
    of that line in the data file.

    Parameters
    ----------
    filePath : str
        Path of the data file to index.

    Returns
    -------
    str
        Path of the index file that was written.
    """
    indexFilePath = filePath + "_index.csv"

    # The data file is read in binary mode so that len(line) counts bytes.
    # In text mode it counts decoded characters (and "\r\n" may be collapsed
    # to "\n"), which yields offsets that are wrong for seek() as soon as the
    # file contains non-ASCII text or Windows line endings.
    # Both handles are opened once and closed by the context manager, even if
    # the scan is interrupted by an error.
    with open(filePath, 'rb') as dataFile, open(indexFilePath, 'w') as indexFile:
        offset = 0
        for lineNumber, line in enumerate(dataFile):
            indexFile.write(str(lineNumber) + "," + str(offset) + "\n")
            offset += len(line)
    return indexFilePath

In [3]:
def generateNIndecesFrom(n, rangeOfIndeces):
    """Sample ``n`` entries from ``rangeOfIndeces`` and return them sorted.

    Sampling is done without replacement via ``random.sample``. The result is
    a one-column DataFrame (column ``'Index'``) in ascending order with a
    fresh 0..n-1 row index.
    """
    print("Generating " + str(n) + " indeces from range")
    picked = pd.Series(data=random.sample(rangeOfIndeces, n))
    # Sort the sampled values and discard the pre-sort positions.
    ordered = picked.sort_values().reset_index(drop=True)
    result = ordered.to_frame()
    result.columns = ['Index']
    return result

In [4]:
def generateAndSaveSubset(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers,frameSaveName):
    """Extract the requested rows with ``generateSubSet`` and save them as CSV.

    All arguments except ``frameSaveName`` are forwarded unchanged to
    ``generateSubSet``; the extracted frame is written to ``frameSaveName``
    with ``DataFrame.to_csv`` and also returned to the caller.
    """
    subset = generateSubSet(
        file,
        dataFrame,
        indexValues,
        numRowsPerItteration,
        totalNumRows,
        column_headers,
    )
    subset.to_csv(frameSaveName)
    return subset

In [5]:
def _percentOf(part, whole):
    """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
    return part / whole * 100 if whole else 0.0


def generateSubSet(file,dataFrame,indexValues,numRowsPerItteration,totalNumRows,column_headers):
    """Collect the rows of a tab-separated file whose row numbers were requested.

    The file is read in chunks of ``numRowsPerItteration`` rows. Each chunk is
    labelled with its absolute 0-based row numbers (stored in an extra last
    column), renamed with ``column_headers``, and filtered down to the rows
    listed in ``indexValues['Index']``.

    Parameters
    ----------
    file : str
        Path of the headerless, tab-separated data file.
    dataFrame : pandas.DataFrame
        Frame the selected rows are appended to (may be empty).
    indexValues : pandas.DataFrame
        Frame with an ``'Index'`` column holding the wanted row numbers.
    numRowsPerItteration : int
        Number of rows read per chunk.
    totalNumRows : int
        Estimated number of rows in ``file``; bounds the number of chunks and
        is used for the progress output.
    column_headers : sequence of str
        Labels for the file's columns plus one final label for the appended
        row-number column. That final label must be ``'Index'``.

    Returns
    -------
    pandas.DataFrame
        ``dataFrame`` followed by the selected rows, in file order.
    """
    totalNumIterations = int(totalNumRows/numRowsPerItteration)
    print("Number of itterations = " + str(totalNumIterations))

    wantedIndex = indexValues['Index']
    numWanted = indexValues.shape[0]
    numCollected = dataFrame.shape[0]
    totalNumRowsTraversed = 0
    # Filtered chunks are gathered here and concatenated once at the end
    # instead of re-copying the growing frame on every iteration.
    selectedChunks = []

    for i in range(totalNumIterations + 1):
        chunkStart = i * numRowsPerItteration
        chunkEnd = chunkStart + numRowsPerItteration
        try:
            curData = pd.read_table(file, skiprows=chunkStart, nrows=numRowsPerItteration, header=None)
        except pd.errors.EmptyDataError:
            # totalNumRows is only an estimate: when the chunk starts at or
            # beyond the end of the file there is nothing left to parse, so
            # stop and keep what was collected rather than failing.
            break

        # Label every row with its absolute position in the file.
        curData.index = range(chunkStart, chunkStart + curData.shape[0])
        totalNumRowsTraversed = totalNumRowsTraversed + curData.shape[0]
        curData['Index'] = curData.index
        curData.columns = column_headers

        # Keep only the requested rows that fall inside this chunk.
        wantedHere = wantedIndex[(wantedIndex >= chunkStart) & (wantedIndex < chunkEnd)]
        curData = curData[curData['Index'].isin(wantedHere)]
        selectedChunks.append(curData)
        numCollected = numCollected + curData.shape[0]

        clear_output()
        print("Extraction Stats: " + str(numCollected) + " percent: " + str(_percentOf(numCollected, numWanted)) + "%")
        print("Document Stats: " + str(totalNumRowsTraversed) + " percent: " + str(_percentOf(totalNumRowsTraversed, totalNumRows)) + "%")

        # Checked on every iteration, so a large final chunk cannot skip it.
        if numCollected >= numWanted:
            print("Finished with the data collection")
            break

    print("Extraction is Done, now saving frame")
    return pd.concat([dataFrame] + selectedChunks)

In [6]:
# Load the cached 2M sampled row indices, or generate and cache them.
try:
    # The cache is written without a header row (header=False below), so it
    # must be read the same way. With the default header handling the first
    # index would be consumed as the column label, and the 'Index' column
    # used by the cells that consume twoMIndeces would not exist.
    twoMIndeces = pd.read_csv('2MIndeces.csv', header=None, names=['Index'])
except FileNotFoundError:
    # Only a missing cache triggers regeneration; any other failure
    # (e.g. a corrupt file) is allowed to surface instead of being hidden.
    print("There were not 2000000 data points")
    twoMIndeces = generateNIndecesFrom(2000000,range(0,45840617)) # this range is because there are this number of records in the training set.
    twoMIndeces.to_csv('2MIndeces.csv',index=False,header=False)

In [8]:
# Load or generate the 1M train, 250k validation and 750k test indices.
try:
    # The id files are written without a header row, so read them that way;
    # the default header handling would swallow the first id of each file.
    trainIndeces = pd.read_csv('train_ids.txt', header=None, names=['Index'])
    validationIndeces = pd.read_csv('validation_ids.txt', header=None, names=['Index'])
    testingIndeces = pd.read_csv('test_ids.txt', header=None, names=['Index'])
except FileNotFoundError:
    # Any missing id file regenerates all three so the splits stay disjoint.
    # Other errors are not caught, so a broken cache is noticed.
    print("There were not 1000000 data points")
    allIndeces = twoMIndeces['Index']

    trainIndeces = generateNIndecesFrom(1000000,list(allIndeces))
    trainIndeces.to_csv('train_ids.txt',index=False,header=False)

    # Validation ids are drawn from what is left after removing the train ids.
    inTrain = allIndeces.isin(trainIndeces['Index'])
    validationIndeces = generateNIndecesFrom(250000,list(allIndeces[~inTrain]))
    validationIndeces.to_csv('validation_ids.txt',index=False,header=False)

    # Test ids are drawn from what is left after removing train and validation.
    inValidation = allIndeces.isin(validationIndeces['Index'])
    testingIndeces = generateNIndecesFrom(750000,list(allIndeces[~(inTrain | inValidation)]))
    testingIndeces.to_csv('test_ids.txt',index=False,header=False)

# Both paths already produce an 'Index' column; kept as a cheap guard.
trainIndeces.columns = ['Index']
validationIndeces.columns = ['Index']
testingIndeces.columns = ['Index']

In [9]:
def getColumnHeaders():
    """Return every column label as a Series.

    Order: ``'label'``, ``'integer_1'``..``'integer_13'``,
    ``'categorical_1'``..``'categorical_26'``, then ``'Index'``.
    """
    integerNames = ['integer_' + str(k) for k in range(1, 14)]
    categoricalNames = ['categorical_' + str(k) for k in range(1, 27)]
    return pd.Series(data=['label'] + integerNames + categoricalNames + ['Index'])
def getDataColumns():
    """Return the feature column labels as a Series.

    Order: ``'integer_1'``..``'integer_13'`` followed by
    ``'categorical_1'``..``'categorical_26'`` (no ``'label'``, no ``'Index'``).
    """
    featureNames = []
    for prefix, count in (('integer_', 13), ('categorical_', 26)):
        featureNames.extend(prefix + str(k) for k in range(1, count + 1))
    return pd.Series(data=featureNames)
# Full header list, passed to generateAndSaveSubset by the extraction cells.
column_headers = getColumnHeaders()

In [10]:
# Load the cached 1M-row training subset, or extract it from the raw file.
try:
    # squeeze=True was dropped: it is a no-op for a multi-column file, and
    # the keyword was removed in pandas 2.0. There the resulting TypeError
    # would have been swallowed by a bare except, silently starting a full
    # re-extraction.
    train1M = pd.read_csv('train1M.csv')
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) extraction.
    print("No 1M collection")
    train1M = pd.DataFrame()
    train1M = generateAndSaveSubset('dac/train.txt',train1M,trainIndeces,4000000,46000000,column_headers,'train1M.csv')

In [11]:
# Load the cached 250k-row validation subset, or extract it from the raw file.
try:
    # squeeze=True was dropped: it is a no-op for a multi-column file, and
    # the keyword was removed in pandas 2.0. There the resulting TypeError
    # would have been swallowed by a bare except, silently starting a full
    # re-extraction.
    validation250k = pd.read_csv('validation250k.csv')
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) extraction.
    print("No 250k collection")
    validation250k = pd.DataFrame()
    validation250k = generateAndSaveSubset('dac/train.txt',validation250k,validationIndeces,4000000,46000000,column_headers,'validation250k.csv')

In [12]:
# Load the cached 750k-row test subset, or extract it from the raw file.
try:
    # squeeze=True was dropped: it is a no-op for a multi-column file, and
    # the keyword was removed in pandas 2.0. There the resulting TypeError
    # would have been swallowed by a bare except, silently starting a full
    # re-extraction.
    test750k = pd.read_csv('test750k.csv')
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) extraction.
    print("No 750k collection")
    test750k = pd.DataFrame()
    # Extract the rows listed in testingIndeces. Passing validationIndeces
    # here would make the test set a copy of the validation set.
    test750k = generateAndSaveSubset('dac/train.txt',test750k,testingIndeces,4000000,46000000,column_headers,'test750k.csv')

In [13]:
# Sanity check: (rows, columns) of the training subset.
train1M.shape

(1000000, 42)

In [14]:
# List the column labels of the training subset.
train1M.columns

Index(['Unnamed: 0', 'label', 'integer_1', 'integer_2', 'integer_3',
       'integer_4', 'integer_5', 'integer_6', 'integer_7', 'integer_8',
       'integer_9', 'integer_10', 'integer_11', 'integer_12', 'integer_13',
       'categorical_1', 'categorical_2', 'categorical_3', 'categorical_4',
       'categorical_5', 'categorical_6', 'categorical_7', 'categorical_8',
       'categorical_9', 'categorical_10', 'categorical_11', 'categorical_12',
       'categorical_13', 'categorical_14', 'categorical_15', 'categorical_16',
       'categorical_17', 'categorical_18', 'categorical_19', 'categorical_20',
       'categorical_21', 'categorical_22', 'categorical_23', 'categorical_24',
       'categorical_25', 'categorical_26', 'Index'],
      dtype='object')

In [15]:
# Peek at the raw label values of the validation subset.
validation250k['label'].values

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [16]:
# Preview the first rows using the columns at positions 1..39.
# NOTE(review): the positional slice presumably assumes the frame was loaded
# from the cached CSV, where position 0 is the saved 'Unnamed: 0' column --
# confirm it still selects the intended columns for a freshly extracted frame.
validation250k[validation250k.columns[1:40]].head()

Unnamed: 0,label,integer_1,integer_2,integer_3,integer_4,integer_5,integer_6,integer_7,integer_8,integer_9,...,categorical_16,categorical_17,categorical_18,categorical_19,categorical_20,categorical_21,categorical_22,categorical_23,categorical_24,categorical_25
0,0,10.0,1,11.0,14.0,133.0,16.0,12.0,18.0,244.0,...,085faf22,e5ba7672,0b331314,21ddcdc9,5840adea,1242136d,,32c7478e,0bab5d7e,ea9a246c
1,0,0.0,54,13.0,9.0,3411.0,75.0,3.0,21.0,113.0,...,54dd60b2,3486227d,5aed7436,21ddcdc9,a458ea53,b39b1608,,32c7478e,3fdb382b,e8b83407
2,1,13.0,1,13.0,13.0,48.0,19.0,13.0,8.0,13.0,...,c64d548f,3486227d,63cdbb21,cf99e5de,a458ea53,5f957280,,3a171ecb,1793a828,e8b83407
3,0,2.0,322,1.0,3.0,37.0,4.0,2.0,3.0,3.0,...,08dd6c7b,776ce399,5aed7436,9902b8f1,a458ea53,3fcdec93,,32c7478e,302bda93,e8b83407
4,0,0.0,1,,,2969.0,126.0,3.0,27.0,77.0,...,31ca40b6,07c540c4,7b49e3d2,,,dfcfc3fa,,3a171ecb,aee52b6f,


In [None]:
# testData = pd.read_table('dac/train.txt',skiprows = 0, nrows = 1000,header=None)

# testData['Index'] = testData.index

# testData.columns = column_headers

# testData.head()

In [41]:
# DataFrame.hist creates its own figure with one subplot per numeric column,
# so the former plt.figure() call only left an empty extra figure behind.
# The axes grid is bound to a name so the cell does not echo the array repr.
histAxes = train1M.hist(stacked=True, bins=20)
# for col in train1M.columns:
#     if (train1M[col].dtype == 'int64' and col != 'label' and col != 'Index'):
# #         print(train1M[col].dtype)
#         train1M[col].hist(stacked=True, bins=20)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000002171F531208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F59AF98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F6087B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F642BA8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002171F6C25C0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F6FC5F8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F78CF98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F79EC50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002171F857DA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F8F8588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F961BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002171F9C2F60>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00

(4, 4)