In [7]:
import json
import pandas as pd
import numpy as np
import sklearn
from zipfile import ZipFile
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

In [9]:
# Directory holding the zipped CSV files; a relative path, so it is resolved
# against the notebook's current working directory.
data_folder = Path('data')
# Archive file names for the two splits.
train_zip, test_zip = 'train.csv.zip', 'test.csv.zip'

In [10]:
def read_json_zip(zip_file_path):
    """Read the CSV member of a zip archive into a DataFrame.

    Despite the name, no JSON is involved: the archive member is parsed with
    ``pd.read_csv``. The name is kept so existing callers continue to work.

    The member that is read is the archive's file name with its final suffix
    removed, e.g. ``data/train.csv.zip`` -> member ``train.csv``.

    Parameters
    ----------
    zip_file_path : str or os.PathLike
        Path to the zip archive. Plain strings are accepted as well as
        ``pathlib.Path`` objects.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV member.

    Raises
    ------
    KeyError
        If the archive contains no member named after the archive's stem
        (raised by ``ZipFile.open``).
    """
    # Coerce to Path so that `.stem` also works when a str is passed in.
    zip_file_path = Path(zip_file_path)
    member_name = zip_file_path.stem
    with ZipFile(zip_file_path, "r") as archive:
        with archive.open(member_name) as member:
            # Parse while the member stream is still open.
            return pd.read_csv(member)

In [11]:
# Load both splits from their zip archives inside data_folder.
df_train = read_json_zip(data_folder / train_zip)
df_test = read_json_zip(data_folder / test_zip)

In [12]:
# Show the training frame; the rendered output is truncated (note the "..." row).
df_train

Unnamed: 0,Id,Sequence
0,3,"1,3,13,87,1053,28576,2141733,508147108,4021352..."
1,7,"1,2,1,5,5,1,11,16,7,1,23,44,30,9,1,47,112,104,..."
2,8,"1,2,4,5,8,10,16,20,32,40,64,80,128,160,256,320..."
3,11,"1,8,25,83,274,2275,132224,1060067,3312425,1099..."
4,13,"1,111,12211,1343211,147753211,16252853211,1787..."
...,...,...
113840,227683,"0,0,4,1198,1829388,23796035743,214296750607865..."
113841,227684,"0,-1,-1,-1,-1,10324303,-6586524273069171148,11..."
113842,227686,"0,1,9,85,801,7549,71145,670501,6319089,5955380..."
113843,227689,"2,3,3,4,6,4,5,10,10,5,6,15,20,15,6,7,21,35,35,..."


In [13]:
# Report the number of rows in each split.
print(
    'Number of training examples: %s\n Number of testing examples: %s'
    % (len(df_train), len(df_test))
)

Number of training examples: 113845
 Number of testing examples: 113845


In [14]:
# Number of comma-separated terms in each 'Sequence' string, per split.
# The generator is unpacked in order: train first, then test.
seq_len_train, seq_len_test = (
    frame['Sequence'].str.split(',').map(len)
    for frame in (df_train, df_test)
)

In [15]:
# Use one percentile grid for both splits so the two summaries are comparable.
length_percentiles = [.25, .5, .75, .9, .99]
print('Train Sequence length: \n', seq_len_train.describe(percentiles=length_percentiles), '\n\n')
print('Test Sequence length: \n', seq_len_test.describe(percentiles=length_percentiles), '\n\n')

Train Sequence length: 
 count    113845.000000
mean         41.669630
std          28.132079
min           1.000000
25%          19.000000
50%          34.000000
75%          59.000000
90%          86.000000
99%         105.000000
max         348.000000
Name: Sequence, dtype: float64 


Test Sequence length: 
 count    113845.000000
mean         40.536370
std          28.030535
min           1.000000
25%          18.000000
50%          33.000000
75%          57.000000
90%          85.000000
99%         104.000000
max         347.000000
Name: Sequence, dtype: float64 




Let's look at the data in pandas format (it was already loaded with `pd.read_csv`) for easy usage.

In [16]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,Id,Sequence
0,3,"1,3,13,87,1053,28576,2141733,508147108,4021352..."
1,7,"1,2,1,5,5,1,11,16,7,1,23,44,30,9,1,47,112,104,..."
2,8,"1,2,4,5,8,10,16,20,32,40,64,80,128,160,256,320..."
3,11,"1,8,25,83,274,2275,132224,1060067,3312425,1099..."
4,13,"1,111,12211,1343211,147753211,16252853211,1787..."


In [17]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,Id,Sequence
0,1,"1,0,0,2,24,552,21280,103760,70299264,579285324..."
1,2,"1,1,5,11,35,93,269,747,2115,5933,16717,47003,1..."
2,4,"0,1,101,2,15,102,73,3,40,16,47,103,51,74,116,4..."
3,5,"1,4,14,23,42,33,35,34,63,66,87,116,84,101,126,..."
4,6,"1,1,2,5,4,2,6,13,11,4,10,10,12,6,8,29,16,11,18..."


In [18]:
# Raw 'Sequence' strings of the training rows that have fewer than 10 terms.
df_train.loc[seq_len_train < 10, 'Sequence'].values

array(['4,6,8,9,26,1752',
       '1,176,570496,9223556096,460993706622976,55266063875773300736,13556412676746128524312576,6098119622978353294582319415296,4642750780653941150969313976486199296',
       '1,3,635,2049219,7372235460687', ...,
       '0,0,4,1198,1829388,23796035743,2142967506078650,1608943108575146892075',
       '0,-1,-1,-1,-1,10324303,-6586524273069171148,110780954395540516579111562860048860420,5864545399742183862578018016183410025465491904722516203269973267547486512819',
       '5,7,179,229,439,557,6113,223999'], dtype=object)

- Since the sequences are of variable length, we will use an RNN
- Sequence has some extremely large values (well beyond the 64-bit integer range) as well as negative values

In [20]:
# Last term of one of the short training sequences printed above (37 digits);
# digits are grouped with underscores for readability only.
x = 4_642_750_780_653_941_150_969_313_976_486_199_296

In [26]:
# Scale by 10**36 to read off the magnitude of x; int / int true division yields a float.
x/10**36

4.642750780653941