## Make lag_time stay_time labels more detailed
## Combine features to SGT features

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [4]:
!pip install pandarallel
!pip install sgt

Collecting pandarallel
  Downloading https://files.pythonhosted.org/packages/f9/c9/2350222cec65593ab5f2f00f2e57dfd1fa4e697dbe92fcaff641485354e6/pandarallel-1.5.2.tar.gz
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.5.2-cp37-none-any.whl size=18386 sha256=ad19276b056f9120c861579f79fcd1ea733be3ec0ab7965876ae7d0c9d620289
  Stored in directory: /root/.cache/pip/wheels/40/80/6d/d50fb72a8ce6a923fb10390fec9eaaa40b02d07a7ec05c9c05
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.5.2
Collecting sgt
  Downloading https://files.pythonhosted.org/packages/aa/ac/158c762acd2a75f347a480271de12e06a95d2c75a30026207f827568f8ea/sgt-2.0.3-py3-none-any.whl
Installing collected packages: sgt
Successfully installed sgt-2.0.3


# Configurations

In [5]:
# Library used

from datetime import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
import seaborn as sns
from sgt import SGT

# Data Input and processing

In [6]:
# SGT features for page name
train = pd.read_csv('processed_data/SGT/sgt_train_page_name April 2.csv', index_col=0)
test = pd.read_csv('processed_data/SGT/sgt_test_page_name April 2.csv', index_col=0)

In [7]:
# Import other sequences, including stay_time, lag_time, pid change and sid change
train_add_feature = pd.read_csv('processed_data/SGT/SGT training with label April 2.csv')
train_add_feature = train_add_feature.reset_index()
train_add_feature = train_add_feature.rename(columns={'index': 'id'})
train_add_feature['stay_time_label_sequence'] = train_add_feature['stay_time_label_sequence'].apply(lambda x: x.split())
train_add_feature['lagg_label_sequence'] = train_add_feature['lagg_label_sequence'].apply(lambda x: x.split())
train_add_feature['pid_label_sequence'] = train_add_feature['pid_label_sequence'].apply(lambda x: x.split())
train_add_feature['sid_label_sequence'] = train_add_feature['sid_label_sequence'].apply(lambda x: x.split())

test_add_feature = pd.read_csv('processed_data/SGT/SGT testing with label April 2.csv')
test_add_feature = test_add_feature.reset_index()
test_add_feature = test_add_feature.rename(columns={'index': 'id'})
test_add_feature['stay_time_label_sequence'] = test_add_feature['stay_time_label_sequence'].apply(lambda x: x.split())
test_add_feature['lagg_label_sequence'] = test_add_feature['lagg_label_sequence'].apply(lambda x: x.split())
test_add_feature['pid_label_sequence'] = test_add_feature['pid_label_sequence'].apply(lambda x: x.split())
test_add_feature['sid_label_sequence'] = test_add_feature['sid_label_sequence'].apply(lambda x: x.split())

In [8]:
# Mapping between page name and alphabet

map = {'loan_index':'A',
       'personal_info': 'B',
       'id_verify': 'C',
       'contacts_info': 'D',
       'loan_submission': 'E',
       'operator': 'F',
       'bind_debit_card': 'G',
       'biometric_auto': 'H',
       'login': 'I',
       'register': 'J',
       'biometric_auth': 'K'}

In [9]:
# Create alphabet sequences

train = []
with open('/content/drive/MyDrive/processed_data/SGT/sgt_sentences training April 2.txt', 'r') as f:
    for sent in f.readlines():
        tokens = sent.split()
        alphabets = []
        for token in tokens:
            alphabets.append(map[token])
        train.append(alphabets)
test = []
with open('/content/drive/MyDrive/processed_data/SGT/sgt_sentences testing April 2.txt', 'r') as f:
    for sent in f.readlines():
        tokens = sent.split()
        alphabets = []
        for token in tokens:
            alphabets.append(map[token])
        test.append(alphabets)

In [10]:
# Format into dataframe for SGT

train_seq = pd.DataFrame({'sequence': train})
train_seq = train_seq.reset_index()
train_seq = train_seq.rename(columns={'index': 'id'})
test_seq = pd.DataFrame({'sequence': test})
test_seq = test_seq.reset_index()
test_seq = test_seq.rename(columns={'index': 'id'})

In [11]:
train_add_feature['pname_sequence'] = train_seq['sequence']
test_add_feature['pname_sequence'] = test_seq['sequence']

In [12]:
train_add_feature.head(1)

Unnamed: 0,level_0,id,sentence,overdue,new_client,order_time,label,user_id,application_time,application_date,day_of_week,hour_of_day,pname_sequence,stay_time_label_sequence,lagg_label_sequence,pid_label_sequence,sid_label_sequence
0,0,00003b4ceb224e1b977c242bbf767ea0|1508889540000,login loan_index biometric_auto loan_index ope...,0.0,1.0,1508890000000.0,0,00003b4ceb224e1b977c242bbf767ea0,1508889540000,2017-10-24 23:59:00,2,6,"[I, A, H, A, F, F, F, F, A, A, A, A, E, E, A, A]","[4, 3, 5, 2, 5, 3, 5, 5, 5, 5, 5, 1, 3, 4, 4, 4]","[1, 2, 2, 1, 4, 3, 3, 3, 2, 4, 4, 3, 2, 5, 1, 3]","[C, A, A, A, A, A, A, A, A, B, A, A, A, A, A, A]","[C, A, A, A, A, A, A, A, A, B, A, B, A, A, A, A]"


## Combine Features Function

In [13]:
def combine_features(feature_1, feature_2):
  ls = []
  if len(feature_1) != len(feature_2):
    raise ValueError('len(feature_1) != len(feature_2)')
  for i in range(len(feature_1)):
    ls.append(str(feature_1[i])+str(feature_2[i]))
  return ls

In [14]:
# df = pd.DataFrame([[1, ["B","B","A","C"], ['1','2','3','4']],
#           [2, ["D","C","B","A"], ['2','1','4','3']]]
#                   , columns=['id','col_1','col_2'])
# df

In [15]:
# a = ['B', 'B', 'A', 'C'] 
# b = [1, 2, 3, 4]
# combine_features(a,b)

In [16]:
# df[['col_1','col_2']].apply(lambda x: combine_features(x[0],x[1]),axis=1)

## Generate Combined Features

In [17]:
train_add_feature['pname_&_stay_time_label_sequence'] = train_add_feature[['pname_sequence', 'stay_time_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)
test_add_feature['pname_&_stay_time_label_sequence'] = test_add_feature[['pname_sequence', 'stay_time_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)

train_add_feature['pname_&_lagg_label_sequence'] = train_add_feature[['pname_sequence', 'lagg_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)
test_add_feature['pname_&_lagg_label_sequence'] = test_add_feature[['pname_sequence', 'lagg_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)

train_add_feature['stay_time_label_&_lagg_label_sequence'] = train_add_feature[['stay_time_label_sequence', 'lagg_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)
test_add_feature['stay_time_label_&_lagg_label_sequence'] = test_add_feature[['stay_time_label_sequence', 'lagg_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)

train_add_feature['pname_&_pid_sequence'] = train_add_feature[['pname_sequence', 'pid_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)
test_add_feature['pname_&_pid_sequence'] = test_add_feature[['pname_sequence', 'pid_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)

train_add_feature['pname_&_sid_sequence'] = train_add_feature[['pname_sequence', 'sid_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)
test_add_feature['pname_&_sid_sequence'] = test_add_feature[['pname_sequence', 'sid_label_sequence']].apply(lambda x: combine_features(x[0],x[1]),axis=1)

In [18]:
# to save memory
del train
del test
del train_seq
del test_seq
# del test_add_feature
del train_add_feature['sentence']
del train_add_feature['overdue']
del train_add_feature['order_time']
del train_add_feature['user_id']
del train_add_feature['application_time']
del train_add_feature['application_date']

In [19]:
# training dataset is twice of the testing dataset, but there're RAM issues in SGT package running
len(train_add_feature)/79949

2.499618506798084

In [13]:
# Use Length-sensitive embedding, the user can change the boolean value to customize the embedding

sgt_ = SGT(kappa=1, 
           lengthsensitive=True, 
           mode='multiprocessing')

### pname_&_stay_time_label_sequence

In [25]:
# SGT for pname_&_stay_time_label_sequence training

%%time
df = train_add_feature[['id', 'pname_&_stay_time_label_sequence']].rename(columns={'pname_&_stay_time_label_sequence':'sequence'})
cut_off = int(len(df)/2)
df_1 = df[:cut_off]
df_2 = df[cut_off:]
sgtembedding_train_pname_and_stay_time_1 = sgt_.fit_transform(df_1)
sgtembedding_train_pname_and_stay_time_2 = sgt_.fit_transform(df_2)

sgtembedding_train_pname_and_stay_time = sgtembedding_train_pname_and_stay_time_1.append(sgtembedding_train_pname_and_stay_time_2)
# sgtembedding_train_pname_and_stay_time.to_csv('processed_data/SGT/Combined Features/sgt_train_pname_and_stay_time April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 8.35 s, sys: 13.5 s, total: 21.9 s
Wall time: 16min 35s


In [31]:
# SGT for pname_&_stay_time_label_sequence testing
# begin: 3.42GB to 5.26G

%%time
sgtembedding_test_pname_and_stay_time = sgt_.fit_transform(test_add_feature[['id', 'pname_&_stay_time_label_sequence']].rename(columns={'pname_&_stay_time_label_sequence':'sequence'}))
# sgtembedding_test_pname_and_stay_time.to_csv('processed_data/SGT/Combined Features/sgt_test_pname_and_stay_time April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 3min 19s, sys: 14.1 s, total: 3min 33s
Wall time: 18min 26s


### pname_&_lagg_label_sequence

In [14]:
# SGT for pname_&_lagg_label_sequence training
%%time

df = train_add_feature[['id', 'pname_&_lagg_label_sequence']].rename(columns={'pname_&_lagg_label_sequence':'sequence'})
cut_off = int(len(df)/2)
df_1 = df[:cut_off]
df_2 = df[cut_off:]
sgtembedding_train_pname_and_lagg_1 = sgt_.fit_transform(df_1)
sgtembedding_train_pname_and_lagg_2 = sgt_.fit_transform(df_2)

sgtembedding_train_pname_and_lagg = sgtembedding_train_pname_and_lagg_1.append(sgtembedding_train_pname_and_lagg_2)
# sgtembedding_train_pname_and_lagg.to_csv('processed_data/SGT/Combined Features/sgt_train_pname_and_lagg April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 8min 42s, sys: 1min 12s, total: 9min 55s
Wall time: 48min 13s


In [33]:
# SGT for pname_&_lagg_label_sequence testing
%%time

sgtembedding_test_pname_and_lagg = sgt_.fit_transform(test_add_feature[['id', 'pname_&_lagg_label_sequence']].rename(columns={'pname_&_lagg_label_sequence':'sequence'}))
# sgtembedding_test_pname_and_lagg.to_csv('processed_data/SGT/Combined Features/sgt_test_pname_and_lagg April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 3min 29s, sys: 16 s, total: 3min 45s
Wall time: 19min 11s


### pname_&_pid_sequence

In [18]:
# SGT for pname_&_pid_sequence training

%%time
df = train_add_feature[['id', 'pname_&_pid_sequence']].rename(columns={'pname_&_pid_sequence':'sequence'})
cut_off = int(len(df)/2)
df_1 = df[:cut_off]
df_2 = df[cut_off:]
sgtembedding_train_pname_and_pid_1 = sgt_.fit_transform(df_1)
sgtembedding_train_pname_and_pid_2 = sgt_.fit_transform(df_2)

sgtembedding_train_pname_and_pid = sgtembedding_train_pname_and_pid_1.append(sgtembedding_train_pname_and_pid_2)
# sgtembedding_train_pname_and_pid.to_csv('processed_data/SGT/Combined Features/sgt_train_pname_and_pid April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 2min 30s, sys: 13.7 s, total: 2min 43s
Wall time: 12min 31s


In [17]:
# SGT for pname_&_pid_sequence testing

%%time
sgtembedding_test_pname_and_pid = sgt_.fit_transform(test_add_feature[['id', 'pname_&_pid_sequence']].rename(columns={'pname_&_pid_sequence':'sequence'}))
# sgtembedding_test_pname_and_pid.to_csv('processed_data/SGT/Combined Features/sgt_test_pname_and_pid April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 58.5 s, sys: 4.61 s, total: 1min 3s
Wall time: 5min


### pname_&_sid_sequence

In [14]:
# SGT for pname_&_sid_sequence training

%%time
df = train_add_feature[['id', 'pname_&_sid_sequence']].rename(columns={'pname_&_sid_sequence':'sequence'})
cut_off = int(len(df)/2)
df_1 = df[:cut_off]
df_2 = df[cut_off:]
sgtembedding_train_pname_and_sid_1 = sgt_.fit_transform(df_1)
sgtembedding_train_pname_and_sid_2 = sgt_.fit_transform(df_2)

sgtembedding_train_pname_and_sid = sgtembedding_train_pname_and_sid_1.append(sgtembedding_train_pname_and_sid_2)
# sgtembedding_train_pname_and_sid.to_csv('processed_data/SGT/Combined Features/sgt_train_pname_and_sid April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 2min 25s, sys: 13.9 s, total: 2min 38s
Wall time: 13min 20s


In [15]:
# SGT for pname_&_sid_sequence testing

%%time
sgtembedding_test_pname_and_sid = sgt_.fit_transform(test_add_feature[['id', 'pname_&_sid_sequence']].rename(columns={'pname_&_sid_sequence':'sequence'}))
# sgtembedding_test_pname_and_sid.to_csv('processed_data/SGT/Combined Features/sgt_test_pname_and_sid April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 59.4 s, sys: 5.46 s, total: 1min 4s
Wall time: 5min 14s


### stay_time_label_&_lagg_label_sequence

In [None]:
# SGT for stay_time_label_&_lagg_label_sequence training
%%time

df = train_add_feature[['id', 'stay_time_label_&_lagg_label_sequence']].rename(columns={'stay_time_label_&_lagg_label_sequence':'sequence'})
cut_off = int(len(df)/2)
df_1 = df[:cut_off]
df_2 = df[cut_off:]
sgtembedding_train_stay_time_and_lagg_1 = sgt_.fit_transform(df_1)
sgtembedding_train_stay_time_and_lagg_2 = sgt_.fit_transform(df_2)

sgtembedding_train_stay_time_and_lagg = sgtembedding_train_stay_time_and_lagg_1.append(sgtembedding_train_stay_time_and_lagg_2)
sgtembedding_train_stay_time_and_lagg.to_csv('processed_data/SGT/Combined Features/sgt_train_stay_time_and_lagg April 2.csv')

In [14]:
# SGT for pname_&_lagg_label_sequence testing
%%time

sgtembedding_test_stay_time_and_lagg = sgt_.fit_transform(test_add_feature[['id', 'stay_time_label_&_lagg_label_sequence']].rename(columns={'stay_time_label_&_lagg_label_sequence':'sequence'}))
sgtembedding_test_stay_time_and_lagg.to_csv('processed_data/SGT/Combined Features/sgt_test_stay_time_and_lagg April 2.csv')

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 1min 1s, sys: 3.38 s, total: 1min 4s
Wall time: 10min 28s


# Sequence Graph Transform

In [20]:
# Use Length-sensitive embedding, the user can change the boolean value to customize the embedding

sgt_ = SGT(kappa=1, 
           lengthsensitive=True, 
           mode='multiprocessing')

In [23]:
# SGT for stay time training

%%time
sgtembedding_train_stay_time = sgt_.fit_transform(train_add_feature[['id', 'stay_time_label_sequence']].rename(columns={'stay_time_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 925 ms, sys: 301 ms, total: 1.23 s
Wall time: 2min 50s


In [24]:
# SGT for lag time training

%%time
sgtembedding_train_lag = sgt_.fit_transform(train_add_feature[['id', 'lagg_label_sequence']].rename(columns={'lagg_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 938 ms, sys: 288 ms, total: 1.23 s
Wall time: 2min 52s


In [60]:
# SGT for pid change training

%%time
sgtembedding_train_pid = sgt_.fit_transform(train_add_feature[['id', 'pid_label_sequence']].rename(columns={'pid_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 737 ms, sys: 249 ms, total: 985 ms
Wall time: 2min 8s


In [61]:
# SGT for sid change training

%%time
sgtembedding_train_sid = sgt_.fit_transform(train_add_feature[['id', 'sid_label_sequence']].rename(columns={'sid_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 588 ms, sys: 248 ms, total: 837 ms
Wall time: 2min 6s


In [19]:
# SGT for stay time testing

%%time
sgtembedding_test_stay_time = sgt_.fit_transform(test_add_feature[['id', 'stay_time_label_sequence']].rename(columns={'stay_time_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 925 ms, sys: 196 ms, total: 1.12 s
Wall time: 1min 8s


In [20]:
# SGT for lag time testing

%%time
sgtembedding_test_lag = sgt_.fit_transform(test_add_feature[['id', 'lagg_label_sequence']].rename(columns={'lagg_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 358 ms, sys: 175 ms, total: 533 ms
Wall time: 1min 8s


In [56]:
# SGT for pid change testing

%%time
sgtembedding_test_pid = sgt_.fit_transform(test_add_feature[['id', 'pid_label_sequence']].rename(columns={'pid_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 874 ms, sys: 186 ms, total: 1.06 s
Wall time: 52.4 s


In [59]:
# SGT for sid change testing

%%time
sgtembedding_test_sid = sgt_.fit_transform(test_add_feature[['id', 'sid_label_sequence']].rename(columns={'sid_label_sequence':'sequence'}))

INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 333 ms, sys: 179 ms, total: 512 ms
Wall time: 51.9 s


In [62]:
train = pd.concat([train, sgtembedding_train_stay_time.drop(columns=['id']), sgtembedding_train_lag.drop(columns=['id']), sgtembedding_train_lag.drop(columns=['id']), sgtembedding_train_lag.drop(columns=['id'])], axis=1)
test = pd.concat([test, sgtembedding_test_stay_time.drop(columns=['id']), sgtembedding_test_lag.drop(columns=['id']), sgtembedding_test_lag.drop(columns=['id']), sgtembedding_test_lag.drop(columns=['id'])], axis=1)

In [63]:
train

Unnamed: 0,"('A', 'A')","('A', 'B')","('A', 'C')","('A', 'D')","('A', 'E')","('A', 'F')","('A', 'G')","('A', 'H')","('A', 'I')","('A', 'J')","('A', 'K')","('B', 'A')","('B', 'B')","('B', 'C')","('B', 'D')","('B', 'E')","('B', 'F')","('B', 'G')","('B', 'H')","('B', 'I')","('B', 'J')","('B', 'K')","('C', 'A')","('C', 'B')","('C', 'C')","('C', 'D')","('C', 'E')","('C', 'F')","('C', 'G')","('C', 'H')","('C', 'I')","('C', 'J')","('C', 'K')","('D', 'A')","('D', 'B')","('D', 'C')","('D', 'D')","('D', 'E')","('D', 'F')","('D', 'G')",...,"(3, 1)","(3, 2)","(3, 3)","(3, 4)","(3, 5)","(4, 1)","(4, 2)","(4, 3)","(4, 4)","(4, 5)","(5, 1)","(5, 2)","(5, 3)","(5, 4)","(5, 5)","(1, 1)","(1, 2)","(1, 3)","(1, 4)","(1, 5)","(2, 1)","(2, 2)","(2, 3)","(2, 4)","(2, 5)","(3, 1).1","(3, 2).1","(3, 3).1","(3, 4).1","(3, 5).1","(4, 1).1","(4, 2).1","(4, 3).1","(4, 4).1","(4, 5).1","(5, 1).1","(5, 2).1","(5, 3).1","(5, 4).1","(5, 5).1"
0,1.168560,0.000000,0.000000,0.000000,1.042247,1.297274e+00,0.000000,5.886071,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,...,0.204631,2.128022,1.467914,0.742076,0.556246,0.133861,0.815094,1.895754,2.011179,0.363873,5.886071,0.000000,2.165365,0.000000,0.000000,0.265625,1.361114,0.846261,1.039345,0.000381,1.709432,1.039061,0.249884,1.380087,1.498561,0.204631,2.128022,1.467914,0.742076,0.556246,0.133861,0.815094,1.895754,2.011179,0.363873,5.886071,0.000000,2.165365,0.000000,0.000000
1,1.247240,0.000000,0.000000,0.000000,0.588068,0.000000e+00,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,...,0.008864,1.814495,0.002962,0.235703,1.896240,0.854447,1.542235,3.179232,1.003521,0.361218,0.829014,3.058348,0.008051,0.967082,1.476756,1.780387,0.063549,1.110871,1.414756,1.286481,0.203746,0.723213,1.236502,2.469456,0.986653,0.008864,1.814495,0.002962,0.235703,1.896240,0.854447,1.542235,3.179232,1.003521,0.361218,0.829014,3.058348,0.008051,0.967082,1.476756
2,1.165118,0.000000,0.000000,0.000000,0.751630,5.890731e-01,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,...,0.007635,1.262020,0.003826,0.202973,1.905019,0.977001,1.155605,4.106508,0.875416,0.487522,1.177441,1.806467,0.010399,0.791441,1.147316,1.215796,1.315750,2.097956,2.115124,1.488148,0.416138,0.410433,1.597149,2.316461,1.409038,0.007635,1.262020,0.003826,0.202973,1.905019,0.977001,1.155605,4.106508,0.875416,0.487522,1.177441,1.806467,0.010399,0.791441,1.147316
3,2.353713,0.037895,5.898686,0.000010,2.545042,2.442019e-07,3.025759,0.586100,0.0,0.0,0.0,0.000408,3.837538,0.0,0.803288,6.329104e-07,0.018988,6.424399e-05,0.0,0.0,0.0,0.0,5.279878e-08,0.381386,9.291671,0.000104,8.192062e-11,2.457719e-06,8.315406e-09,5.898686,0.0,0.0,0.0,0.049199,0.0,0.0,7.595178,7.633578e-05,2.290167,0.007749,...,1.135881,1.942011,1.006764,0.984520,0.212711,0.999410,2.806440,1.918968,0.980372,1.374940,1.089643,0.000000,11.772142,4.330729,0.000000,0.915921,3.599979,0.646343,1.424043,0.757846,0.250679,0.847734,1.279681,0.926493,0.000007,1.135881,1.942011,1.006764,0.984520,0.212711,0.999410,2.806440,1.918968,0.980372,1.374940,1.089643,0.000000,11.772142,4.330729,0.000000
4,1.761646,0.006249,3.721522,0.000015,1.388694,3.270896e+00,1.840279,0.120074,0.0,0.0,0.0,0.000585,8.921358,0.0,1.323277,2.581543e-10,0.003138,2.015858e-07,0.0,0.0,0.0,0.0,2.896519e-07,0.264364,12.342165,0.000655,1.278394e-13,1.553894e-06,9.982631e-11,5.079707,0.0,0.0,0.0,0.235971,0.0,0.0,8.921358,1.041469e-07,1.265911,0.000081,...,0.510962,0.753333,1.583113,1.877349,0.385417,1.346925,0.274831,2.995267,1.018475,0.227396,1.141712,3.036548,1.649193,2.270315,1.743327,0.796847,2.382913,1.169729,1.009809,1.243031,1.328732,1.922719,0.760983,0.173912,0.956035,0.510962,0.753333,1.583113,1.877349,0.385417,1.346925,0.274831,2.995267,1.018475,0.227396,1.141712,3.036548,1.649193,2.270315,1.743327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199837,1.157871,0.000000,0.000000,0.000000,1.518688,1.016857e+00,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,...,1.939893,0.109906,0.141497,1.430595,1.558355,1.424294,1.417098,0.037386,0.348555,0.523935,0.528084,0.748495,2.231120,2.111466,1.169906,0.819225,1.094030,0.362359,0.758902,1.537522,1.494113,0.947365,0.193610,0.000351,2.420709,1.939893,0.109906,0.141497,1.430595,1.558355,1.424294,1.417098,0.037386,0.348555,0.523935,0.528084,0.748495,2.231120,2.111466,1.169906
199838,1.721346,0.924489,3.012008,0.004117,1.351592,1.328265e-04,1.833145,0.161334,0.0,0.0,0.0,0.283734,5.553204,0.0,1.437296,4.201839e-05,0.046374,2.308823e-03,0.0,0.0,0.0,0.0,2.427410e-01,0.097004,6.341315,0.000010,2.862933e-10,3.159700e-07,1.573122e-08,2.807744,0.0,0.0,0.0,0.069694,0.0,0.0,12.104816,4.514096e-03,4.982020,0.248040,...,0.082496,0.812796,3.047313,0.659893,0.256298,1.643463,1.676210,2.613234,0.671310,0.629108,0.873508,4.214007,0.003304,1.692208,4.579649,2.027647,3.498951,0.553058,2.489854,0.707234,0.346294,2.469108,1.520467,0.966922,0.028406,0.082496,0.812796,3.047313,0.659893,0.256298,1.643463,1.676210,2.613234,0.671310,0.629108,0.873508,4.214007,0.003304,1.692208,4.579649
199839,1.769446,0.012689,2.643845,0.000016,1.954622,2.361563e-07,1.545588,0.152760,0.0,0.0,0.0,0.000291,6.458196,0.0,1.122467,1.098277e-05,0.016460,8.194865e-04,0.0,0.0,0.0,0.0,5.361664e-01,0.400500,5.320819,0.000508,4.973564e-09,7.453858e-06,3.711057e-07,4.821604,0.0,0.0,0.0,0.060103,0.0,0.0,8.580278,2.268675e-03,3.400053,0.169279,...,0.313601,0.876403,1.447900,0.764050,1.195724,0.686394,1.671090,5.225687,1.369194,0.717106,1.655561,0.685802,2.750232,0.812797,1.619249,1.517431,3.059699,0.611532,1.667155,1.318166,0.239470,2.516462,1.481638,0.784791,0.556798,0.313601,0.876403,1.447900,0.764050,1.195724,0.686394,1.671090,5.225687,1.369194,0.717106,1.655561,0.685802,2.750232,0.812797,1.619249
199840,1.310544,0.000000,0.000000,0.000000,0.751603,3.133677e+00,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,...,0.088360,1.688449,1.088151,2.136878,1.027113,0.042139,0.311366,2.856717,0.000000,0.480463,1.777033,2.103864,1.275324,1.573540,1.455423,0.000005,2.167308,1.059780,0.002098,0.014440,0.575381,1.236298,0.801164,0.110857,1.185384,0.088360,1.688449,1.088151,2.136878,1.027113,0.042139,0.311366,2.856717,0.000000,0.480463,1.777033,2.103864,1.275324,1.573540,1.455423


In [64]:
train.to_csv('processed_data/SGT/sgt_train_all April 2.csv')
test.to_csv('processed_data/SGT/sgt_test_all April 2.csv')