# Shread train-dev-test csv datasets progressively using BigQuery 

In [1]:
import google.datalab.bigquery as bq
import pandas as pd
import numpy as np
import seaborn as sns
import shutil

In [20]:
def test_sample(a,b):
  basequery = """
  SELECT MAX(farmhash) as max_farmhash, COUNT(answer_count) as count
  FROM
  (
  SELECT 
    MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100)  as farmhash, answer_count
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < 20 AND  MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= 10  "
  sampler2 = "AND {0} >= {1}\n AND {0} < {2} )".format(
           "MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), EVERY_N * 100) * {}".format(10),
           a, b
          )
    
  return "{}\n{}\n{}".format(basequery, sampler, sampler2)

EVERY_N = 100
query_maxhash = test_sample(0,70).replace("EVERY_N", str(EVERY_N))
df_maxhash = bq.Query(query_maxhash).execute().result().to_dataframe()
print(df_maxhash)


def test_sample2(a,b):
  basequery = """
  SELECT MIN(farmhash10) as min_farmhash10, MAX(farmhash10) as max_farmhash10, COUNT(answer_count) as count
  FROM
  (  
  SELECT 
    MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100)*10  as farmhash10, answer_count
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < 20 AND  MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= 10  "
  sampler2 = "AND {0} >= {1}\n AND {0} < {2})".format(
           "MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), EVERY_N * 100) * {}".format(10),
           (10*10)+a, (10*10)+b
          )
    
  return "{}\n{}\n{}".format(basequery, sampler, sampler2)
  #return "{}\n{}".format(basequery, sampler)


EVERY_N = 100
queryhash = test_sample2(0,60).replace("EVERY_N", str(EVERY_N))
df_hash = bq.Query(queryhash).execute().result().to_dataframe()
print(df_hash.head())


  max_farmhash  count
0         None      0
   min_farmhash10  max_farmhash10  count
0             100             150  10333


In [26]:

def sample_between(a, b, shredstart):
  basequery = """
  SELECT 
    answer_count, comment_count, favorite_count,  score, view_count,
    TIMESTAMP_DIFF(last_activity_date, creation_date, DAY) as days_posted,
    IF(accepted_answer_id IS NULL , 0, 1) as accepted
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  
  # Use sampling for initial model development. Once model is developed, shread the entire dataset into  .csv files based on condition in the sampler.
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < {1} AND MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= {0}".format(
            shredstart, shredstart + 10
            )
  sampler2 = "AND {0} >= {1}\n AND {0} < {2}".format(
           "MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), EVERY_N * 100) * {}".format(10),
           (shredstart*10)+a, (shredstart*10)+b
          )
  return "{}\n{}\n{}".format(basequery, sampler, sampler2)


def create_query(phase, EVERY_N, shredstart):
  """Phase: train (70%) valid (15%) or test (15%)"""
  query = ""
  if phase == 'train':
    query = sample_between(0,60, shredstart)
  elif phase == 'valid':
    query = sample_between(60,75, shredstart)
  else:
    query = sample_between(75, 100, shredstart)
  return query.replace("EVERY_N", str(EVERY_N))

#print(create_query('train', 100))
#(answer_count - AVG(answer_count)) / STDDEV_POP(answer_count)  as answer_count,

In [27]:
def to_csv(df, filename):
  outdf = df.copy(deep = True)
  #outdf.loc[:, 'key'] = np.arange(0, len(outdf)) # rownumber as key
  # Reorder columns so that target is first column
  #print(outdf.head())
  #print(df.head())
  cols = outdf.columns.tolist()
  #print(cols)
  cols.remove('accepted')
  cols.insert(0, 'accepted')
  #print(cols)
  outdf = outdf[cols]  
  
  
  #Normalizing input columns  and replace NaN or null
  normalize_cols = outdf.columns.tolist()
  normalize_cols.remove('accepted')
  for normalize_cols_name in normalize_cols:
    outdf[normalize_cols_name].fillna(0, inplace = True)
    outdf[normalize_cols_name] = (outdf[normalize_cols_name] - outdf[normalize_cols_name].mean())  / outdf[normalize_cols_name].std() 
  #print(outdf)
  #print(outdf['answer_count'] )
  outdf.to_csv(filename,  header = True, index_label = False, index = False)
  print("Wrote {} to {}".format(len(outdf), filename))

In [28]:

for phase in ['train', 'valid', 'test']:
  for x in range(10):
    query = create_query(phase, 100, x*10)
    #print(query)
    df = bq.Query(query).execute().result().to_dataframe()
    #print(df.head())
    to_csv(df, 'stackoverflow-{}-{}.csv'.format(phase,(x+1)*10))

Wrote 10186 to stackoverflow-train-10.csv
Wrote 10333 to stackoverflow-train-20.csv
Wrote 10426 to stackoverflow-train-30.csv
Wrote 10260 to stackoverflow-train-40.csv
Wrote 10298 to stackoverflow-train-50.csv
Wrote 10401 to stackoverflow-train-60.csv
Wrote 10276 to stackoverflow-train-70.csv
Wrote 10249 to stackoverflow-train-80.csv
Wrote 10291 to stackoverflow-train-90.csv
Wrote 10332 to stackoverflow-train-100.csv
Wrote 3500 to stackoverflow-valid-10.csv
Wrote 3453 to stackoverflow-valid-20.csv
Wrote 3367 to stackoverflow-valid-30.csv
Wrote 3573 to stackoverflow-valid-40.csv
Wrote 3482 to stackoverflow-valid-50.csv
Wrote 3476 to stackoverflow-valid-60.csv
Wrote 3431 to stackoverflow-valid-70.csv
Wrote 3496 to stackoverflow-valid-80.csv
Wrote 3469 to stackoverflow-valid-90.csv
Wrote 3354 to stackoverflow-valid-100.csv
Wrote 3352 to stackoverflow-test-10.csv
Wrote 3476 to stackoverflow-test-20.csv
Wrote 3439 to stackoverflow-test-30.csv
Wrote 3408 to stackoverflow-test-40.csv
Wrote 35

##### Refactor by not using sampling and creating large shreaded datasets (check size)

In [30]:
!ls -l *.csv

-rw-r--r-- 1 root root  421033 May 16 01:32 stackoverflow-test-100.csv
-rw-r--r-- 1 root root  414416 May 16 01:31 stackoverflow-test-10.csv
-rw-r--r-- 1 root root  429912 May 16 01:31 stackoverflow-test-20.csv
-rw-r--r-- 1 root root  427740 May 16 01:31 stackoverflow-test-30.csv
-rw-r--r-- 1 root root  418516 May 16 01:31 stackoverflow-test-40.csv
-rw-r--r-- 1 root root  440698 May 16 01:32 stackoverflow-test-50.csv
-rw-r--r-- 1 root root  421966 May 16 01:32 stackoverflow-test-60.csv
-rw-r--r-- 1 root root  429984 May 16 01:32 stackoverflow-test-70.csv
-rw-r--r-- 1 root root  444369 May 16 01:32 stackoverflow-test-80.csv
-rw-r--r-- 1 root root  427138 May 16 01:32 stackoverflow-test-90.csv
-rw-r--r-- 1 root root 1286710 May 16 01:30 stackoverflow-train-100.csv
-rw-r--r-- 1 root root 1272065 May 16 01:29 stackoverflow-train-10.csv
-rw-r--r-- 1 root root 1284726 May 16 01:29 stackoverflow-train-20.csv
-rw-r--r-- 1 root root 1312253 May 16 01:29 stackoverflow-train-30.csv


In [31]:
%bash
head stackoverflow-test-100.csv

accepted,answer_count,comment_count,favorite_count,score,view_count,days_posted
0,-1.0411391480812462,-0.367090763865624,-0.16657359952844625,-0.17285273952546185,-0.19863303033934546,-0.36901565664583064
1,0.3044971393420383,0.0008645057730129897,-0.16657359952844625,-0.08359529668588077,0.0982786543917133,-0.36901565664583064
1,-0.3683210043696039,0.7367750450502869,-0.16657359952844625,0.005662146153700297,-0.1545800154267018,-0.07929916822667062
1,-0.3683210043696039,-0.367090763865624,-0.16657359952844625,-0.351367625204624,0.03017964413229615,-0.36901565664583064
1,-0.3683210043696039,0.0008645057730129897,0.4539768451043321,-0.08359529668588077,0.19286583140031063,0.33218222112228135
1,0.3044971393420383,0.0008645057730129897,0.4539768451043321,-0.08359529668588077,0.07056940194132977,-0.35431989274051096
0,-0.3683210043696039,-0.367090763865624,0.14370162278794293,-0.17285273952546185,-0.19534549191302877,-0.36901565664583064
0,-1.0411391480812462,0.36881977541164995,-0.1665735

# tf.estimator modeling

In [32]:
# Ensure that we have TensorFlow 1.13.1 installed.
!pip3 freeze | grep tensorflow==1.13.1 || pip3 install tensorflow==1.13.1

Collecting tensorflow==1.13.1
[?25l  Downloading https://files.pythonhosted.org/packages/ca/f2/0931c194bb98398017d52c94ee30e5e1a4082ab6af76e204856ff1fdb33e/tensorflow-1.13.1-cp35-cp35m-manylinux1_x86_64.whl (92.5MB)
[K    100% |████████████████████████████████| 92.5MB 296kB/s eta 0:00:01  2% |▉                               | 2.4MB 36.0MB/s eta 0:00:03    8% |██▋                             | 7.4MB 36.1MB/s eta 0:00:03    9% |███                             | 8.9MB 35.3MB/s eta 0:00:03    11% |███▋                            | 10.4MB 32.7MB/s eta 0:00:03    12% |████▏                           | 11.9MB 28.9MB/s eta 0:00:03    22% |███████▎                        | 21.0MB 30.4MB/s eta 0:00:03    25% |████████▎                       | 24.0MB 36.1MB/s eta 0:00:02    27% |████████▉                       | 25.5MB 35.7MB/s eta 0:00:02    48% |███████████████▌                | 44.9MB 15.5MB/s eta 0:00:04    49% |████████████████                | 46.2MB 14.5MB/s eta 0:00:04    56% |█████████

In [33]:
import tensorflow as tf
import pandas as pd
import shutil

print(tf.__version__)

  from ._conv import register_converters as _register_converters


1.13.1


In [34]:
!ls -l *.csv

-rw-r--r-- 1 root root  421033 May 16 01:32 stackoverflow-test-100.csv
-rw-r--r-- 1 root root  414416 May 16 01:31 stackoverflow-test-10.csv
-rw-r--r-- 1 root root  429912 May 16 01:31 stackoverflow-test-20.csv
-rw-r--r-- 1 root root  427740 May 16 01:31 stackoverflow-test-30.csv
-rw-r--r-- 1 root root  418516 May 16 01:31 stackoverflow-test-40.csv
-rw-r--r-- 1 root root  440698 May 16 01:32 stackoverflow-test-50.csv
-rw-r--r-- 1 root root  421966 May 16 01:32 stackoverflow-test-60.csv
-rw-r--r-- 1 root root  429984 May 16 01:32 stackoverflow-test-70.csv
-rw-r--r-- 1 root root  444369 May 16 01:32 stackoverflow-test-80.csv
-rw-r--r-- 1 root root  427138 May 16 01:32 stackoverflow-test-90.csv
-rw-r--r-- 1 root root 1286710 May 16 01:30 stackoverflow-train-100.csv
-rw-r--r-- 1 root root 1272065 May 16 01:29 stackoverflow-train-10.csv
-rw-r--r-- 1 root root 1284726 May 16 01:29 stackoverflow-train-20.csv
-rw-r--r-- 1 root root 1312253 May 16 01:29 stackoverflow-train-30.csv


In [None]:
'''
# Before reading csv was incorporated using pandas dataframe
# But now reading csv is incorporated using tensorflow and so it's in the graps and also it reads progressively the shreaded files

df_train = pd.read_csv(filepath_or_buffer = "./stackoverflow-train.csv")
df_valid = pd.read_csv(filepath_or_buffer = "./stackoverflow-valid.csv")
df_test = pd.read_csv(filepath_or_buffer = "./stackoverflow-test.csv")

CSV_COLUMNNAMES = list(df_train) # CSV_COLUMNNAMES = df_train.columns.tolist()
print(CSV_COLUMNNAMES)

FEATURE_NAMES = CSV_COLUMNNAMES[1:]
LABEL_NAME = CSV_COLUMNNAMES[0]
'''

In [None]:
'''featcols = [ tf.feature_column.numeric_column(feat) for feat in  FEATURE_NAMES ]
#print(featcols) '''

In [52]:
CSV_COLUMNS = ['accepted', 'answer_count', 'comment_count', 'favorite_count', 'score', 'view_count', 'days_posted']
DEFAULTS = [[0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]
DEFAULTS = [tf.constant([], dtype=tf.int32),
            tf.constant([], dtype=tf.float32)],

            ]


def read_dataset(filename, mode, batch_size = 512):
  def decode_line(row):
    cols = tf.decode_csv(row, record_defaults = DEFAULTS)
    #print(cols)
    features = dict(zip(CSV_COLUMNS,cols))
    #print(cols)
    label = features.pop('accepted')  # remove label from features and store
    return features, label
  
  # Create list of file names that match "glob" pattern (i.e. data_file_*.csv)
  filenames_dataset = tf.data.Dataset.list_files(filename, shuffle=False)
  # Read lines from text files
  textlines_dataset = filenames_dataset.flat_map(tf.data.TextLineDataset)
  # Parse text lines as comma-separated values (CSV)
  dataset = textlines_dataset.map(decode_line)
  
  # Note:
  # use tf.data.Dataset.flat_map to apply one to many transformations (here: filename -> text lines)
  # use tf.data.Dataset.map      to apply one to one  transformations (here: text line -> feature list)
  
  if(mode == tf.estimator.ModeKeys.TRAIN):
    num_epochs = None  # loop indefinitely
    dataset = dataset.shuffle(buffer_size = 10*batch_size, seed=2)
  else:
    num_epochs = 1
  
  dataset = dataset.repeat(num_epochs).batch(batch_size)
  return dataset

  
def get_train_input_fn():
  return read_dataset('./stackoverflow-train-10.csv', tf.estimator.ModeKeys.TRAIN)

def get_valid_input_fn():
  return read_dataset('./stackoverflow-valid-10.csv', tf.estimator.ModeKeys.EVAL)

def get_test_input_fn():
  return read_dataset('./stackoverflow-test-10.csv', tf.estimator.ModeKeys.PREDICT)

In [53]:
FEATURE_NAMES = CSV_COLUMNS[1:]
LABEL_NAME = CSV_COLUMNS[0]

featcols = [ tf.feature_column.numeric_column(feat) for feat in  FEATURE_NAMES ]
#print(featcols)

In [54]:
%%time
OUTDIR = "stackoverflow_model"

tf.logging.set_verbosity(tf.logging.INFO)
shutil.rmtree(path = OUTDIR, ignore_errors = True)

model = tf.estimator.DNNClassifier(
    hidden_units = [1024, 512, 128, 32],  # specify neural architecture
    feature_columns = featcols,
    n_classes=2,
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001),
    model_dir = OUTDIR,
    config = tf.estimator.RunConfig(tf_random_seed = 1)  
  )

model.train(
    input_fn = lambda : get_train_input_fn(),
    steps = 200
  )

INFO:tensorflow:Using config: {'_eval_distribute': None, '_num_worker_replicas': 1, '_keep_checkpoint_max': 5, '_tf_random_seed': 1, '_task_type': 'worker', '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f83d0829358>, '_model_dir': 'stackoverflow_model', '_num_ps_replicas': 0, '_protocol': None, '_experimental_distribute': None, '_save_summary_steps': 100, '_master': '', '_global_id_in_cluster': 0, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_service': None, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_train_distribute': None, '_is_chief': True, '_task_id': 0, '_log_step_count_steps': 100, '_device_fn': None, '_save_checkpoints_steps': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running l

InvalidArgumentError: Field 0 in record 0 is not a valid int32: accepted
	 [[{{node DecodeCSV}}]]
	 [[node IteratorGetNext (defined at /usr/local/envs/py3env/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/util.py:110) ]]

In [None]:
def validate_rmse(model, df_validation):
  metrices = model.evaluate(input_fn = lambda : get_valid_input_fn() )
  print("RMSE on dataset = {}".format(metrices["average_loss"]**.5))

#validate_rmse(model, df_train)
validate_rmse(model, df_valid)

on training set evaluate

INFO:tensorflow:Saving dict for global step 500: accuracy = 0.6811679, accuracy_baseline = 0.5281519, auc = 0.7331483, auc_precision_recall = 0.7117538, average_loss = 0.56418276, global_step = 500, label/mean = 0.5281519, loss = 71.741234, precision = 0.6524123, prediction/mean = 0.48095724, recall = 0.8482496
RMSE on dataset = 0.7511210011251841

In [None]:
raw_predictions = model.predict( input_fn = lambda : get_test_input_fn() )

#print(next(raw_predictions))
#print(next(raw_predictions))
#print(next(raw_predictions))
# class_ids determine the prediction

predictions = [p['class_ids'][0] for p in raw_predictions]

#confusion_matrix = tf.confusion_matrix(df_test['accepted'], predictions)
#print(confusion_matrix)

In [None]:
df_test_predictions = df_test.copy(deep = True)
df_test_predictions['accepted'] = predictions

In [None]:
#sns.set(style="ticks", color_codes=True)
sns.pairplot(df_test, hue="accepted", palette="husl")

In [None]:
sns.pairplot(df_test_predictions, hue="accepted", palette="husl")