# Shread train-dev-test csv datasets progressively using BigQuery 

In [10]:
import google.datalab.bigquery as bq
import pandas as pd
import numpy as np
import seaborn as sns
import shutil

In [11]:
def test_sample(a,b):
  basequery = """
  SELECT MAX(farmhash) as max_farmhash, COUNT(answer_count) as count
  FROM
  (
  SELECT 
    MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100)  as farmhash, answer_count
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < 20 AND  MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= 10  "
  sampler2 = "AND {0} >= {1}\n AND {0} < {2} )".format(
           "MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), EVERY_N * 100) * {}".format(10),
           a, b
          )
    
  return "{}\n{}\n{}".format(basequery, sampler, sampler2)

EVERY_N = 100
query_maxhash = test_sample(0,70).replace("EVERY_N", str(EVERY_N))
df_maxhash = bq.Query(query_maxhash).execute().result().to_dataframe()
print(df_maxhash)


def test_sample2(a,b):
  basequery = """
  SELECT MIN(farmhash10) as min_farmhash10, MAX(farmhash10) as max_farmhash10, COUNT(answer_count) as count
  FROM
  (  
  SELECT 
    MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100)*10  as farmhash10, answer_count
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < 20 AND  MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= 10  "
  sampler2 = "AND {0} >= {1}\n AND {0} < {2})".format(
           "MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), EVERY_N * 100) * {}".format(10),
           (10*10)+a, (10*10)+b
          )
    
  return "{}\n{}\n{}".format(basequery, sampler, sampler2)
  #return "{}\n{}".format(basequery, sampler)


EVERY_N = 100
queryhash = test_sample2(0,60).replace("EVERY_N", str(EVERY_N))
df_hash = bq.Query(queryhash).execute().result().to_dataframe()
print(df_hash.head())


  max_farmhash  count
0         None      0
   min_farmhash10  max_farmhash10  count
0             100             150  10333


In [12]:

def sample_between(a, b, shredstart):
  basequery = """
  SELECT 
    answer_count, comment_count, favorite_count,  score, view_count,
    TIMESTAMP_DIFF(last_activity_date, creation_date, DAY) as days_posted,
    IF(accepted_answer_id IS NULL , 0, 1 ) as accepted
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  
  # Use sampling for initial model development. Once model is developed, shread the entire dataset into  .csv files based on condition in the sampler.
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < {1} AND MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= {0}".format(
            shredstart, shredstart + 10
            )
  sampler2 = "AND {0} >= {1}\n AND {0} < {2}".format(
           "MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), EVERY_N * 100) * {}".format(10),
           (shredstart*10)+a, (shredstart*10)+b
          )
  return "{}\n{}\n{}".format(basequery, sampler, sampler2)


def create_query(phase, EVERY_N, shredstart):
  """Phase: train (70%) valid (15%) or test (15%)"""
  query = ""
  if phase == 'train':
    query = sample_between(0,60, shredstart)
  elif phase == 'valid':
    query = sample_between(60,75, shredstart)
  else:
    query = sample_between(75, 100, shredstart)
  return query.replace("EVERY_N", str(EVERY_N))

#print(create_query('train', 100))
#(answer_count - AVG(answer_count)) / STDDEV_POP(answer_count)  as answer_count,
#IF(accepted_answer_id IS NULL , cast(0 as int64), cast(1 as int64)) as accepted

In [13]:
def to_csv(df, filename):
  outdf = df.copy(deep = True)
  #outdf.loc[:, 'key'] = np.arange(0, len(outdf)) # rownumber as key
  # Reorder columns so that target is first column
  #print(outdf.head())
  #print(df.head())
  cols = outdf.columns.tolist()
  #print(cols)
  cols.remove('accepted')
  cols.insert(0, 'accepted')
  #print(cols)
  outdf = outdf[cols]  
  
  
  #Normalizing input columns  and replace NaN or null
  normalize_cols = outdf.columns.tolist()
  normalize_cols.remove('accepted')
  for normalize_cols_name in normalize_cols:
    outdf[normalize_cols_name].fillna(0, inplace = True)
    outdf[normalize_cols_name] = (outdf[normalize_cols_name] - outdf[normalize_cols_name].mean())  / outdf[normalize_cols_name].std() 
  #print(outdf)
  #print(outdf['answer_count'] )
  outdf.to_csv(filename,  header = True, index_label = False, index = False)
  print("Wrote {} to {}".format(len(outdf), filename))

In [14]:

for phase in ['train', 'valid', 'test']:
  #for x in range(10):
  for x in range(2):
    query = create_query(phase, 100, x*10)
    #print(query)
    df = bq.Query(query).execute().result().to_dataframe()
    #print(df.head())
    to_csv(df, 'stackoverflow-{}-{}.csv'.format(phase,(x+1)*10))

Wrote 10186 to stackoverflow-train-10.csv
Wrote 10333 to stackoverflow-train-20.csv
Wrote 3500 to stackoverflow-valid-10.csv
Wrote 3453 to stackoverflow-valid-20.csv
Wrote 3352 to stackoverflow-test-10.csv
Wrote 3476 to stackoverflow-test-20.csv


##### Refactor by not using sampling and creating large shreaded datasets (check size)

In [15]:
!ls -l *.csv

-rw-r--r-- 1 root root  412884 May 16 06:56 stackoverflow-test-10.csv
-rw-r--r-- 1 root root  430803 May 16 06:56 stackoverflow-test-20.csv
-rw-r--r-- 1 root root 1272276 May 16 06:56 stackoverflow-train-10.csv
-rw-r--r-- 1 root root 1268321 May 16 06:56 stackoverflow-train-20.csv
-rw-r--r-- 1 root root  434375 May 16 06:56 stackoverflow-valid-10.csv
-rw-r--r-- 1 root root  430956 May 16 06:56 stackoverflow-valid-20.csv


In [18]:
%bash
head stackoverflow-test-100.csv

head: cannot open 'stackoverflow-test-100.csv' for reading: No such file or directory


# tf.estimator modeling

In [9]:
# Ensure that we have TensorFlow 1.13.1 installed.
!pip3 freeze | grep tensorflow==1.13.1 || pip3 install tensorflow==1.13.1

Collecting tensorflow==1.13.1
[?25l  Downloading https://files.pythonhosted.org/packages/ca/f2/0931c194bb98398017d52c94ee30e5e1a4082ab6af76e204856ff1fdb33e/tensorflow-1.13.1-cp35-cp35m-manylinux1_x86_64.whl (92.5MB)
[K    100% |████████████████████████████████| 92.5MB 319kB/s eta 0:00:01  1% |▍                               | 1.2MB 29.5MB/s eta 0:00:04    6% |██▏                             | 6.3MB 35.7MB/s eta 0:00:03    15% |████▉                           | 14.0MB 34.2MB/s eta 0:00:03    20% |██████▌                         | 18.8MB 34.7MB/s eta 0:00:03    22% |███████                         | 20.4MB 28.7MB/s eta 0:00:03    27% |████████▋                       | 25.0MB 34.7MB/s eta 0:00:02    28% |█████████                       | 26.1MB 36.2MB/s eta 0:00:02    36% |███████████▋                    | 33.7MB 33.9MB/s eta 0:00:02    47% |███████████████▏                | 44.0MB 34.1MB/s eta 0:00:02    48% |███████████████▌                | 44.7MB 14.3MB/s eta 0:00:04    49% |███████

In [2]:
import tensorflow as tf
import pandas as pd
import shutil

print(tf.__version__)

  from ._conv import register_converters as _register_converters


1.13.1


In [3]:
#tf.enable_eager_execution()

In [3]:
!ls -l *.csv

-rw-r--r-- 1 root root  423423 May 16 06:13 stackoverflow-test-100.csv
-rw-r--r-- 1 root root  414416 May 16 06:12 stackoverflow-test-10.csv
-rw-r--r-- 1 root root  428712 May 16 06:12 stackoverflow-test-20.csv
-rw-r--r-- 1 root root  427717 May 16 06:12 stackoverflow-test-30.csv
-rw-r--r-- 1 root root  418516 May 16 06:12 stackoverflow-test-40.csv
-rw-r--r-- 1 root root  440698 May 16 06:12 stackoverflow-test-50.csv
-rw-r--r-- 1 root root  424388 May 16 06:12 stackoverflow-test-60.csv
-rw-r--r-- 1 root root  429867 May 16 06:12 stackoverflow-test-70.csv
-rw-r--r-- 1 root root  441008 May 16 06:13 stackoverflow-test-80.csv
-rw-r--r-- 1 root root  427138 May 16 06:13 stackoverflow-test-90.csv
-rw-r--r-- 1 root root 1273856 May 16 06:11 stackoverflow-train-100.csv
-rw-r--r-- 1 root root 1272276 May 16 06:09 stackoverflow-train-10.csv
-rw-r--r-- 1 root root 1269927 May 16 06:10 stackoverflow-train-20.csv
-rw-r--r-- 1 root root 1305190 May 16 06:10 stackoverflow-train-30.csv


In [None]:
'''
# Before reading csv was incorporated using pandas dataframe
# But now reading csv is incorporated using tensorflow and so it's in the graps and also it reads progressively the shreaded files

df_train = pd.read_csv(filepath_or_buffer = "./stackoverflow-train.csv")
df_valid = pd.read_csv(filepath_or_buffer = "./stackoverflow-valid.csv")
df_test = pd.read_csv(filepath_or_buffer = "./stackoverflow-test.csv")

CSV_COLUMNNAMES = list(df_train) # CSV_COLUMNNAMES = df_train.columns.tolist()
print(CSV_COLUMNNAMES)

FEATURE_NAMES = CSV_COLUMNNAMES[1:]
LABEL_NAME = CSV_COLUMNNAMES[0]
'''

In [25]:
# Debugging  issue:  Field 0 in record 0 is not a valid int32: accepted
#	 [[{{node DecodeCSV}}]]
#	 [[node IteratorGetNext (defined at /usr/local/envs/py3env/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/util.py:110) ]]

df_train = pd.read_csv(filepath_or_buffer = "stackoverflow-test-10.csv")
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3352 entries, 0 to 3351
Data columns (total 7 columns):
accepted          3352 non-null int64
answer_count      3352 non-null float64
comment_count     3352 non-null float64
favorite_count    3352 non-null float64
score             3352 non-null float64
view_count        3352 non-null float64
days_posted       3352 non-null float64
dtypes: float64(6), int64(1)
memory usage: 183.4 KB


In [None]:
'''featcols = [ tf.feature_column.numeric_column(feat) for feat in  FEATURE_NAMES ]
#print(featcols) '''

In [33]:
CSV_COLUMNS = ['accepted', 'answer_count', 'comment_count', 'favorite_count', 'score', 'view_count', 'days_posted']
#DEFAULTS = [[], [], [], [], [], [], []]
DEFAULTS = [[0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]

#DEFAULTS = [tf.constant([0], dtype=tf.int32),
#            tf.constant([0.0], dtype=tf.float32),
#            tf.constant([0.0], dtype=tf.float32),
#           tf.constant([0.0], dtype=tf.float32),
#            tf.constant([0.0], dtype=tf.float32),
#            tf.constant([0.0], dtype=tf.float32),
#            tf.constant([0.0], dtype=tf.float32) ]


def read_dataset(filename, mode, batch_size = 512):
  def decode_line(row):
    print(row)
    cols = tf.decode_csv(row, record_defaults = DEFAULTS)
    print(cols)
    features = dict(zip(CSV_COLUMNS,cols))
    print(cols)
    label = features.pop('accepted')  # remove label from features and store
    print("features: {} \n label: {}".format(features, label))
    return features, label
  
  # Create list of file names that match "glob" pattern (i.e. data_file_*.csv)
  filenames_dataset = tf.data.Dataset.list_files(filename, shuffle=False)
  # Read lines from text files
  textlines_dataset = filenames_dataset.flat_map(tf.data.TextLineDataset).skip(1)
  # Parse text lines as comma-separated values (CSV)
  dataset = textlines_dataset.map(decode_line)
  
  # Note:
  # use tf.data.Dataset.flat_map to apply one to many transformations (here: filename -> text lines)
  # use tf.data.Dataset.map      to apply one to one  transformations (here: text line -> feature list)
  
  if(mode == tf.estimator.ModeKeys.TRAIN):
    num_epochs = None  # loop indefinitely
    dataset = dataset.shuffle(buffer_size = 10*batch_size, seed=2)
  else:
    num_epochs = 1
  
  dataset = dataset.repeat(num_epochs).batch(batch_size)
  return dataset

  
def get_train_input_fn():
  return read_dataset('./stackoverflow-train-10.csv', tf.estimator.ModeKeys.TRAIN)

def get_valid_input_fn():
  return read_dataset('./stackoverflow-valid-10.csv', tf.estimator.ModeKeys.EVAL)

def get_test_input_fn():
  return read_dataset('./stackoverflow-test-10.csv', tf.estimator.ModeKeys.PREDICT)

get_train_input_fn()

Tensor("arg0:0", shape=(), dtype=string)
[<tf.Tensor 'DecodeCSV:0' shape=() dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:3' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:4' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:6' shape=() dtype=float32>]
[<tf.Tensor 'DecodeCSV:0' shape=() dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:3' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:4' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:6' shape=() dtype=float32>]
features: {'comment_count': <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, 'answer_count': <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, 'view_count': <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, 'days_posted': <tf.Tensor 'DecodeCSV:6' shape=() dtyp

<DatasetV1Adapter shapes: ({comment_count: (?,), answer_count: (?,), view_count: (?,), days_posted: (?,), favorite_count: (?,), score: (?,)}, (?,)), types: ({comment_count: tf.float32, answer_count: tf.float32, view_count: tf.float32, days_posted: tf.float32, favorite_count: tf.float32, score: tf.float32}, tf.int32)>

In [34]:
FEATURE_NAMES = CSV_COLUMNS[1:]
LABEL_NAME = CSV_COLUMNS[0]

featcols = [ tf.feature_column.numeric_column(feat) for feat in  FEATURE_NAMES ]
#print(featcols)

In [35]:
%%time
OUTDIR = "stackoverflow_model"

tf.logging.set_verbosity(tf.logging.INFO)
shutil.rmtree(path = OUTDIR, ignore_errors = True)

model = tf.estimator.DNNClassifier(
    hidden_units = [1024, 512, 128, 32],  # specify neural architecture
    feature_columns = featcols,
    n_classes=2,
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001),
    model_dir = OUTDIR,
    config = tf.estimator.RunConfig(tf_random_seed = 1)  
  )

model.train(
    input_fn = lambda : get_train_input_fn(),
    steps = 200
  )

INFO:tensorflow:Using config: {'_protocol': None, '_keep_checkpoint_every_n_hours': 10000, '_train_distribute': None, '_tf_random_seed': 1, '_master': '', '_evaluation_master': '', '_service': None, '_save_checkpoints_steps': None, '_task_type': 'worker', '_model_dir': 'stackoverflow_model', '_save_summary_steps': 100, '_task_id': 0, '_keep_checkpoint_max': 5, '_experimental_distribute': None, '_log_step_count_steps': 100, '_num_worker_replicas': 1, '_global_id_in_cluster': 0, '_is_chief': True, '_device_fn': None, '_num_ps_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff2219579b0>, '_eval_distribute': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_save_checkpoints_secs': 600}
Tensor("arg0:0", shape=(), dtype=string, device=/device:CPU:0)
[<tf.Tensor 'DecodeCSV:0' shape=() dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, <tf.Tensor 'Decod

In [37]:
def validate_rmse(model):
  metrices = model.evaluate(input_fn = lambda : get_valid_input_fn() )
  print("RMSE on dataset = {}".format(metrices["average_loss"]**.5))

#validate_rmse(model, df_train)
validate_rmse(model)

Tensor("arg0:0", shape=(), dtype=string, device=/device:CPU:0)
[<tf.Tensor 'DecodeCSV:0' shape=() dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:3' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:4' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:6' shape=() dtype=float32>]
[<tf.Tensor 'DecodeCSV:0' shape=() dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:3' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:4' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:6' shape=() dtype=float32>]
features: {'comment_count': <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, 'answer_count': <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, 'view_count': <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, 'days_posted': <tf.Tensor 'Deco

on training set evaluate

INFO:tensorflow:Saving dict for global step 500: accuracy = 0.6811679, accuracy_baseline = 0.5281519, auc = 0.7331483, auc_precision_recall = 0.7117538, average_loss = 0.56418276, global_step = 500, label/mean = 0.5281519, loss = 71.741234, precision = 0.6524123, prediction/mean = 0.48095724, recall = 0.8482496
RMSE on dataset = 0.7511210011251841

In [38]:
raw_predictions = model.predict( input_fn = lambda : get_test_input_fn() )

#print(next(raw_predictions))
#print(next(raw_predictions))
#print(next(raw_predictions))
# class_ids determine the prediction

predictions = [p['class_ids'][0] for p in raw_predictions]

#confusion_matrix = tf.confusion_matrix(df_test['accepted'], predictions)
#print(confusion_matrix)

Tensor("arg0:0", shape=(), dtype=string, device=/device:CPU:0)
[<tf.Tensor 'DecodeCSV:0' shape=() dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:3' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:4' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:6' shape=() dtype=float32>]
[<tf.Tensor 'DecodeCSV:0' shape=() dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:3' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:4' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, <tf.Tensor 'DecodeCSV:6' shape=() dtype=float32>]
features: {'comment_count': <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>, 'answer_count': <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>, 'view_count': <tf.Tensor 'DecodeCSV:5' shape=() dtype=float32>, 'days_posted': <tf.Tensor 'Deco

In [39]:
df_test_predictions = df_test.copy(deep = True)
df_test_predictions['accepted'] = predictions

NameError: name 'df_test' is not defined

In [None]:
#sns.set(style="ticks", color_codes=True)
sns.pairplot(df_test, hue="accepted", palette="husl")

In [None]:
sns.pairplot(df_test_predictions, hue="accepted", palette="husl")