# Extract train-dev-test csv datasets from BigQuery using dataframe

In [1]:
import google.datalab.bigquery as bq
import pandas as pd
import numpy as np
import seaborn as sns
import shutil

In [48]:
def test_sample():
  basequery = """
  SELECT MAX(farmhash) as max_farmhash, COUNT(answer_count) as count
  FROM
  (
  SELECT 
    MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100)  as farmhash, answer_count
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < 20 AND  MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= 10  )"
  return "{}\n{}".format(basequery, sampler)

EVERY_N = 100
query_maxhash = test_sample().replace("EVERY_N", str(EVERY_N))
df_maxhash = bq.Query(query_maxhash).execute().result().to_dataframe()
print(df_maxhash)

   max_farmhash  count
0            19  17262


In [43]:

def sample_between(a, b, shredstart):
  basequery = """
  SELECT 
    answer_count, comment_count, favorite_count,  score, view_count,
    TIMESTAMP_DIFF(last_activity_date, creation_date, DAY) as days_posted,
    IF(accepted_answer_id IS NULL , 0, 1) as accepted
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  """
  
  # Use sampling for initial model development. Once model is developed, shread the entire dataset into  .csv files based on condition in the sampler.
  sampler = "WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) < {1} AND MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), EVERY_N * 100) >= {0}".format(
            shredstart, shredstart + 10
            )
  sampler2 = "AND {0} >= {1}\n AND {0} < {2}".format(
           "MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), EVERY_N * 100) * {}".format(10),
           a, b
          )
  return "{}\n{}\n{}".format(basequery, sampler, sampler2)


def create_query(phase, EVERY_N, shredstart):
  """Phase: train (70%) valid (15%) or test (15%)"""
  query = ""
  if phase == 'train':
    query = sample_between(0,70, shredstart)
  elif phase == 'valid':
    query = sample_between(70,85, shredstart)
  else:
    query = sample_between(85,100, shredstart)
  return query.replace("EVERY_N", str(EVERY_N))

#print(create_query('train', 100))
#(answer_count - AVG(answer_count)) / STDDEV_POP(answer_count)  as answer_count,

In [44]:
def to_csv(df, filename):
  outdf = df.copy(deep = True)
  #outdf.loc[:, 'key'] = np.arange(0, len(outdf)) # rownumber as key
  # Reorder columns so that target is first column
  #print(outdf.head())
  #print(df.head())
  cols = outdf.columns.tolist()
  print(cols)
  cols.remove('accepted')
  cols.insert(0, 'accepted')
  print(cols)
  outdf = outdf[cols]  
  
  
  #Normalizing input columns  and replace NaN or null
  normalize_cols = outdf.columns.tolist()
  normalize_cols.remove('accepted')
  for normalize_cols_name in normalize_cols:
    outdf[normalize_cols_name].fillna(0, inplace = True)
    outdf[normalize_cols_name] = (outdf[normalize_cols_name] - outdf[normalize_cols_name].mean())  / outdf[normalize_cols_name].std() 
  #print(outdf)
  #print(outdf['answer_count'] )
  outdf.to_csv(filename,  header = True, index_label = False, index = False)
  print("Wrote {} to {}".format(len(outdf), filename))

In [47]:

for phase in ['train', 'valid', 'test']:
  for x in range(10):
    query = create_query(phase, 100, x*10)
    print(query)
    df = bq.Query(query).execute().result().to_dataframe()
    print(df.head())
    to_csv(df, 'stackoverflow-{}-{}.csv'.format(phase,(x+1)*10))


  SELECT 
    answer_count, comment_count, favorite_count,  score, view_count,
    TIMESTAMP_DIFF(last_activity_date, creation_date, DAY) as days_posted,
    IF(accepted_answer_id IS NULL , 0, 1) as accepted
  FROM 
    `bigquery-public-data.stackoverflow.posts_questions`
  
WHERE MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), 100 * 100) < 10 AND MOD(ABS(FARM_FINGERPRINT(CAST(id as STRING))), 100 * 100) >= 0
AND MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), 100 * 100) * 10 >= 0
 AND MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))), 100 * 100) * 10 < 70
   answer_count  comment_count  favorite_count  score  view_count  \
0             7              1             2.0     11       20478   
1             8              4             2.0      8        6964   
2            11              4             NaN      0         870   
3             8              0            43.0     88      140793   
4             9              6             3.0     11         499   

   days_posted  accepted  


ValueError: list.remove(x): x not in list

##### Sampling dataset (check size)

In [None]:
!ls -l *.csv

##### Removing sampling and creating datasets (check size)

In [None]:
!ls -l *.csv

In [None]:
%bash
head stackoverflow-train.csv

# tf.estimator modeling

In [None]:
# Ensure that we have TensorFlow 1.13.1 installed.
!pip3 freeze | grep tensorflow==1.13.1 || pip3 install tensorflow==1.13.1

In [None]:
import tensorflow as tf
import pandas as pd
import shutil

print(tf.__version__)

In [None]:
!ls -l *.csv

In [None]:
df_train = pd.read_csv(filepath_or_buffer = "./stackoverflow-train.csv")
df_valid = pd.read_csv(filepath_or_buffer = "./stackoverflow-valid.csv")
df_test = pd.read_csv(filepath_or_buffer = "./stackoverflow-test.csv")

CSV_COLUMNNAMES = list(df_train) # CSV_COLUMNNAMES = df_train.columns.tolist()
print(CSV_COLUMNNAMES)

FEATURE_NAMES = CSV_COLUMNNAMES[1:]
LABEL_NAME = CSV_COLUMNNAMES[0]

In [None]:
featcols = [ tf.feature_column.numeric_column(feat) for feat in  FEATURE_NAMES ]
#print(featcols)

In [None]:
def pandas_input_3(df, phase, batch_size = 128):
  dataset = tf.data.Dataset.from_tensor_slices(tensors = ( dict(df[FEATURE_NAMES]), df[LABEL_NAME] )  )
  if(phase == 'train'):
    #return tf.estimator.inputs.pandas_input_fn(x = dict(df[FEATURE_NAMES]), y= df[LABEL_NAME], batch_size = batch_size, num_epochs = 10, shuffle = True, queue_capacity = 1000)
    dataset = dataset.shuffle(buffer_size = 1000).repeat(count=None).batch(batch_size = batch_size)
    return dataset
  else:
    #return tf.estimator.inputs.pandas_input_fn(x = dict(df[FEATURE_NAMES]), y= df[LABEL_NAME], batch_size = batch_size)
    dataset = dataset.batch(batch_size = batch_size)
    return dataset

In [None]:
%%time
OUTDIR = "stack_trained"

tf.logging.set_verbosity(tf.logging.INFO)
shutil.rmtree(path = OUTDIR, ignore_errors = True)

model = tf.estimator.DNNClassifier(
    hidden_units = [1024, 512, 128, 32],  # specify neural architecture
    feature_columns = featcols,
    n_classes=2,
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001),
    model_dir = OUTDIR,
    config = tf.estimator.RunConfig(tf_random_seed = 1)  
  )

model.train(
    input_fn = lambda : pandas_input_3(df = df_train, phase = 'train'),
    steps = 500
  )

In [None]:
def validate_rmse(model, df_validation):
  metrices = model.evaluate(input_fn = lambda : pandas_input_3(df_validation, 'valid') )
  print("RMSE on dataset = {}".format(metrices["average_loss"]**.5))

#validate_rmse(model, df_train)
validate_rmse(model, df_valid)

on training set evaluate

INFO:tensorflow:Saving dict for global step 500: accuracy = 0.6811679, accuracy_baseline = 0.5281519, auc = 0.7331483, auc_precision_recall = 0.7117538, average_loss = 0.56418276, global_step = 500, label/mean = 0.5281519, loss = 71.741234, precision = 0.6524123, prediction/mean = 0.48095724, recall = 0.8482496
RMSE on dataset = 0.7511210011251841

In [None]:
raw_predictions = model.predict( input_fn = lambda : pandas_input_3( df_test , 'test' ) )

#print(next(raw_predictions))
#print(next(raw_predictions))
#print(next(raw_predictions))
# class_ids determine the prediction

predictions = [p['class_ids'][0] for p in raw_predictions]

#confusion_matrix = tf.confusion_matrix(df_test['accepted'], predictions)
#print(confusion_matrix)

In [None]:
df_test_predictions = df_test.copy(deep = True)
df_test_predictions['accepted'] = predictions

In [None]:
#sns.set(style="ticks", color_codes=True)
sns.pairplot(df_test, hue="accepted", palette="husl")

In [None]:
sns.pairplot(df_test_predictions, hue="accepted", palette="husl")