<a href="https://colab.research.google.com/github/alexei-clay/SCEC_2019/blob/master/random_sampling_LR_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random sampling of eqdata.csv




### Goals: 
- Get a random representation of the dataset for training and testing a Linear Regression model


### Implementation: 
- Select randomly WITH replacement
- The total number of samples drawn is the total number of rows divided by the sample size (input rows + output rows per sample)
* Input data for the model is in 100 year bins
* Output data for the model is in 30 year bins
*  Change variable values as needed



### Notes about the dataset

#### *Each row in eqdata.csv is 10 years, therefore*: 
- 10 rows gives us 100 year bins
- 3 rows gives us 30 year (output) bins

In [0]:
import numpy as np
import pandas as pd

# Load the earthquake dataset directly from the project's GitHub repository.
# Each row of eqdata.csv represents 10 years (see the dataset notes in the
# markdown section of this notebook).
df = pd.read_csv('https://raw.githubusercontent.com/alexei-clay/SCEC_2019/master/data/eqdata.csv')

In [113]:
# Sampling configuration. Each row of eqdata.csv covers 10 years, so:
#   bin_size        = 10 rows -> one 100-year input window
#   out_sample_size =  3 rows -> one 30-year output window
bin_size = 10
out_sample_size = 3
sample_size = bin_size + out_sample_size  # rows used by one input+output sample

# Total number of rows available in the dataset (displayed as the cell output)
num_rows = len(df)
num_rows

99659

In [0]:
import random

# Seed the generator so that Restart & Run All reproduces the same sample;
# change or remove the seed to draw a different random sample.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Windows are not disjoint (sampling WITH replacement), but cap the number of
# samples at the number of rows divided by the rows used per sample.
num_of_samples = num_rows // sample_size

# Pick one example window to inspect in the next cells.
# random.randint includes BOTH endpoints, so the largest valid start row is
# num_rows - sample_size; a later start would run past the end of the data
# and produce a truncated (or empty) window.
lower_row_range = random.randint(0, num_rows - sample_size)
upper_row_range = lower_row_range + bin_size - 1            # last input row (inclusive)
out_d_low_range = upper_row_range + 1                       # first output row
out_d_upper_range = out_d_low_range + out_sample_size - 1   # last output row (inclusive)


In [115]:
# Show the inclusive row bounds of the example input window, then of the
# example output window, followed by the number of samples to draw.
for first_row, last_row in ((lower_row_range, upper_row_range),
                            (out_d_low_range, out_d_upper_range)):
  print(first_row, last_row)
print('num of samples: ', num_of_samples)

41048 41057
41058 41060
num of samples:  7666


In [116]:
# Sanity-check the example input window: rows lower_row_range through
# upper_row_range (inclusive), displayed without the 'Index' column.
in_data = df[lower_row_range:upper_row_range + 1].drop(columns=["Index"])
in_data

Unnamed: 0,San Andreas (Carrizo) rev,San Andreas (Cholame) rev,San Andreas (Mojave S),San Andreas (Coachella) rev,San Jacinto (Anza) rev,Garlock (West)
41048,5,5,10,30,8,26
41049,6,6,11,31,9,27
41050,7,7,12,32,10,28
41051,8,8,13,33,11,29
41052,9,9,14,0,12,30
41053,10,10,15,1,13,31
41054,11,11,16,2,14,32
41055,12,12,17,3,15,33
41056,13,13,18,4,16,34
41057,14,14,19,5,17,35


In [117]:
# Sanity-check the example output window: rows out_d_low_range through
# out_d_upper_range (inclusive), displayed without the 'Index' column.
out_data = df[out_d_low_range:out_d_upper_range + 1].drop(columns=["Index"])
out_data

Unnamed: 0,San Andreas (Carrizo) rev,San Andreas (Cholame) rev,San Andreas (Mojave S),San Andreas (Coachella) rev,San Jacinto (Anza) rev,Garlock (West)
41058,0,0,0,6,18,36
41059,1,1,1,7,19,37
41060,2,2,2,8,20,38


In [0]:
# Fault columns to aggregate; the 'Index' column is unnecessary and left out.
col = [name for name in df.columns if name != "Index"]


def count_ruptures(window):
  """Count, per fault column in `col`, the rows of `window` whose value is 0.

  This notebook treats a 0 entry as a rupture in that row, so the result is
  the number of ruptures for each fault inside the window. Values are cast
  to int before the comparison.

  Parameters
  ----------
  window : pandas.DataFrame
      A slice of `df` containing at least the columns listed in `col`.

  Returns
  -------
  list of int
      One count per entry of `col`, in the same order.
  """
  return (window[col].astype(int) == 0).sum().tolist()


# General idea:
# 1. Draw a random start row for an input window and the output window that
#    immediately follows it.
# 2. Count the ruptures per fault in each window.
# 3. Collect the counts as plain records and build the two dataframes once at
#    the end (growing a DataFrame row by row is quadratic, and
#    DataFrame.append no longer exists in pandas >= 2.0).
# 4. The two dataframes are joined in a later cell.

# random.randint includes BOTH endpoints, so this is the last start row that
# still leaves a complete input + output window inside the data.
max_start_row = num_rows - sample_size

in_records = []
out_records = []
for _ in range(num_of_samples):

  # Inclusive row bounds of the input window and of the output window after it
  lower_row_range = random.randint(0, max_start_row)
  upper_row_range = lower_row_range + bin_size - 1
  out_d_low_range = upper_row_range + 1
  out_d_upper_range = out_d_low_range + out_sample_size - 1

  # Get the data from the proper (positional) row ranges
  in_data = df.iloc[lower_row_range:upper_row_range + 1]
  out_data = df.iloc[out_d_low_range:out_d_upper_range + 1]

  # One record of rupture counts per window
  in_records.append(count_ruptures(in_data))
  out_records.append(count_ruptures(out_data))

# One row per sample, one column per fault
i_data = pd.DataFrame(in_records, columns=col).astype('int32')
o_data = pd.DataFrame(out_records, columns=col).astype('int32')
    

In [119]:
# Number of output samples collected
len(o_data)

7666

In [120]:
# Make sure the shapes match. The column-wise merge further down aligns on the
# row index, so a length mismatch would silently introduce NaN values; fail
# loudly here instead of relying on comparing two printed numbers by eye.
assert i_data.shape[0] == o_data.shape[0], "input/output sample counts differ"
i_data.shape[0]

7666

In [0]:
# Give the output columns short '<fault>_out' names so they stay
# distinguishable from the input columns once both frames are merged.
output_column_names = {
  "San Andreas (Carrizo) rev": 'Car_out',
  "San Andreas (Cholame) rev": 'Cho_out',
  "San Andreas (Mojave S)": 'Moj_out',
  "San Andreas (Coachella) rev": 'Coa_out',
  "San Jacinto (Anza) rev": 'Anz_out',
  "Garlock (West)": 'Gar_out',
}
o_data = o_data.rename(columns=output_column_names)

In [122]:
# Place the input counts and the output counts side by side, one row per sample
frames_to_merge = [i_data, o_data]
result = pd.concat(frames_to_merge, axis="columns")
result

Unnamed: 0,San Andreas (Carrizo) rev,San Andreas (Cholame) rev,San Andreas (Mojave S),San Andreas (Coachella) rev,San Jacinto (Anza) rev,Garlock (West),Car_out,Cho_out,Moj_out,Coa_out,Anz_out,Gar_out
0,0,0,1,0,1,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,0,0
2,1,1,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,1,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,0,0
6,0,1,0,1,0,0,0,0,0,0,0,0
7,0,0,1,1,0,0,0,0,0,0,0,0
8,0,0,0,1,1,0,1,1,1,0,0,0
9,1,1,0,0,1,0,0,0,0,0,0,0


#### Make into a csv file

In [0]:
# Each row of eqdata.csv spans 10 years, so convert the window sizes (in rows)
# back to years for the file name. With the default bin_size = 10 and
# out_sample_size = 3 the name is still "random_sampling_100_to_30".
# NOTE: the name intentionally keeps no ".csv" extension so that anything
# already reading this file by name keeps working.
YEARS_PER_ROW = 10
output_name = "random_sampling_{}_to_{}".format(bin_size * YEARS_PER_ROW,
                                                out_sample_size * YEARS_PER_ROW)
result.to_csv(output_name)