## Load libraries
- Python 3.7.3
- pandas 0.24.2
- numpy 1.16.3
- seaborn 0.9.0
- sklearn (scikit-learn) 0.21.2

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import pairwise_kernels

## Read in the data file
The dataset used in this study was compiled from the final, full dataset of the data paper.

In [2]:
# Load the compiled dataset. df_origin holds the table as read from disk;
# df is the working copy that the following cells modify.
df_origin = pd.read_excel('./datasets/dataset.xlsx')
df = df_origin.copy()

# Preview the first rows. Only the last expression of a cell is rendered,
# so the preview has to come last to be shown.
df_origin.head()

In [3]:
# Rows excluded from the study, given as row positions in the table as loaded:
#   297-318: no initial/final C (Lafay et al 2012 - only analyzed H2)
#   547-561: extremely high H2 (Dufaud et al 2009 - we doubt the results from
#            this paper. Feel free to include them in your study)
#
# The selection is built from df_origin instead of from df itself, so running
# this cell a second time cannot remove a further set of rows. On a fresh run
# the result is the same as dropping the two ranges one after the other,
# because the first range lies entirely before the second.
excluded_rows = df_origin.index[297:319].append(df_origin.index[547:562])
df = df_origin.drop(index=excluded_rows).reset_index(drop=True)

## Delete duplicated measurements in each experiment
See the main text for the difference between a measurement and an experiment.
Before normalizing the dataset, we keep a copy of the experiments
(with duplicated measurements removed) with their real values: "Reduced_with-real-values.xlsx".

In [4]:
# Reduce each run of measurements to a single row. A row is kept when
#   * the next row belongs to a different experiment number, or
#   * the next row has the same experiment number but a shorter duration.
# The final row of the table is always kept.
#
# The selection is one boolean mask instead of growing the result with
# DataFrame.append inside a loop: append copied the whole frame on every
# iteration and was removed in pandas 2.0. An empty table now yields an
# empty result instead of raising.
experiment_id = df['No_of_experiments']
duration = df['Duration_hr']

# shift(-1) lines each row up with the row that follows it.
next_is_other_experiment = experiment_id != experiment_id.shift(-1)
next_is_shorter = duration > duration.shift(-1)  # False whenever either side is NaN

keep_row = next_is_other_experiment | next_is_shorter
if len(keep_row):
    keep_row.iloc[-1] = True  # the last row has no successor and is always kept

df2 = df[keep_row].reset_index(drop=True)

In [5]:
# Remove the two columns named below from the reduced table.
unwanted_columns = ['Number of dataline', 'Internal database article number']
df2 = df2.drop(columns=unwanted_columns)

In [6]:
# Replace every missing value with -1, then write the reduced table
# (real, un-normalized values) to disk.
df2 = df2.fillna(value=-1)
df2.to_excel(excel_writer='./datasets/Reduced_with-real-values.xlsx')

## Normalize the dataframe
Rescale every numerical feature to the 0-1 range to reduce bias caused by differences in feature scale.

In [7]:
# Numerical columns that are rescaled to the 0-1 range.
cols = [
    'Temperature_C',
    'Pressure_MPa',
    'Duration_hr',
    'Water_Rock',
    'Grain_size_min',
    'Grain_size_max',
    'Initial_pH',
    'Final_pH',
    'Total_initial_C_mM',
    'H2_mM',
    'CH4_mM',
    'Other_organics_mM',
    'Total_final_organic_C_mM',
]

In [8]:
# Min-max scale each listed column into the 0-1 range. The result goes into a
# copy, so df keeps the real values.
normdf = df.copy()

for feature in cols:
    values = df[feature].astype(float)
    # min()/max() skip missing values, so NaN entries stay NaN after scaling.
    lowest = values.min()
    highest = values.max()
    normdf[feature] = (values - lowest) / (highest - lowest)

In [9]:
# Remove the first two columns of the original table from the normalized frame.
drop_cols = df_origin.columns[:2].tolist()
normdf = normdf.drop(columns=drop_cols)

## Fill missing values with -1
Missing values are replaced with -1 so that the tables can be used in further analysis.

In [10]:
# Replace every missing value with -1, then write the full normalized table to disk.
normdf = normdf.fillna(value=-1)
normdf.to_excel(excel_writer='./datasets/normalized_version.xlsx')

## Delete duplicated measurements in one single experiment for the normalized dataset

In [11]:
# Reduce each run of measurements in the normalized table to a single row.
# A row is kept when
#   * the next row belongs to a different experiment number, or
#   * the next row has the same experiment number but a smaller duration.
# The final row of the table is always kept.
#
# The selection is one boolean mask instead of growing the result with
# DataFrame.append inside a loop: append copied the whole frame on every
# iteration and was removed in pandas 2.0. The mask is derived from normdf
# itself rather than from the length of df, and an empty table yields an
# empty result instead of raising.
#
# NOTE: normdf has already had its missing values replaced with -1, so a
# missing duration takes part in the comparison below as -1.
experiment_id = normdf['No_of_experiments']
duration = normdf['Duration_hr']

# shift(-1) lines each row up with the row that follows it.
next_is_other_experiment = experiment_id != experiment_id.shift(-1)
next_is_shorter = duration > duration.shift(-1)

keep_row = next_is_other_experiment | next_is_shorter
if len(keep_row):
    keep_row.iloc[-1] = True  # the last row has no successor and is always kept

normdf2 = normdf[keep_row].reset_index(drop=True)

In [12]:
# Save the reduced, normalized table.
normdf2.to_excel(excel_writer='./datasets/Reduced_version-normalized.xlsx')

## Calculate cosine similarity matrix and save the file

In [13]:
# Remove the three columns named below, then compute the cosine kernel between
# every pair of remaining rows. The result is a square table with one row and
# one column per row of normdf2.
excluded_columns = ['Duration_hr', 'Authors', 'No_of_experiments']
normdf2 = normdf2.drop(columns=excluded_columns)

cosine_matrix = pairwise_kernels(normdf2, normdf2, metric='cosine')
res = pd.DataFrame(cosine_matrix)

In [14]:
# Save the cosine similarity matrix.
res.to_excel(excel_writer='./datasets/Cos-reduced-normalized.xlsx')