In [1]:
import pandas as pd
import numpy as np

def augment_raman_dataframe(df, num_augmented=4, noise_level=0.02, seed=None):
    """Add Gaussian-noise copies of every spectrum in a wide-format table.

    The first column of ``df`` is treated as the Raman-shift axis and every
    remaining column as one spectrum. Each spectrum is kept unchanged and is
    followed by ``num_augmented`` noisy copies named ``<column>_aug1``,
    ``<column>_aug2``, ... The noise is zero-mean Gaussian with a standard
    deviation of ``noise_level`` times the magnitude of that spectrum's
    maximum value.

    Parameters
    ----------
    df : pandas.DataFrame
        Axis in the first column, one spectrum per remaining column.
    num_augmented : int, default 4
        Number of noisy copies generated per spectrum.
    noise_level : float, default 0.02
        Noise standard deviation as a fraction of the spectrum's maximum.
    seed : int, optional
        When given, noise is drawn from ``np.random.default_rng(seed)`` so the
        result is reproducible. When ``None`` (default) the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        A ``'Raman Shift'`` column followed by each original spectrum and its
        augmented copies. Rows carry the same index as ``df``.
    """
    raman_shift = df.iloc[:, 0]
    spectra = df.iloc[:, 1:]

    # Keep the legacy global generator unless the caller asks for a seed.
    rng = np.random if seed is None else np.random.default_rng(seed)

    augmented_data = {}

    for col in spectra.columns:
        original = spectra[col].values
        # abs(): a spectrum whose maximum is negative would otherwise produce
        # a negative standard deviation, which normal() rejects.
        # The size guard avoids np.max failing on a zero-row table.
        peak = abs(np.max(original)) if original.size else 0.0
        sigma = noise_level * peak

        augmented_data[col] = original
        for i in range(num_augmented):
            noise = rng.normal(0, sigma, size=original.shape)
            augmented_data[f"{col}_aug{i+1}"] = original + noise

    # Reuse the input index and insert the axis positionally: inserting the
    # Series itself aligns on index labels and yields NaN whenever ``df`` does
    # not have a default 0..n-1 index.
    augmented_df = pd.DataFrame(augmented_data, index=df.index)
    augmented_df.insert(0, 'Raman Shift', raman_shift.to_numpy())

    return augmented_df

In [3]:
# Load the experimental spectra table from the mounted Drive folder.
df_exp = pd.read_excel("/content/drive/MyDrive/RSI_Google Colabs/preprocessed_raman_spectra (Experimental).xlsx")

# The augmentation draws Gaussian noise from NumPy's global generator; seed it
# so Restart & Run All reproduces the same augmented spectra and the same
# exported dataset. The value 42 is arbitrary.
np.random.seed(42)
df_exp = augment_raman_dataframe(df_exp)

df_exp.head(5)

Unnamed: 0,Raman Shift,AKSHAYA K-ALPHA FC_BACTERIA_D2.txt,AKSHAYA K-ALPHA FC_BACTERIA_D2.txt_aug1,AKSHAYA K-ALPHA FC_BACTERIA_D2.txt_aug2,AKSHAYA K-ALPHA FC_BACTERIA_D2.txt_aug3,AKSHAYA K-ALPHA FC_BACTERIA_D2.txt_aug4,AMUL TAZA TONED_BACTERIA_D2.txt,AMUL TAZA TONED_BACTERIA_D2.txt_aug1,AMUL TAZA TONED_BACTERIA_D2.txt_aug2,AMUL TAZA TONED_BACTERIA_D2.txt_aug3,...,NANDHINI TONED_BACTERIA_D2.txt,NANDHINI TONED_BACTERIA_D2.txt_aug1,NANDHINI TONED_BACTERIA_D2.txt_aug2,NANDHINI TONED_BACTERIA_D2.txt_aug3,NANDHINI TONED_BACTERIA_D2.txt_aug4,NANDHINI TONED_NON BACTERIA_D2.txt,NANDHINI TONED_NON BACTERIA_D2.txt_aug1,NANDHINI TONED_NON BACTERIA_D2.txt_aug2,NANDHINI TONED_NON BACTERIA_D2.txt_aug3,NANDHINI TONED_NON BACTERIA_D2.txt_aug4
0,400.0,6.089664,26.486849,33.159648,-17.744589,16.223703,23.740307,5.710208,26.334336,18.859959,...,3.742273,-7.162133,25.29816,-9.502863,-13.866792,9.616667,7.690588,-5.472765,16.948539,-11.341072
1,401.58487,13.091442,-0.488594,31.281832,2.914198,46.671628,26.848541,36.794354,24.46852,16.683479,...,1.787166,-14.233719,18.044962,27.354113,12.785308,6.644972,1.766899,-6.737621,-14.429931,-0.219968
2,403.16974,6.568796,-18.927407,12.436756,-4.328633,19.378007,27.049048,34.67425,20.257031,27.28871,...,4.240036,1.211866,-1.486291,3.726417,1.155711,11.794917,21.630211,12.495226,30.639524,-8.519998
3,404.75461,3.083516,9.499396,13.824568,-5.743727,-18.20234,26.569581,18.737272,22.56266,24.815348,...,4.161631,13.186165,9.570031,-2.672143,4.74776,8.37779,1.584651,19.332242,1.217019,5.551357
4,406.33948,2.574583,-37.215119,55.894933,61.608428,10.958661,23.319263,27.013977,4.769408,25.582866,...,5.356433,-12.232185,12.6329,29.438102,4.358498,9.369173,8.806181,10.987614,18.833545,-10.03234


In [5]:
# Load the control spectra and discard the "Unnamed: 0" column
# (presumably a row index saved by an earlier export - verify against the file).
df_control = pd.read_excel(
    "/content/drive/MyDrive/RSI_Google Colabs/preprocessed_raman_spectra.xlsx"
).drop(columns="Unnamed: 0")
df_control.head(5)

Unnamed: 0,Raman Shift,Amul full cream Cold 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul full cream RT 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul full cream RT after 24h 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul full cream after heating 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul full cream basic 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul toned milk Cold 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul toned milk RT 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul toned milk RT after 24h 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Amul toned milk after heating 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,...,Nandani toned milk Cold 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Nandani toned milk RT 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole after heating.txt,Nandani toned milk RT 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Nandani toned milk RT after 24 h 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Nandani toned milk RT after 24h 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Nandani toned milk basic 532nm 50% 600g 50xL 10s 10times 400-3500cm-1 200hole.txt,Nestle toned milk Cold 532nm 50% 600g 50xL 7s 20times 400-3500cm-1 200hole.txt,Nestle toned milk RT 532nm 50% 600g 50xL 7s 20times 400-3500cm-1 200hole.txt,Nestle toned milk after heating 532nm 50% 600g 50xL 7s 20times 400-3500cm-1 200hole.txt,Nestle toned milk basic 532nm 50% 600g 50xL 7s 20times 400-3500cm-1 200hole.txt
0,400.083,999.317107,1073.400953,1161.083306,1720.117063,1191.077263,133.38396,154.076432,155.795956,153.347328,...,773.89701,242.02572,391.479082,520.338648,720.645394,540.634261,121.608921,70.063468,69.403219,119.087542
1,401.858,849.564562,971.07215,1067.165289,1573.60599,1018.053939,-5.150261,-61.022305,-71.257405,-85.498544,...,678.625052,137.401063,272.653611,378.28764,616.74346,435.972607,6.736977,-96.389399,-116.608988,-70.217388
2,403.63,800.020579,975.66905,1092.31869,1604.777756,1004.919175,-64.930197,-72.0239,-99.199339,-119.404417,...,608.1988,144.562117,315.705279,387.159484,555.750092,336.182377,-57.289254,-109.262265,-101.881196,-37.722318
3,405.405,758.339425,954.31449,1083.549199,1590.200901,953.847234,-99.684417,-96.57401,-16.04692,-53.095924,...,661.389669,153.763166,278.351222,414.348457,634.879559,444.854989,28.815941,-67.709324,-81.750432,-33.932896
4,407.176,777.44393,928.957012,1049.373927,1577.492519,1033.12094,13.661434,12.878926,26.394306,-9.461433,...,664.169066,171.675632,292.145716,391.988831,579.34613,413.427572,38.881189,-39.998943,-55.030768,-30.680444


In [7]:
# (rows, columns) of the control table after dropping "Unnamed: 0".
df_control.shape

(2118, 45)

In [8]:
# (rows, columns) of the augmented experimental table; its row count differs
# from the control table, so the two cannot be stacked directly.
df_exp.shape

(1957, 51)

In [13]:
from scipy.interpolate import interp1d

# Shared axis that every spectrum is resampled onto: 400 up to (but not
# including) 3500 in steps of 1.25, i.e. 2480 points.
common_shift = np.arange(400, 3500, 1.25)

def interpolate_all_samples(df, common_shift):
    """Resample every spectrum in ``df`` onto ``common_shift``.

    The first column of ``df`` is the measured axis; each remaining column is
    one spectrum. Spectra are interpolated linearly, and grid points outside
    the measured range are filled by linear extrapolation.

    Parameters
    ----------
    df : pandas.DataFrame
        Axis in the first column, one spectrum per remaining column.
    common_shift : array-like
        Target axis values.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum (index named ``Sample_ID`` holding the original
        column names) and one column ``I_<value>`` per target axis value.
    """
    measured_axis = df.iloc[:, 0].values

    def _resample(intensity):
        # One linear interpolant per spectrum, evaluated on the shared grid.
        interpolant = interp1d(
            measured_axis,
            intensity,
            kind='linear',
            bounds_error=False,
            fill_value="extrapolate",
        )
        return interpolant(common_shift)

    resampled = {name: _resample(df[name].values) for name in df.columns[1:]}

    grid_labels = [f'I_{x}' for x in common_shift]
    interpolated_df = pd.DataFrame.from_dict(resampled, orient='index', columns=grid_labels)
    interpolated_df.index.name = "Sample_ID"

    return interpolated_df

In [16]:
# Resample both tables onto the shared axis so they end up with identical
# columns (one row per sample).
df_exp_interpolated = interpolate_all_samples(df_exp, common_shift)
df_control_interpolated = interpolate_all_samples(df_control, common_shift)

In [18]:
# Stack experimental rows on top of control rows; no group label column is
# added, so the two groups are only distinguishable by their Sample_ID values.
df = pd.concat([df_exp_interpolated, df_control_interpolated])

In [20]:
# Write the combined table (index included) to the Drive folder.
df.to_excel("/content/drive/MyDrive/RSI_Google Colabs/final_augmented_data.xlsx")