In [None]:
#Importing all libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.graph_objects as go
from scipy.interpolate import interp1d
from scipy.sparse import diags, eye, csc_matrix
from scipy.sparse.linalg import spsolve
from scipy.signal import savgol_filter

#Preprocessing and Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer

#Dimensionality Reduction
from sklearn.decomposition import PCA

#Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#Classification
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, LeaveOneOut
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, precision_score, recall_score, f1_score

# Resampling for Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.pipeline import make_pipeline

# Visualization Libraries
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive; the data directories used by later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

**Loading all the Bacteria Absent files**

In [None]:
def load_and_average_intensities(directory):
    """Average the intensity column of every ``.asc`` file in ``directory``.

    Each file is read as whitespace-separated text and the second token of
    every non-blank line is parsed as a float intensity.

    Parameters
    ----------
    directory : str
        Folder scanned (non-recursively) for file names ending in ``.asc``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean across all files (one value per data line).

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.asc`` files.
    """
    # Sorted so the result does not depend on the arbitrary order of os.listdir.
    file_names = sorted(f for f in os.listdir(directory) if f.endswith('.asc'))
    if not file_names:
        raise ValueError(f"No .asc files found in {directory!r}")

    all_intensities = []
    for file_name in file_names:
        filepath = os.path.join(directory, file_name)
        # The handle gets its own name so the loop variable is not shadowed.
        with open(filepath, 'r') as handle:
            # Blank lines (e.g. a trailing newline) carry no intensity and are skipped.
            intensities = np.array([float(line.split()[1]) for line in handle if line.strip()])
        all_intensities.append(intensities)

    # Stack to (n_files, n_points) and average over the file axis.
    return np.mean(np.array(all_intensities), axis=0)

# Mean spectrum over every .asc file in the "Bacteria_A_Only_Yes" folder (bacteria-absent samples).
directory = '/content/drive/MyDrive/New_Data_June6_final/Sifted Data/Bacteria_A_Only_Yes'
absent_average_intensities = load_and_average_intensities(directory) # Calling the function
print(absent_average_intensities)


**Loading all Bacteria Present files**


In [None]:
def load_and_average_intensities(directory):
    """Average the intensity column of every ``.asc`` file in ``directory``.

    Each file is read as whitespace-separated text and the second token of
    every non-blank line is parsed as a float intensity.

    Parameters
    ----------
    directory : str
        Folder scanned (non-recursively) for file names ending in ``.asc``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean across all files (one value per data line).

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.asc`` files.
    """
    # Sorted so the result does not depend on the arbitrary order of os.listdir.
    file_names = sorted(f for f in os.listdir(directory) if f.endswith('.asc'))
    if not file_names:
        raise ValueError(f"No .asc files found in {directory!r}")

    all_intensities = []
    for file_name in file_names:
        filepath = os.path.join(directory, file_name)
        # The handle gets its own name so the loop variable is not shadowed.
        with open(filepath, 'r') as handle:
            # Blank lines (e.g. a trailing newline) carry no intensity and are skipped.
            intensities = np.array([float(line.split()[1]) for line in handle if line.strip()])
        all_intensities.append(intensities)

    # Stack to (n_files, n_points) and average over the file axis.
    return np.mean(np.array(all_intensities), axis=0)


# Mean spectrum over every .asc file in the "Bacteria_P_Only_Yes" folder (bacteria-present samples).
directory = '/content/drive/MyDrive/New_Data_June6_final/Sifted Data/Bacteria_P_Only_Yes'
present_average_intensities = load_and_average_intensities(directory) # Calling the function
print(present_average_intensities)

**Andor Wavenumbers**

In [None]:

# Wavenumber axis (cm^-1) of the Andor system; paired element-by-element with the intensities of each spectrum
andorWaves = [-95.360845, -92.862156, -90.364595, -87.868163, -85.37286, -82.878686, -80.385642,
              -77.893726, -75.402939, -72.913281, -70.424753, -67.937353, -65.451082, -62.96594,
              -60.481928, -57.999044, -55.517289, -53.036664, -50.557167, -48.078799, -45.601561,
              -43.125451, -40.650471, -38.176619, -35.703897, -33.232303, -30.761838, -28.292503,
              -25.824296, -23.357219, -20.89127, -18.426451, -15.962761, -13.500199, -11.038767,
              -8.5784632, -6.1192888, -3.6612434, -1.2043271, 1.2514603, 3.7061186, 6.159648,
              8.6120483, 11.06332, 13.513462, 15.962475, 18.410359, 20.857115, 23.302741,
              25.747238, 28.190606, 30.632846, 33.073956, 35.513937, 37.952789, 40.390512,
              42.827106, 45.262572, 47.696908, 50.130115, 52.562193, 54.993142, 57.422962,
              59.851653, 62.279215, 64.705648, 67.130952, 69.555127, 71.978173, 74.40009,
              76.820878, 79.240537, 81.659067, 84.076468, 86.49274, 88.907883, 91.321897,
              93.734781, 96.146537, 98.557164, 100.96666, 103.37503, 105.78227, 108.18838,
              110.59336, 112.99722, 115.39994, 117.80153, 120.202, 122.60134, 124.99954,
              127.39662, 129.79257, 132.18739, 134.58109, 136.97365, 139.36508, 141.75539,
              144.14456, 146.53261, 148.91953, 151.30531, 153.68997, 156.0735, 158.45591,
              160.83718, 163.21732, 165.59634, 167.97422, 170.35098, 172.72661, 175.1011,
              177.47447, 179.84671, 182.21783, 184.58781, 186.95666, 189.32439, 191.69098,
              194.05645, 196.42078, 198.78399, 201.14607, 203.50702, 205.86684, 208.22554,
              210.5831, 212.93953, 215.29484, 217.64901, 220.00206, 222.35398, 224.70477,
              227.05443, 229.40296, 231.75036, 234.09664, 236.44178, 238.78579, 241.12868,
              243.47044, 245.81107, 248.15056, 250.48893, 252.82618, 255.16229, 257.49727,
              259.83112, 262.16385, 264.49545, 266.82591, 269.15525, 271.48346, 273.81054,
              276.13649, 278.46131, 280.785, 283.10757, 285.429, 287.74931, 290.06849,
              292.38653, 294.70345, 297.01924, 299.3339, 301.64743, 303.95984, 306.27111,
              308.58126, 310.89027, 313.19816, 315.50492, 317.81054, 320.11504, 322.41841,
              324.72066, 327.02177, 329.32175, 331.62061, 333.91833, 336.21493, 338.5104,
              340.80474, 343.09794, 345.39003, 347.68098, 349.9708, 352.25949, 354.54706,
              356.83349, 359.1188, 361.40298, 363.68602, 365.96794, 368.24873, 370.5284,
              372.80693, 375.08433, 377.36061, 379.63575, 381.90977, 384.18265, 386.45441,
              388.72504, 390.99454, 393.26291, 395.53016, 397.79627, 400.06125, 402.32511,
              404.58783, 406.84943, 409.1099, 411.36924, 413.62745, 415.88453, 418.14048,
              420.3953, 422.649, 424.90156, 427.153, 429.40331, 431.65248, 433.90053,
              436.14745, 438.39324, 440.63791, 442.88144, 445.12384, 447.36512, 449.60526,
              451.84428, 454.08217, 456.31893, 458.55456, 460.78906, 463.02243, 465.25467,
              467.48579, 469.71577, 471.94463, 474.17235, 476.39895, 478.62442, 480.84876,
              483.07197, 485.29405, 487.515, 489.73483, 491.95352, 494.17109, 496.38753,
              498.60283, 500.81701, 503.03006, 505.24198, 507.45277, 509.66244, 511.87097,
              514.07837, 516.28465, 518.4898, 520.69381, 522.8967, 525.09846, 527.29909,
              529.49859, 531.69697, 533.89421, 536.09032, 538.28531, 540.47916, 542.67189,
              544.86349, 547.05396, 549.2433, 551.43151, 553.61859, 555.80455, 557.98937,
              560.17307, 562.35563, 564.53707, 566.71738, 568.89656, 571.07461, 573.25153,
              575.42732, 577.60198, 579.77552, 581.94792, 584.1192, 586.28935, 588.45836,
              590.62625, 592.79301, 594.95864, 597.12315, 599.28652, 601.44876, 603.60988,
              605.76986, 607.92872, 610.08645, 612.24305, 614.39852, 616.55286, 618.70607,
              620.85815, 623.00911, 625.15893, 627.30763, 629.45519, 631.60163, 633.74694,
              635.89112, 638.03417, 640.17609, 642.31689, 644.45655, 646.59508, 648.73249,
              650.86877, 653.00391, 655.13793, 657.27082, 659.40258, 661.53321, 663.66272,
              665.79109, 667.91834, 670.04445, 672.16944, 674.2933, 676.41602, 678.53762,
              680.65809, 682.77744, 684.89565, 687.01273, 689.12869, 691.24351, 693.35721,
              695.46977, 697.58121, 699.69152, 701.8007, 703.90875, 706.01568, 708.12147,
              710.22613, 712.32967, 714.43208, 716.53335, 718.6335, 720.73252, 722.83041,
              724.92717, 727.0228, 729.11731, 731.21068, 733.30293, 735.39404, 737.48403,
              739.57289, 741.66062, 743.74722, 745.83269, 747.91703, 750.00024, 752.08233,
              754.16328, 756.24311, 758.32181, 760.39937, 762.47581, 764.55112, 766.6253,
              768.69836, 770.77028, 772.84107, 774.91074, 776.97927, 779.04668, 781.11296,
              783.17811, 785.24213, 787.30502, 789.36678, 791.42741, 793.48692, 795.54529,
              797.60254, 799.65865, 801.71364, 803.7675, 805.82023, 807.87183, 809.9223,
              811.97164, 814.01986, 816.06694, 818.1129, 820.15773, 822.20142, 824.24399,
              826.28543, 828.32574, 830.36492, 832.40298, 834.4399, 836.47569, 838.51036,
              840.5439, 842.5763, 844.60758, 846.63773, 848.66675, 850.69464, 852.72141,
              854.74704, 856.77154, 858.79492, 860.81717, 862.83828, 864.85827, 866.87713,
              868.89486, 870.91146, 872.92693, 874.94128, 876.95449, 878.96658, 880.97753,
              882.98736, 884.99606, 887.00363, 889.01007, 891.01538, 893.01956, 895.02261,
              897.02454, 899.02533, 901.025, 903.02354, 905.02094, 907.01722, 909.01237,
              911.00639, 912.99929, 914.99105, 916.98168, 918.97119, 920.95956, 922.94681,
              924.93293, 926.91792, 928.90178, 930.88451, 932.86611, 934.84658, 936.82593,
              938.80414, 940.78123, 942.75719, 944.73201, 946.70571, 948.67828, 950.64972,
              952.62003, 954.58922, 956.55727, 958.5242, 960.48999, 962.45466, 964.4182,
              966.3806, 968.34188, 970.30204, 972.26106, 974.21895, 976.17571, 978.13135,
              980.08585, 982.03923, 983.99148, 985.9426, 987.89259, 989.84145, 991.78918,
              993.73578, 995.68125, 997.6256, 999.56881, 1001.5109, 1003.4519, 1005.3917,
              1007.3304, 1009.268, 1011.2044, 1013.1397, 1015.0739, 1017.0069, 1018.9389,
              1020.8697, 1022.7993, 1024.7279, 1026.6553, 1028.5816, 1030.5067, 1032.4307,
              1034.3536, 1036.2754, 1038.196, 1040.1155, 1042.0339, 1043.9512, 1045.8673,
              1047.7823, 1049.6961, 1051.6089, 1053.5205, 1055.4309, 1057.3403, 1059.2485,
              1061.1556, 1063.0615, 1064.9664, 1066.8701, 1068.7726, 1070.6741, 1072.5744,
              1074.4736, 1076.3716, 1078.2686, 1080.1644, 1082.059, 1083.9526, 1085.845,
              1087.7363, 1089.6264, 1091.5154, 1093.4033, 1095.2901, 1097.1757, 1099.0602,
              1100.9436, 1102.8259, 1104.707, 1106.587, 1108.4658, 1110.3436, 1112.2202,
              1114.0956, 1115.97, 1117.8432, 1119.7153, 1121.5863, 1123.4561, 1125.3248,
              1127.1924, 1129.0588, 1130.9241, 1132.7883, 1134.6514, 1136.5133, 1138.3741,
              1140.2337, 1142.0923, 1143.9497, 1145.806, 1147.6611, 1149.5152, 1151.368,
              1153.2198, 1155.0704, 1156.92, 1158.7683, 1160.6156, 1162.4617, 1164.3067,
              1166.1506, 1167.9933, 1169.8349, 1171.6754, 1173.5147, 1175.3529, 1177.19,
              1179.026, 1180.8608, 1182.6945, 1184.5271, 1186.3585, 1188.1888, 1190.018,
              1191.8461, 1193.673, 1195.4988, 1197.3235, 1199.147, 1200.9694, 1202.7907,
              1204.6109, 1206.4299, 1208.2478, 1210.0645, 1211.8802, 1213.6947, 1215.5081,
              1217.3203, 1219.1314, 1220.9414, 1222.7503, 1224.558, 1226.3646, 1228.1701,
              1229.9745, 1231.7777, 1233.5798, 1235.3807, 1237.1806, 1238.9793, 1240.7768,
              1242.5733, 1244.3686, 1246.1628, 1247.9558, 1249.7478, 1251.5386, 1253.3282,
              1255.1168, 1256.9042, 1258.6905, 1260.4756, 1262.2596, 1264.0425, 1265.8243,
              1267.6049, 1269.3845, 1271.1628, 1272.9401, 1274.7162, 1276.4912, 1278.2651,
              1280.0378, 1281.8094, 1283.5799, 1285.3492, 1287.1174, 1288.8845, 1290.6505,
              1292.4153, 1294.179, 1295.9416, 1297.703, 1299.4634, 1301.2225, 1302.9806,
              1304.7375, 1306.4933, 1308.248, 1310.0015, 1311.754, 1313.5052, 1315.2554,
              1317.0044, 1318.7523, 1320.4991, 1322.2447, 1323.9892, 1325.7326, 1327.4749,
              1329.216, 1330.956, 1332.6948, 1334.4326, 1336.1692, 1337.9047, 1339.639,
              1341.3722, 1343.1043, 1344.8353, 1346.5651, 1348.2938, 1350.0214, 1351.7478,
              1353.4731, 1355.1973, 1356.9204, 1358.6423, 1360.3631, 1362.0828, 1363.8013,
              1365.5187, 1367.235, 1368.9502, 1370.6642, 1372.3771, 1374.0889, 1375.7995,
              1377.509, 1379.2174, 1380.9247, 1382.6308, 1384.3358, 1386.0396, 1387.7424,
              1389.444, 1391.1445, 1392.8438, 1394.542, 1396.2391, 1397.9351, 1399.6299,
              1401.3236, 1403.0162, 1404.7076, 1406.3979, 1408.0871, 1409.7752, 1411.4621,
              1413.1479, 1414.8326, 1416.5161, 1418.1985, 1419.8798, 1421.56, 1423.239,
              1424.9169, 1426.5937, 1428.2693, 1429.9438, 1431.6172, 1433.2895, 1434.9606,
              1436.6306, 1438.2994, 1439.9672, 1441.6338, 1443.2993, 1444.9636, 1446.6268,
              1448.2889, 1449.9499, 1451.6097, 1453.2684, 1454.926, 1456.5825, 1458.2378,
              1459.892, 1461.545, 1463.1969, 1464.8477, 1466.4974, 1468.146, 1469.7934,
              1471.4397, 1473.0848, 1474.7288, 1476.3717, 1478.0135, 1479.6542, 1481.2937,
              1482.932, 1484.5693, 1486.2054, 1487.8404, 1489.4743, 1491.107, 1492.7386,
              1494.3691, 1495.9985, 1497.6267, 1499.2538, 1500.8797, 1502.5046, 1504.1283,
              1505.7508, 1507.3723, 1508.9926, 1510.6118, 1512.2299, 1513.8468, 1515.4626,
              1517.0773, 1518.6908, 1520.3032, 1521.9145, 1523.5247, 1525.1337, 1526.7416,
              1528.3484, 1529.954, 1531.5585, 1533.1619, 1534.7641, 1536.3653, 1537.9653,
              1539.5641, 1541.1619, 1542.7585, 1544.354, 1545.9483, 1547.5415, 1549.1336,
              1550.7246, 1552.3144, 1553.9031, 1555.4907, 1557.0771, 1558.6625, 1560.2467,
              1561.8297, 1563.4116, 1564.9925, 1566.5721, 1568.1507, 1569.7281, 1571.3044,
              1572.8795, 1574.4536, 1576.0265, 1577.5982, 1579.1689, 1580.7384, 1582.3068,
              1583.874, 1585.4401, 1587.0051, 1588.569, 1590.1317, 1591.6934, 1593.2538,
              1594.8132, 1596.3714, 1597.9285, 1599.4845, 1601.0393, 1602.593, 1604.1456,
              1605.697, 1607.2473, 1608.7965, 1610.3446, 1611.8915, 1613.4373, 1614.982,
              1616.5255, 1618.068, 1619.6093, 1621.1494, 1622.6884, 1624.2263, 1625.7631,
              1627.2988, 1628.8333, 1630.3666, 1631.8989, 1633.43, 1634.96, 1636.4889,
              1638.0166, 1639.5432, 1641.0687, 1642.5931, 1644.1163, 1645.6384, 1647.1594,
              1648.6792, 1650.1979, 1651.7155, 1653.2319, 1654.7472, 1656.2614, 1657.7745,
              1659.2864, 1660.7972, 1662.3069, 1663.8154, 1665.3229, 1666.8292, 1668.3343,
              1669.8383, 1671.3412, 1672.843, 1674.3437, 1675.8432, 1677.3416, 1678.8388,
              1680.3349, 1681.8299, 1683.3238, 1684.8165, 1686.3081, 1687.7986, 1689.288,
              1690.7762, 1692.2633, 1693.7493, 1695.2341, 1696.7178, 1698.2004, 1699.6818,
               1701.1622, 1702.6413, 1704.1194, 1705.5963, 1707.0721, 1708.5468, 1710.0204,
              1711.4928, 1712.9641, 1714.4342, 1715.9033, 1717.3712, 1718.8379, 1720.3036,
              1721.7681, 1723.2315, 1724.6937, 1726.1549, 1727.6149, 1729.0737, 1730.5315,
              1731.9881, 1733.4436, 1734.8979, 1736.3511, 1737.8032, 1739.2542, 1740.704,
              1742.1527, 1743.6003, 1745.0468, 1746.4921, 1747.9363, 1749.3793, 1750.8213,
              1752.2621, 1753.7018, 1755.1403, 1756.5777, 1758.014, 1759.4492, 1760.8832,
              1762.3161, 1763.7479, 1765.1785, 1766.608, 1768.0364, 1769.4637, 1770.8898,
              1772.3148, 1773.7387, 1775.1614, 1776.583, 1778.0035, 1779.4229, 1780.8411,
              1782.2582, 1783.6741, 1785.089, 1786.5027, 1787.9153, 1789.3267, 1790.737,
              1792.1462, 1793.5543, 1794.9612, 1796.367, 1797.7717, 1799.1753, 1800.5777,
              1801.979, 1803.3791, 1804.7782, 1806.1761, 1807.5728, 1808.9685, 1810.363,
              1811.7564, 1813.1486, 1814.5398, 1815.9298, 1817.3186, 1818.7064, 1820.093,
              1821.4785, 1822.8628, 1824.246, 1825.6281, 1827.0091, 1828.3889, 1829.7677,
              1831.1452, 1832.5217, 1833.897, 1835.2712, 1836.6443, 1838.0162, 1839.387,
              1840.7567, 1842.1252, 1843.4927, 1844.8589, 1846.2241, 1847.5881, 1848.951,
              1850.3128, 1851.6735, 1853.033, 1854.3914, 1855.7486, 1857.1047, 1858.4597,
              1859.8136, 1861.1664, 1862.518, 1863.8685, 1865.2178, 1866.566, 1867.9131,
              1869.2591, 1870.6039]


**Plotting Raw spectrum for Bacteria Absent Samples**

In [None]:
# Evenly spaced wavenumber grid covering the full Andor axis
linear_wavenumbers = np.linspace(start=min(andorWaves), stop=max(andorWaves), num=1024)

# Linearly resample the mean bacteria-absent spectrum onto that grid
interpolation_function = interp1d(
    andorWaves,
    absent_average_intensities,
    kind='linear',
    fill_value="extrapolate",
)
absent_interpolated_intensities = interpolation_function(linear_wavenumbers)

# Raw spectrum figure
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=linear_wavenumbers, y=absent_interpolated_intensities, mode='lines')
)
fig.update_layout(
    title='Raw SERS Spectrum for Bacteria Absent Samples',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()


**Plotting Raw spectrum for Bacteria Present Samples**

In [None]:
# Evenly spaced wavenumber grid covering the full Andor axis
linear_wavenumbers = np.linspace(start=min(andorWaves), stop=max(andorWaves), num=1024)

# Linearly resample the mean bacteria-present spectrum onto that grid
interpolation_function = interp1d(
    andorWaves,
    present_average_intensities,
    kind='linear',
    fill_value="extrapolate",
)
present_interpolated_intensities = interpolation_function(linear_wavenumbers)

# Raw spectrum figure
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=linear_wavenumbers, y=present_interpolated_intensities, mode='lines')
)
fig.update_layout(
    title='Raw SERS Spectrum for Bacteria Present Samples',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()


**Pre-processing**

In [None]:
def WhittakerSmooth(x, w, lambda_, differences=1):
    '''
    Penalized least squares (Whittaker) smoother used for background fitting.

    Solves the sparse linear system ``(W + lambda_ * D.T @ D) z = W x`` where
    ``W = diag(w)`` and ``D`` is the ``differences``-th order finite-difference
    matrix built by repeatedly differencing the identity.

    Parameters
    ----------
    x : array_like
        Signal to smooth; flattened to one dimension of length ``m``.
    w : array_like
        Per-point weights placed on the diagonal of ``W`` (length ``m``).
    lambda_ : float
        Weight of the difference penalty; larger values give a smoother result.
    differences : int, optional
        Order of the difference penalty (default 1).

    Returns
    -------
    numpy.ndarray
        Flat array of length ``m`` with the fitted background.
    '''
    # Plain ndarrays and the explicit @ operator replace np.matrix, so the
    # matrix products do not depend on how `*` is overloaded.
    signal = np.asarray(x, dtype=float).ravel()
    m = signal.size
    D = eye(m, format='csc')
    for _ in range(differences):
        D = D[1:] - D[:-1]  # each pass raises the difference order by one
    W = diags(w, 0, shape=(m, m))
    A = csc_matrix(W + lambda_ * (D.T @ D))
    b = W @ signal
    background = spsolve(A, b)
    return np.asarray(background).ravel()  # Return a flat array

def airPLS(x, lambda_=100, porder=1, itermax=15):
    '''
    Adaptive iteratively reweighted penalized least squares for baseline fitting.

    Repeatedly fits a Whittaker-smoothed baseline ``z`` to ``x``, giving zero
    weight to points at or above the current baseline and exponentially growing
    weight to points below it, until the summed negative residual is small.

    Parameters
    ----------
    x : array_like
        One-dimensional signal.
    lambda_ : float, optional
        Smoothness penalty forwarded to ``WhittakerSmooth``.
    porder : int, optional
        Difference order forwarded to ``WhittakerSmooth``.
    itermax : int, optional
        Maximum number of reweighting iterations (must be >= 1).

    Returns
    -------
    numpy.ndarray
        The baseline fitted in the last iteration performed.

    Raises
    ------
    ValueError
        If ``itermax`` is smaller than 1 (no baseline would ever be fitted).
    '''
    if itermax < 1:
        raise ValueError("itermax must be at least 1")
    x = np.asarray(x, dtype=float)
    m = x.shape[0]
    w = np.ones(m)
    for i in range(1, itermax + 1):
        z = WhittakerSmooth(x, w, lambda_, porder)
        d = x - z
        below = d < 0
        dssn = np.abs(d[below].sum())
        # dssn == 0 means no point lies below the baseline (e.g. an all-zero
        # signal). Stop here: continuing would take max() of an empty array and
        # divide by zero in the weight update.
        if dssn == 0 or dssn < 0.001 * np.abs(x).sum() or i == itermax:
            if i == itermax:
                print('WARNING: Max iteration reached!')
            break
        w[d >= 0] = 0  # points at/above the baseline are treated as peaks and ignored
        w[below] = np.exp(i * np.abs(d[below]) / dssn)
        # Both end points always receive a positive weight.
        w[0] = np.exp(i * d[below].max() / dssn)
        w[-1] = w[0]
    return z

# Apply airPLS for baseline correction to both resampled mean spectra
# (lambda_=10000, second-order difference penalty, at most 100 iterations).
absent_baseline = airPLS(absent_interpolated_intensities, lambda_=10000, porder=2, itermax=100)
present_baseline = airPLS(present_interpolated_intensities, lambda_=10000, porder=2, itermax=100)
absent_corrected_intensities = absent_interpolated_intensities - absent_baseline # Subtract the fitted baseline from the resampled intensities
present_corrected_intensities = present_interpolated_intensities - present_baseline # Subtract the fitted baseline from the resampled intensities


**Pre-processed SERS Spectrum for Bacteria Absent samples**

In [None]:
# Baseline-corrected mean spectrum of the bacteria-absent samples
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=linear_wavenumbers, y=absent_corrected_intensities, mode='lines')
)
fig.update_layout(
    title='Pre-processed SERS Spectrum for Bacteria Absent Samples',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()

**Pre-processed SERS Spectrum for Bacteria Present samples**

In [None]:
# Baseline-corrected mean spectrum of the bacteria-present samples
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=linear_wavenumbers, y=present_corrected_intensities, mode='lines')
)
fig.update_layout(
    title='Pre-processed SERS Spectrum for Bacteria Present Samples',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()

**Smoothing Using the Savitzky-Golay Filter**

In [None]:
window_length = 13  # a larger window gives a smoother curve
polyorder = 2       # degree of the polynomial fitted inside each window

# Savitzky-Golay smoothing of both baseline-corrected spectra
absent_smoothed = savgol_filter(
    absent_corrected_intensities, window_length=window_length, polyorder=polyorder
)
present_smoothed = savgol_filter(
    present_corrected_intensities, window_length=window_length, polyorder=polyorder
)

# Smoothed bacteria-absent spectrum
fig = go.Figure()
fig.add_trace(go.Scatter(x=linear_wavenumbers, y=absent_smoothed, mode='lines'))
fig.update_layout(
    title='Bacteria Absent- SERS Spectrum',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()

# Smoothed bacteria-present spectrum
fig = go.Figure()
fig.add_trace(go.Scatter(x=linear_wavenumbers, y=present_smoothed, mode='lines'))
fig.update_layout(
    title='Bacteria Present- SERS Spectrum',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()

**Normalizing the Data**


In [None]:
# Min-max scale each smoothed spectrum on its own (the scaler is re-fitted per spectrum)
scaler = MinMaxScaler()
absent_normalized = scaler.fit_transform(absent_smoothed[:, np.newaxis]).flatten()
present_normalized = scaler.fit_transform(present_smoothed[:, np.newaxis]).flatten()

# Normalized bacteria-absent spectrum
fig = go.Figure()
fig.add_trace(go.Scatter(x=linear_wavenumbers, y=absent_normalized, mode='lines'))
fig.update_layout(
    title='Bacteria Absent- SERS Spectrum',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()

# Normalized bacteria-present spectrum
fig = go.Figure()
fig.add_trace(go.Scatter(x=linear_wavenumbers, y=present_normalized, mode='lines'))
fig.update_layout(
    title='Bacteria Present- SERS Spectrum',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()


**Overlaying/Combining the two types of samples together**

In [None]:
# Recompute the per-spectrum min-max normalization used for the overlay
scaler = MinMaxScaler()
absent_normalized = scaler.fit_transform(absent_smoothed[:, np.newaxis]).flatten()
present_normalized = scaler.fit_transform(present_smoothed[:, np.newaxis]).flatten()

# Both normalized spectra on one set of axes
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=linear_wavenumbers,
        y=absent_normalized,
        mode='lines',
        name='Bacteria Absent',
        line={'color': 'blue'},
    )
)
fig.add_trace(
    go.Scatter(
        x=linear_wavenumbers,
        y=present_normalized,
        mode='lines',
        name='Bacteria Present',
        line={'color': 'red'},
    )
)
fig.update_layout(
    title='Interactive Raman Spectrum: Bacteria Absent vs. Present',
    xaxis_title='Wavenumber (cm^-1)',
    yaxis_title='Intensity (a.u.)',
    template="plotly_dark",
)
fig.show()


**Pre-processing each spectrum for Bacteria Absent Samples**

In [None]:
def WhittakerSmooth(x, w, lambda_, differences=1):
    '''
    Penalized least squares (Whittaker) smoother used for background fitting.

    Solves the sparse linear system ``(W + lambda_ * D.T @ D) z = W x`` where
    ``W = diag(w)`` and ``D`` is the ``differences``-th order finite-difference
    matrix built by repeatedly differencing the identity.

    Parameters
    ----------
    x : array_like
        Signal to smooth; flattened to one dimension of length ``m``.
    w : array_like
        Per-point weights placed on the diagonal of ``W`` (length ``m``).
    lambda_ : float
        Weight of the difference penalty; larger values give a smoother result.
    differences : int, optional
        Order of the difference penalty (default 1).

    Returns
    -------
    numpy.ndarray
        Flat array of length ``m`` with the fitted background.
    '''
    # Plain ndarrays and the explicit @ operator replace np.matrix, so the
    # matrix products do not depend on how `*` is overloaded.
    signal = np.asarray(x, dtype=float).ravel()
    m = signal.size
    D = eye(m, format='csc')
    for _ in range(differences):
        D = D[1:] - D[:-1]  # each pass raises the difference order by one
    W = diags(w, 0, shape=(m, m))
    A = csc_matrix(W + lambda_ * (D.T @ D))
    b = W @ signal
    background = spsolve(A, b)
    return np.asarray(background).ravel()

def airPLS(x, lambda_=100, porder=1, itermax=15):
    '''
    Adaptive iteratively reweighted penalized least squares for baseline fitting.

    Repeatedly fits a Whittaker-smoothed baseline ``z`` to ``x``, giving zero
    weight to points at or above the current baseline and exponentially growing
    weight to points below it, until the summed negative residual is small.

    Parameters
    ----------
    x : array_like
        One-dimensional signal.
    lambda_ : float, optional
        Smoothness penalty forwarded to ``WhittakerSmooth``.
    porder : int, optional
        Difference order forwarded to ``WhittakerSmooth``.
    itermax : int, optional
        Maximum number of reweighting iterations (must be >= 1).

    Returns
    -------
    numpy.ndarray
        The baseline fitted in the last iteration performed.

    Raises
    ------
    ValueError
        If ``itermax`` is smaller than 1 (no baseline would ever be fitted).
    '''
    if itermax < 1:
        raise ValueError("itermax must be at least 1")
    x = np.asarray(x, dtype=float)
    m = x.shape[0]
    w = np.ones(m)
    for i in range(1, itermax + 1):
        z = WhittakerSmooth(x, w, lambda_, porder)
        d = x - z
        below = d < 0
        dssn = np.abs(d[below].sum())
        # dssn == 0 means no point lies below the baseline (e.g. an all-zero
        # signal). Stop here: continuing would take max() of an empty array and
        # divide by zero in the weight update.
        if dssn == 0 or dssn < 0.001 * np.abs(x).sum() or i == itermax:
            if i == itermax:
                print('WARNING: Max iteration reached!')
            break
        w[d >= 0] = 0  # points at/above the baseline are treated as peaks and ignored
        w[below] = np.exp(i * np.abs(d[below]) / dssn)
        # Both end points always receive a positive weight.
        w[0] = np.exp(i * d[below].max() / dssn)
        w[-1] = w[0]
    return z

def read_spectrum(file_path):
    """Read the intensity column of one spectrum file.

    Every non-blank line is split on whitespace and its second token is parsed
    as a float.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Intensities in file order.
    """
    intensities = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue  # nothing to parse on a blank line
            intensities.append(float(stripped.split()[1]))
    return np.array(intensities)

def load_and_preprocess_spectra(directory, lambda_=10000, porder=2, itermax=100):
    """Load every ``.asc`` spectrum in ``directory`` and subtract its airPLS baseline.

    Parameters
    ----------
    directory : str
        Folder scanned (non-recursively) for file names ending in ``.asc``.
    lambda_, porder, itermax
        Forwarded unchanged to ``airPLS``.

    Returns
    -------
    pandas.DataFrame
        One row per file (in sorted file-name order), one column per entry of
        ``andorWaves``, holding the baseline-corrected intensities.

    Raises
    ------
    ValueError
        If a spectrum does not have exactly ``len(andorWaves)`` points.
    """
    # Sorted: row positions must be reproducible because later cells refer to
    # rows by integer label. NOTE(review): any hard-coded row labels chosen
    # under a different file order should be re-checked.
    file_names = sorted(f for f in os.listdir(directory) if f.endswith('.asc'))
    wavenumbers = andorWaves
    spectra = []
    for file_name in file_names:
        file_path = os.path.join(directory, file_name)
        spectrum = read_spectrum(file_path)
        if len(spectrum) != len(wavenumbers):
            # Fail early and name the offending file instead of letting the
            # DataFrame constructor report a bare column-count mismatch.
            raise ValueError(
                f"{file_path}: expected {len(wavenumbers)} points, found {len(spectrum)}"
            )
        baseline = airPLS(spectrum, lambda_=lambda_, porder=porder, itermax=itermax)
        spectra.append(spectrum - baseline)
    return pd.DataFrame(spectra, columns=wavenumbers)

# Baseline-corrected spectra from the "Bacteria_A_Only_Yes" folder, one row per file.
directory = '/content/drive/MyDrive/New_Data_June6_final/Sifted Data/Bacteria_A_Only_Yes'
absent_spectra_df = load_and_preprocess_spectra(directory) #loading all spectra into a df
absent_spectra_df['ground_truth'] = 0 #assigning ground truth = 0


In [None]:
def plot_spectra(spectra_df, title='SERS spectra-Bacteria Absent Only'):
    """Draw every row of ``spectra_df`` as one line in an interactive figure.

    Parameters
    ----------
    spectra_df : pandas.DataFrame
        One spectrum per row; column labels are used as x values. A
        ``ground_truth`` column, if present, is treated as a label and is not
        plotted.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The figure is shown, not returned.
    """
    # The class label is not an intensity: leaving it in would add a spurious
    # final point at x='ground_truth' to every trace.
    intensities = spectra_df.drop(columns=['ground_truth'], errors='ignore')

    fig = go.Figure()
    for index, row in intensities.iterrows():
        fig.add_trace(go.Scatter(
            x=intensities.columns,  # Wavenumbers as x-axis
            y=row.values,  # Intensity values as y-axis
            mode='lines',  # Line plot
            name=f'Spectrum {index + 1}'  # Legend name
        ))

    # Update plot layout
    fig.update_layout(
        title=title,
        xaxis_title='Wavenumber',
        yaxis_title='Intensity',
        legend_title='Spectrum Index',
        hovermode='closest'  # Show closest point on hover
    )

    fig.show()
# One trace per baseline-corrected bacteria-absent spectrum.
plot_spectra(absent_spectra_df)


**Pre-processing each spectrum for Bacteria Present Samples**

In [None]:
# Baseline-corrected spectra from the "Bacteria_P_Only_Yes" folder, one row per file, labelled 1.
directory = '/content/drive/MyDrive/New_Data_June6_final/Sifted Data/Bacteria_P_Only_Yes'
present_spectra_df = load_and_preprocess_spectra(directory)
present_spectra_df['ground_truth'] = 1

In [None]:
def plot_spectra(spectra_df, title='SERS spectra-Bacteria Present Only'):
    """Draw every row of ``spectra_df`` as one line in an interactive figure.

    Parameters
    ----------
    spectra_df : pandas.DataFrame
        One spectrum per row; column labels are used as x values. A
        ``ground_truth`` column, if present, is treated as a label and is not
        plotted.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The figure is shown, not returned.
    """
    # The class label is not an intensity: leaving it in would add a spurious
    # final point at x='ground_truth' to every trace.
    intensities = spectra_df.drop(columns=['ground_truth'], errors='ignore')

    fig = go.Figure()
    for index, row in intensities.iterrows():
        fig.add_trace(go.Scatter(
            x=intensities.columns,  # Wavenumbers as x-axis
            y=row.values,  # Intensity values as y-axis
            mode='lines',  # Line plot
            name=f'Spectrum {index + 1}'  # Legend name
        ))

    # Update plot layout
    fig.update_layout(
        title=title,
        xaxis_title='Wavenumber',
        yaxis_title='Intensity',
        legend_title='Spectrum Index',
        hovermode='closest'  # Show closest point on hover
    )

    fig.show()
# One trace per baseline-corrected bacteria-present spectrum.
plot_spectra(present_spectra_df)


**Combining Present and Absent Samples to create a Master dataframe**

In [None]:
# Stack absent (label 0) and present (label 1) spectra; ignore_index renumbers the rows 0..n-1.
master_df = pd.concat([absent_spectra_df, present_spectra_df], ignore_index=True)
# Independent copy of the combined frame, taken before any later cell modifies master_df.
master_df_raw = master_df.copy()

In [None]:
# Show the combined frame as the cell output.
master_df

In [None]:
# Number of spectra per class label (0 and 1) in the 'ground_truth' column
counts = master_df['ground_truth'].value_counts()

# Print the per-class counts
print(counts)


**PCA on SERS data**

In [None]:
# PCA on the standardized spectra to see how much variance each component explains
features = master_df.drop('ground_truth', axis=1)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
pca = PCA()
pca.fit(features_scaled)
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance)

plt.figure(figsize=(10, 6))
plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, where='mid', label='Cumulative explained variance')
plt.axhline(y=0.95, color='r', linestyle='-', label='95% Explained Variance')
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Component Index')
plt.title('Explained Variance by Different Principal Components')
# The legend only lists artists that exist when it is created, so it must be
# built after the 95% line has been drawn.
plt.legend(loc='best')
plt.show()

In [None]:
# Dropping outliers: rows labelled 38 and 39 of the combined frame.
# Built from the untouched copy (master_df_raw) rather than dropped in place, so
# re-running this cell gives the same result instead of a KeyError for labels
# that are already gone.
master_df = master_df_raw.drop(index=range(38, 40))

**Unsupervised Clustering: K-means**

In [None]:
# Cluster the standardized, PCA-reduced spectra into two groups with K-means.
master_df.columns = master_df.columns.astype(str)
scaler = StandardScaler()
# Exclude the label, and also a 'kmeans_cluster' column left behind by an earlier
# run of this cell, so cluster ids never become input features.
features = master_df.drop('ground_truth', axis=1).drop(columns='kmeans_cluster', errors='ignore')
scaled_features = scaler.fit_transform(features)
pca = PCA(n_components=30)
pca_features = pca.fit_transform(scaled_features)
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(pca_features)
master_df['kmeans_cluster'] = clusters
master_df

In [None]:
# Keep only the spectra assigned to cluster 0, then discard the helper column.
# NOTE(review): the choice of cluster id 0 is hard-coded here; confirm it is the intended group.
master_df = master_df.loc[master_df['kmeans_cluster'] == 0].drop(columns='kmeans_cluster')
master_df

**SVM on clustered data**

In [None]:
X = master_df.drop('ground_truth', axis=1)  # features
y = master_df['ground_truth']  # labels
print("Class distribution in y:", y.value_counts())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Scaler fitted on the training split only; reused unchanged for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm_classifier = SVC(kernel='rbf', gamma='scale', C=100, random_state=90)

# Check if there are at least two classes
if len(np.unique(y_train)) > 1:
    # Never ask for more folds than the smallest class has members, otherwise
    # some folds cannot contain that class.
    n_splits = min(10, int(y_train.value_counts().min()))
    if n_splits >= 2:
        # Cross-validate scaler + SVM as one pipeline on the unscaled training
        # data: the scaler is then re-fitted inside each fold, so the held-out
        # fold never influences the scaling statistics. cross_val_score clones
        # the pipeline, leaving svm_classifier itself unfitted.
        cv_model = Pipeline([('scaler', StandardScaler()), ('svm', svm_classifier)])
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        cv_scores = cross_val_score(cv_model, X_train, y_train, cv=cv, scoring='accuracy')
        print("Individual Cross-Validation Accuracy Scores:", cv_scores)
        print("Mean CV Accuracy:", np.mean(cv_scores))
        print("Standard Deviation in CV Accuracy:", np.std(cv_scores))
    else:
        print("Not enough samples per class for StratifiedKFold.")
else:
    print("Not enough classes for StratifiedKFold.")

# Train the SVM classifier on the whole training set
svm_classifier.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = svm_classifier.predict(X_test_scaled)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of the SVM model on test data: {accuracy}")

# Print the classification report
classification_report_output = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_report_output)

# Compute and visualize the confusion matrix.
# labels=[0, 1] keeps the matrix 2x2 (matching the tick labels below) even when
# one class is missing from the test split or the predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


**Identifying which samples from the random test split were predicted correctly**

In [None]:
# Boolean Series (indexed like y_test): True where the prediction matches the label.
correct_predictions = y_pred == y_test
# Positional mask, usable on both the pandas objects and the y_pred array.
correct_mask = correct_predictions.to_numpy()
# Explicit copy: the label columns are added to a new frame, not to a slice of
# X_test (which triggers SettingWithCopyWarning).
correct_samples = X_test.loc[correct_mask].copy()
correct_samples['Actual Label'] = y_test[correct_mask]
correct_samples['Predicted Label'] = y_pred[correct_mask]
print("\nCorrectly Predicted Samples:")
print(correct_samples)