In [None]:
%pip install soundfile

In [None]:
import soundfile as sf
import os
import numpy as np

def split_wav(input_path, output_dir, chunk_duration=10):
    """
    Splits an audio file into consecutive fixed-length chunks saved as WAV files.

    Chunks are written to output_dir (created when missing) and named
    "<input basename>_chunk_<k>.wav" with k starting at 1. The last chunk
    holds whatever samples remain, so it may be shorter than chunk_duration.
    An input without samples produces no files.

    Args:
        input_path: Path of the audio file to split.
        output_dir: Directory that receives the chunk files.
        chunk_duration: Chunk length in seconds.

    Raises:
        ValueError: If chunk_duration is not positive, or is shorter than one
            sample at the file's sample rate.
    """
    # Validate before touching the filesystem; a zero-length chunk would
    # otherwise only show up later as a division by zero.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")

    os.makedirs(output_dir, exist_ok=True)

    data, sr = sf.read(input_path)

    samples_per_chunk = int(chunk_duration * sr)
    if samples_per_chunk < 1:
        raise ValueError(
            f"chunk_duration={chunk_duration!r} is shorter than one sample at {sr} Hz"
        )
    total_samples = len(data)

    # sf.write uses the container's default subtype when none is given, which
    # can reduce the bit depth of the source. Reuse the source subtype whenever
    # WAV supports it, otherwise keep the default.
    source_subtype = sf.info(input_path).subtype
    subtype = source_subtype if sf.check_format("WAV", source_subtype) else None

    basename = os.path.splitext(os.path.basename(input_path))[0]

    chunk_starts = range(0, total_samples, samples_per_chunk)
    for chunk_number, start in enumerate(chunk_starts, start=1):
        # Slicing clamps at the end of the array, so the final chunk is just shorter.
        chunk = data[start:start + samples_per_chunk]

        out_file = os.path.join(output_dir, f"{basename}_chunk_{chunk_number}.wav")
        sf.write(out_file, chunk, sr, subtype=subtype)
        print(f"Saved: {out_file}")


# Recording to split and the folder that receives its chunk files
input_wav_path = r"/Users/ian.straits/Documents/Graduate School/SAHB/data1-08.wav"
output_folder = r"/Users/ian.straits/Documents/Graduate School/SAHB/Audio8"

# Write the recording out as consecutive 10-second chunks
split_wav(input_path=input_wav_path, output_dir=output_folder, chunk_duration=10)

In [None]:
%pip install librosa pandas scikit-learn joblib soundfile

In [None]:
#Load Manual Labels

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Folder holding the audio chunks, and the CSV of manual labels for some of them
AUDIO_DIR = "/Users/ian.straits/Documents/Graduate School/SAHB/Attempt 1/All Chunks"
LABELS_CSV = "/Users/ian.straits/Documents/Graduate School/SAHB/Attempt 1/LabelsCombined.csv"

# Later cells read the "filename" and "label" columns of this table
df = pd.read_csv(LABELS_CSV)

print(df.head())
print("Number of manually labeled files:", df.shape[0])


In [None]:
#Feature Extraction Function (Log-Mel Spectrogram)

In [None]:
def extract_features(file_path, sr_target=16000, n_mels=64, duration=None):
    """
    Returns a 1D audio feature vector based on a log-mel spectrogram.

    The clip is loaded as mono at sr_target, converted to a mel spectrogram in
    dB relative to its own maximum, and summarized by the per-band mean and
    standard deviation over time.

    Args:
        file_path: Path of the audio file to load.
        sr_target: Sample rate requested from librosa.load. None asks librosa
            for the file's native rate.
        n_mels: Number of mel bands.
        duration: Optional limit, in seconds, on how much audio is loaded.

    Returns:
        Array of length 2 * n_mels (band means followed by band standard
        deviations), or None when the clip is empty or processing fails.
    """
    try:
        y, sr = librosa.load(file_path, sr=sr_target, mono=True, duration=duration)
        if len(y) == 0:
            return None

        # Use the rate librosa actually returned. It equals sr_target when one
        # is given, and stays valid when sr_target is None.
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        S_db = librosa.power_to_db(S, ref=np.max)

        # Statistical features across time (axis 1 is the frame axis)
        band_mean = S_db.mean(axis=1)
        band_std = S_db.std(axis=1)

        return np.concatenate([band_mean, band_std], axis=0)

    except Exception as e:
        # Best-effort by design: callers skip clips that return None.
        print(f"Error processing {file_path}: {e}")
        return None


In [None]:
#Prepare Training Data from Manual Labels

In [None]:
# Collect one feature vector and one integer label per manually labeled clip
X = []
y = []
missing = 0  # labeled files that are not present in AUDIO_DIR

for fname, label in zip(df["filename"], df["label"]):
    file_path = os.path.join(AUDIO_DIR, fname)
    if not os.path.exists(file_path):
        missing += 1
        continue

    feat = extract_features(file_path)
    # Clips whose features could not be extracted are dropped without being counted
    if feat is not None:
        X.append(feat)
        y.append(int(label))

X = np.array(X)
y = np.array(y)

print("Feature matrix:", X.shape)
print("Labels:", y.shape)
print("Missing files:", missing)


In [None]:
#Train Test Split + Train Classifier

In [None]:
# Hold out 20% of the labeled clips, keeping the class balance in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize with statistics learned from the training part only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with a fixed seed, using all available cores
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
clf.fit(X_train_scaled, y_train)

# Report performance on the held-out clips
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
# Persist the fitted classifier and its scaler to the working directory

joblib.dump(value=clf, filename="construction_classifier.joblib")
joblib.dump(value=scaler, filename="feature_scaler.joblib")
print("Model + scaler saved.")

In [None]:
# Reload the persisted classifier and scaler from the working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files produced by this notebook.
clf = joblib.load("construction_classifier.joblib")
scaler = joblib.load("feature_scaler.joblib")

# Every file in AUDIO_DIR whose name ends in ".wav" (case-insensitive), in sorted order
all_files = sorted([f for f in os.listdir(AUDIO_DIR) if f.lower().endswith(".wav")])

# filename -> manual label (as a string); only membership is checked below
manual_dict = {row["filename"]: str(row["label"]) for _, row in df.iterrows()}

final_results = []

for fname in all_files:
    file_path = os.path.join(AUDIO_DIR, fname)

    # An earlier variant (removed here) stored the manual label for hand-labeled
    # files and skipped prediction; now every file is scored by the model.

    # Predict using model; files whose features cannot be extracted are skipped
    feat = extract_features(file_path)
    if feat is None:
        continue

    feat_scaled = scaler.transform([feat])
    # Column 1 of predict_proba, i.e. the second entry of clf.classes_
    # (presumably label 1 — confirm against clf.classes_)
    prob = clf.predict_proba(feat_scaled)[0][1]
    pred = int(clf.predict(feat_scaled)[0])

    if fname in manual_dict:
        # Hand-labeled file: the stored label is still the model prediction;
        # only the "source" tag records that a manual label exists
        final_results.append({
            "filename": fname,
            "label": pred,
            "source": "manual", 
            "probability": prob
        })
    else:
        final_results.append({
            "filename": fname,
            "label": pred,
            "source": "model", 
            "probability": prob
        })
        

    # NOTE(review): this second append runs for every file, so each filename
    # ends up in final_results twice (the copy below is always tagged "model").
    # Anything that reads labels_full_with_source.csv sees two consecutive rows
    # per file; check those consumers before removing this append.
    final_results.append({
        "filename": fname,
        "label": pred,
        "source": "model", 
        "probability": prob
    })

final_df = pd.DataFrame(final_results)
# This count is the number of rows, i.e. two per processed file (see note above)
print("Total labeled files:", len(final_df))

final_df.to_csv("labels_full_with_source.csv", index=False)
print("Saved --> labels_full_with_source.csv")

In [None]:
#clean version without the source column:

In [None]:
# Same predictions without the "source" column.
# NOTE(review): final_df is built with two rows per audio file in the inference
# cell, so this CSV repeats every filename as well.
final_df[["filename", "label", "probability"]].to_csv("labels_full.csv", index=False)

In [None]:
#Visual labeling tool

In [None]:
# Column-1 class probability for every held-out test clip
testprobs = clf.predict_proba(X_test_scaled)[:, 1]
print(testprobs)

In [None]:
# One probability per test clip
print(testprobs.shape)

In [None]:
# Column-1 class probability for every training clip
trainprobs = clf.predict_proba(X_train_scaled)[:, 1]
print(trainprobs)

In [None]:
# One probability per training clip
print(trainprobs.shape)

In [None]:
import pandas as pd

In [None]:
# Reload the exported predictions from the working directory
read_df = pd.read_csv("labels_full_with_source.csv")
# Start from an empty frame. This rebinds `df`, which earlier cells use for the
# manual-label table, so those cells cannot be re-run after this one without
# reloading the labels.
df = pd.DataFrame()

In [None]:
# labels_full_with_source.csv is written with two consecutive rows per file.
# Keep the first row of each filename, which is the same selection as taking
# every second row but does not depend on that layout. The stored probability
# is predict_proba column 1, so replace it with its complement (1 - p).
# Building a new frame leaves read_df untouched, so re-running this cell gives
# the same result instead of flipping the values again and growing df.
df = (
    read_df
    .drop_duplicates(subset="filename", keep="first")
    .assign(probability=lambda frame: 1 - frame["probability"])
    .reset_index(drop=True)
)

In [None]:
# Time-of-day table from the working directory; its "TOD" column is attached to df below
tod_df = pd.read_csv("tod.csv")

In [None]:
# Attach the time of day (matched on row index) and create the two output
# columns that the smoothing loop fills in
df = df.assign(**{
    "TOD": tod_df["TOD"],
    "smoothed probabilities": 0.0,
    "WNA setting": "",
})

In [None]:
def ema(curr_prob, prev_ema, hp):
    """
    One step of an exponential moving average.

    hp is the weight given to curr_prob; prev_ema receives the remaining 1 - hp.
    """
    weighted_current = curr_prob * hp
    weighted_history = prev_ema * (1 - hp)
    return weighted_current + weighted_history

In [None]:
# Weight of the newest probability in ema() (passed as hp); the previous
# smoothed value gets the remaining 1 - hp
ema_hyperparamater = 0.3
# A smoothed value at or below this sets the WNA setting to "NO"
hysteresis_low_thresh = 0.15
# A smoothed value at or above this sets the WNA setting to "YES";
# between the two thresholds the previous setting is kept
hysteresis_high_thresh = 0.4

In [None]:
# Walk the rows in order, smoothing the probability with an EMA and deriving
# the actuator setting with hysteresis. Results are collected in lists and
# written to the two columns once at the end.
smoothed_values = []
wna_states = []

for raw_prob in df["probability"]:
    if not smoothed_values:
        # First row: no history yet, so the raw value seeds the average and
        # only the high threshold decides the initial setting
        smoothed = raw_prob
        state = "NO" if smoothed < hysteresis_high_thresh else "YES"
    else:
        smoothed = ema(raw_prob, smoothed_values[-1], ema_hyperparamater)
        if smoothed <= hysteresis_low_thresh:
            state = "NO"
        elif smoothed >= hysteresis_high_thresh:
            state = "YES"
        else:
            # Between the thresholds: hold the previous setting
            state = wna_states[-1]

    smoothed_values.append(smoothed)
    wna_states.append(state)

df["smoothed probabilities"] = smoothed_values
df["WNA setting"] = wna_states

In [None]:
# Save the per-clip probabilities, smoothed values and WNA settings; a later cell reloads this file for plotting
df.to_csv("white_noise_actuator_results.csv", index=False)

In [None]:
%pip install librosa matplotlib pandas

In [None]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

In [None]:
AUDIO_DIR = "/Users/ian.straits/Documents/Graduate School/SAHB/Attempt 1/All Chunks"

# The four example chunks to plot, each with its panel title
examples = [
    ("data1-05_chunk_916.wav", "Nighttime (Label 1)"),
    ("data1-07_chunk_687.wav", "Reverse Alarm (Label 0)"),
    ("data1-07_chunk_322.wav", "Hammer (Label 0)"),
    ("data1-07_chunk_560.wav", "Emergency Siren (Label 0)"),
]

# Make fonts a bit bigger globally
plt.rcParams.update({'font.size': 12})

# power_to_db(ref=np.max) with librosa's default top_db of 80 yields values in
# [-80, 0] dB. Pinning every panel to that range makes the single shared
# colorbar valid for all four plots, not only the first one.
DB_MIN, DB_MAX = -80.0, 0.0

# Constrained layout reserves room for the shared colorbar; tight_layout and a
# manual subplots_adjust do not handle a colorbar that spans several axes.
fig, axes = plt.subplots(2, 2, figsize=(14, 8), sharey=True, constrained_layout=True)
axes_flat = axes.ravel()

imgs = []

for ax, (fname, title) in zip(axes_flat, examples):
    path = os.path.join(AUDIO_DIR, fname)
    print("Loading:", path)

    # Load at the file's native rate. The waveform gets its own name so the
    # global `y` (the training labels) is not overwritten.
    audio, sr = librosa.load(path, sr=None, mono=True)

    # Mel spectrogram in dB relative to this clip's maximum
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=64)
    S_db = librosa.power_to_db(S, ref=np.max)

    img = librosa.display.specshow(
        S_db,
        sr=sr,
        x_axis="time",
        y_axis="mel",
        ax=ax,
        vmin=DB_MIN,
        vmax=DB_MAX
    )
    ax.set_title(title, fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=10)
    # Clear any per-panel axis labels; only the outer ones are set below
    ax.set(xlabel="", ylabel="")
    imgs.append(img)

# Only label left and bottom axes to avoid clutter
axes[1, 0].set_xlabel("Time (s)", fontsize=12)
axes[1, 1].set_xlabel("Time (s)", fontsize=12)
axes[0, 0].set_ylabel("Mel frequency (Hz)", fontsize=12)
axes[1, 0].set_ylabel("Mel frequency (Hz)", fontsize=12)

# One horizontal colorbar below all four plots
cbar = fig.colorbar(
    imgs[0],
    ax=axes_flat,
    orientation="horizontal",
    fraction=0.05,
    pad=0.04
)
cbar.set_label("Amplitude (dB)", fontsize=12)
cbar.ax.tick_params(labelsize=10)

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.dates as mdates

CSV_PATH = "/Users/ian.straits/Documents/Graduate School/SAHB/Attempt 1/labels_full_with_source_edit.csv"   # change if needed

# Read the edited CSV, dropping bytes that cannot be decoded
df = pd.read_csv(CSV_PATH, encoding_errors="ignore")

print(df.head())

# Parse TOD strings such as "11:30:00 AM" into datetimes
df["TOD"] = pd.to_datetime(df["TOD"], format="%I:%M:%S %p")

# Rows labeled 0 only
df_zero = df.loc[df["Label"] == 0]

print(f"Total rows: {len(df)}")
print(f"Rows with Label = 0: {len(df_zero)}")

# Histogram of when the label-0 rows occur
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(df_zero["TOD"], bins=50)

ax.set_xlabel("Time of Day")
ax.set_ylabel("Count of Label = 0")
ax.set_title("Label = 0 (construction) vs Time of Day")

# One tick per hour, shown as HH:MM
ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))
ax.xaxis.set_major_locator(mdates.HourLocator(interval=1))

fig.autofmt_xdate()
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [None]:
# Reload the actuator results and prepare plotting columns
df = pd.read_csv("white_noise_actuator_results.csv")
df['TOD'] = pd.to_datetime(df['TOD'], format='%I:%M:%S %p')
df['WNA_numeric'] = df['WNA setting'].map({'YES': 1, 'NO': 0})

fig, ax1 = plt.subplots(figsize=(12,5))

# Actuator state over the day: one small marker per clip at 0 (OFF) or 1 (ON)
ax1.scatter(df['TOD'], df['WNA_numeric'], color='red', s=2)
ax1.set_ylim(-0.1, 1.1)
ax1.set_yticks([0,1])
ax1.set_yticklabels(['OFF','ON'])
ax1.set_xlabel("Hour of Day")
ax1.set_ylabel("White Noise Actuator")

ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax1.xaxis.set_major_locator(mdates.HourLocator(interval=1))
# tick_params also applies to ticks created later. The bars drawn below widen
# the shared x-range, and rotating only the labels that exist right now would
# leave the new ticks unrotated.
ax1.tick_params(axis='x', labelrotation=45)

# Hourly count of rows labeled 0, with empty hours filled in as zero
df0 = df[df["label"] == 0].copy()
df0['hour'] = df0['TOD'].dt.hour
counts0 = df0.groupby('hour').size()
counts0 = counts0.reindex(range(24), fill_value=0)

# Left edge of each hourly bar, on the same date the parsed TOD values carry
start_date = df['TOD'].dt.normalize().iloc[0]
hist_hours = [start_date + pd.Timedelta(hours=h) for h in counts0.index]

# Overlay the hourly counts on a second y-axis that shares the time axis
ax2 = ax1.twinx()
ax2.bar(hist_hours, counts0.values, width=pd.Timedelta(minutes=60), color='skyblue', align='edge', alpha=0.3)
ax2.set_ylabel("Construction Data Point Frequency")

ax1.set_title("White Noise Actuator ON/OFF Over Time")
plt.show()