In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# under /content/drive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tensorflow tensorflow-io matplotlib xgboost catboost

In [None]:
#necessary libraries
import os
import itertools
from matplotlib import pyplot as plt
import tensorflow as tf
import tensorflow_io as tfio
import numpy as np
import pywt
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, Dense, Flatten, Dropout
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Audio loading helper
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a mono waveform resampled to 16 kHz.

    Args:
        filename: Path of the encoded WAV file (string or string tensor).

    Returns:
        A 1-D float tensor holding the resampled single-channel waveform.
    """
    # Read the encoded bytes and decode them, forcing a single channel.
    raw_bytes = tf.io.read_file(filename)
    audio, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    # decode_wav yields (samples, channels); drop the channel axis.
    mono = tf.squeeze(audio, axis=-1)
    # Resample from the file's own sample rate down/up to 16000 Hz.
    return tfio.audio.resample(
        mono,
        rate_in=tf.cast(native_rate, dtype=tf.int64),
        rate_out=16000,
    )

# Locate the gunshot (positive) and non-gunshot (negative) recordings
POS = '/content/drive/MyDrive/aidataset/gunshot'
NEG = '/content/drive/MyDrive/aidataset/nongunshot'
pos = tf.data.Dataset.list_files(f'{POS}/*.wav')
neg = tf.data.Dataset.list_files(f'{NEG}/*.wav')

# Attach labels: 1.0 for every gunshot file, 0.0 for every non-gunshot file
pos_labels = tf.data.Dataset.from_tensor_slices(tf.ones(len(pos)))
neg_labels = tf.data.Dataset.from_tensor_slices(tf.zeros(len(neg)))
positives = tf.data.Dataset.zip((pos, pos_labels))
negatives = tf.data.Dataset.zip((neg, neg_labels))

# Oversample the gunshot class up to the size of the non-gunshot class:
# repeat the whole positive set `repeat_count` times, then top it up with
# `remainder` extra positive samples.
repeat_count, remainder = divmod(len(negatives), len(positives))
new_positives = positives.repeat(repeat_count).concatenate(positives.take(remainder))

# Final dataset: oversampled gunshots followed by the non-gunshots
data = new_positives.concatenate(negatives)

# Per-file preprocessing: waveform -> fixed-size magnitude spectrogram
def preprocess(file_path, label):
    """Turn one labelled WAV path into a (spectrogram, label) pair.

    The waveform is loaded via `load_wav_16k_mono`, truncated to at most 40000
    samples and left-padded with zeros up to exactly 40000 samples, so every
    example has the same length before the STFT.

    Args:
        file_path: Path of the WAV file.
        label: Class label, passed through untouched.

    Returns:
        (spectrogram, label) where spectrogram is the STFT magnitude with a
        trailing channel axis added.
    """
    target_len = 40000
    clip = load_wav_16k_mono(file_path)[:target_len]
    # Zeros are placed in front of the audio, not after it.
    lead_silence = tf.zeros([target_len] - tf.shape(clip), dtype=tf.float32)
    padded = tf.concat([lead_silence, clip], 0)
    # Short-time Fourier transform; keep only the magnitude.
    magnitude = tf.abs(tf.signal.stft(padded, frame_length=320, frame_step=32))
    # Add a channel axis so the result can feed a Conv2D layer.
    return tf.expand_dims(magnitude, axis=2), label

# Preprocess, shuffle and batch the data
data = data.map(preprocess)
data = data.cache()
# shuffle() reshuffles on every iteration by default. Because the splits below
# are carved out with take()/skip() and iterated separately, a fresh permutation
# per iteration would let the same samples appear in several splits. A fixed
# seed with reshuffle_each_iteration=False pins one permutation for all of them.
data = data.shuffle(buffer_size=5000, seed=42, reshuffle_each_iteration=False)
data = data.batch(16)
data = data.prefetch(8)

# Split by batch position: first 130 batches -> train, next 30 -> test,
# following 20 -> ttest (a second held-out split).
train = data.take(130)
test = data.skip(130).take(30)
ttest = data.skip(160).take(20)


# Custom CNN model used as a feature extractor
# NOTE(review): no compile()/fit() call is visible in this part of the notebook,
# so the extracted features appear to come from untrained weights - confirm.
model = Sequential()
model.add(Conv2D(32, (3,3), activation='relu', input_shape=(1241, 257, 1)))
model.add(Conv2D(32, (3,3), activation='relu'))
model.add(Dropout(0.5))  # Add dropout layer with dropout rate of 0.5
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))  # Add dropout layer with dropout rate of 0.5

# Create a feature extraction model that exposes the output of the last layer
feature_extractor = Model(inputs=model.input, outputs=model.layers[-1].output)


def extract_features(extractor, dataset):
    """Run every batch of `dataset` through `extractor` and stack the results.

    Args:
        extractor: Keras model whose `predict` output is used as the features.
        dataset: Iterable yielding (spectrogram_batch, label_batch) pairs.

    Returns:
        (features, labels): two NumPy arrays concatenated along axis 0, one row
        per sample, in the order the dataset produced them.
    """
    batch_features = []
    batch_labels = []
    for spectrogram, label in dataset:
        # verbose=0 avoids printing one progress bar per batch.
        batch_features.append(extractor.predict(spectrogram, verbose=0))
        batch_labels.append(label)
    return np.concatenate(batch_features, axis=0), np.concatenate(batch_labels, axis=0)


# Extract features from the train split
features, labels = extract_features(feature_extractor, train)

# Extract features from the test split
test_features, test_labels = extract_features(feature_extractor, test)

In [None]:
# Entropy of a label collection
def calc_entropy(labels):
    """Return the Shannon entropy, in bits, of the values in `labels`.

    H = -sum(p * log2(p)), where p is the relative frequency of each distinct
    label (count divided by len(labels)).
    """
    # Occurrence count of every distinct label value.
    counts = np.unique(labels, return_counts=True)[1]
    prob = counts / len(labels)
    return -np.sum(prob * np.log2(prob))

# Information gain obtained by splitting on one feature
def calc_info_gain(data, labels, feature_index):
    """Return the information gain of splitting `labels` on one feature.

    IG = H(labels) - sum_v (n_v / N) * H(labels where feature == v), where the
    sum runs over every distinct value v of column `feature_index` in `data`.

    Args:
        data: 2-D array, one row per sample.
        labels: 1-D array of labels aligned with the rows of `data`.
        feature_index: Column of `data` to split on.
    """
    column = data[:, feature_index]
    total = len(labels)
    # Distinct values of the feature and how many samples carry each one.
    levels, level_counts = np.unique(column, return_counts=True)

    # Entropy remaining after the split, weighted by the size of each child.
    weighted_child_entropy = sum(
        (n / total) * calc_entropy(labels[column == level])
        for level, n in zip(levels, level_counts)
    )
    return calc_entropy(labels) - weighted_child_entropy

# Find the root node: the feature with the highest information gain
def find_root_node(data, labels):
    """Return the index of the feature with the largest information gain.

    Args:
        data: 2-D array, one row per sample and one column per feature.
        labels: 1-D array of labels aligned with the rows of `data`.

    Returns:
        Column index of the best feature, or -1 when `data` has no columns.
        On ties the lowest index wins, because only a strictly larger gain
        replaces the current best.
    """
    num_features = data.shape[1]
    best_info_gain = -1
    best_feature_index = -1

    for feature_index in range(num_features):
        info_gain = calc_info_gain(data, labels, feature_index)
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature_index = feature_index

    # The tracked variable and the returned one must be the same name; returning
    # a differently spelled name raises NameError (or silently picks up a stale
    # global of that name left over in the kernel).
    return best_feature_index

# Pick the most informative extracted feature on the training split and report it
root_node_index = find_root_node(features, labels)
print(f"Root node feature index: {root_node_index}")


Root node feature index: 4


In [None]:
# A2. Function to bin continuous features into categorical features
def bin_cont_feature(feature_col, num_bins=10, bin_type='equal_width'):
    """Discretise a continuous feature column into `num_bins` integer bins.

    Args:
        feature_col: 1-D array-like of numeric values.
        num_bins: Number of bins to create (must be >= 1).
        bin_type: 'equal_width' splits [min, max] into intervals of equal
            width; 'frequency' places the edges at evenly spaced quantiles so
            that each bin receives roughly the same number of samples.

    Returns:
        (binned_features, bin_edges): `binned_features` holds a bin index in
        0..num_bins-1 for every value, `bin_edges` the num_bins + 1 edges.

    Raises:
        ValueError: if `bin_type` is unknown, `num_bins` < 1, or the column is
            empty.
    """
    if bin_type not in ('equal_width', 'frequency'):
        raise ValueError("Invalid binning type. Choose 'equal_width' or 'frequency'.")
    if num_bins < 1:
        raise ValueError("num_bins must be at least 1.")
    feature_col = np.asarray(feature_col)
    if feature_col.size == 0:
        raise ValueError("Cannot bin an empty feature column.")

    if bin_type == 'equal_width':
        bin_edges = np.linspace(np.min(feature_col), np.max(feature_col), num_bins + 1)
    else:
        # Quantile edges give equal-frequency bins; np.histogram_bin_edges with
        # an integer bin count would only reproduce equal-width edges.
        bin_edges = np.quantile(feature_col, np.linspace(0.0, 1.0, num_bins + 1))

    # Digitise against the interior edges only: values below the first interior
    # edge map to 0 and values at or above the last one map to num_bins - 1, so
    # the column maximum stays inside the last bin instead of spilling into an
    # extra bin with index num_bins.
    binned_features = np.digitize(feature_col, bins=bin_edges[1:-1])
    return binned_features, bin_edges


# A3. Custom Decision Tree module
class DecisionTree:
    """Multi-way decision tree that splits on exact feature values.

    At every node the feature returned by `find_root_node` (highest information
    gain) is chosen and one child is created per distinct value of that
    feature. The tree is stored as nested dicts:

        leaf:     {'class': label}
        internal: {'feature_index': int, 'majority_class': label,
                   'children': {feature_value: subtree}}
    """

    def __init__(self):
        # Root of the fitted tree; None until fit() has been called.
        self.tree = None

    def fit(self, X, y):
        """Build the tree from samples `X` (2-D array) and labels `y` (1-D).

        Raises:
            ValueError: if no training samples are given or the number of rows
                in `X` differs from the number of labels.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples.")
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        """Return an array with one predicted label per row of `X`."""
        return np.array([self._traverse_tree(sample, self.tree) for sample in X])

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in `y` (smallest label on ties)."""
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def _build_tree(self, X, y):
        """Recursively build the subtree for samples `X` with labels `y`."""
        # Base case: every sample carries the same label -> leaf.
        if len(np.unique(y)) == 1:
            return {'class': y[0]}

        majority = self._majority_class(y)

        # No features to split on -> fall back to the majority label.
        if X.shape[1] == 0:
            return {'class': majority}

        # Split on the feature with the highest information gain.
        best_feature_index = find_root_node(X, y)
        best_feature_values = X[:, best_feature_index]
        unique_values = np.unique(best_feature_values)

        # If even the best feature takes a single value, the split would hand
        # the identical sample set to the only child and recurse forever;
        # stop with a majority-vote leaf instead.
        if len(unique_values) <= 1:
            return {'class': majority}

        tree = {
            'feature_index': best_feature_index,
            'majority_class': majority,  # used when prediction meets an unseen value
            'children': {},
        }

        # One child per distinct value of the chosen feature.
        for value in unique_values:
            mask = best_feature_values == value
            if not mask.any():
                # e.g. NaN never compares equal to itself; skip empty subsets.
                continue
            tree['children'][value] = self._build_tree(X[mask], y[mask])

        return tree

    def _traverse_tree(self, sample, node):
        """Follow `sample` down from `node` and return the predicted label."""
        if 'class' in node:
            return node['class']

        child = node['children'].get(sample[node['feature_index']])
        if child is None:
            # Value never seen at this node during training: answer with the
            # majority label of the training samples that reached the node
            # (a class label, not one of the child keys, which are feature values).
            return node['majority_class']
        return self._traverse_tree(sample, child)

def evaluate_predictions(y_true, y_pred):
    """Print and return accuracy, F1, precision, recall and the confusion matrix."""
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Confusion Matrix:")
    print(conf_matrix)
    return accuracy, f1, precision, recall, conf_matrix


# Instantiate and train the custom decision tree on the raw features
custom_tree = DecisionTree()
custom_tree.fit(features, labels)
predictions = custom_tree.predict(test_features)

# Evaluation metrics
accuracy, f1, precision, recall, conf_matrix = evaluate_predictions(test_labels, predictions)

# Determine the best feature index to bin based on information gain
best_binning_feature_index = find_root_node(features, labels)

# Bin the best feature on the training split
feature_needing_binning = features[:, best_binning_feature_index]
binned_feature, bin_edges = bin_cont_feature(feature_needing_binning)

# Keep every bin index inside 0..num_bins-1 and bin the test column with the
# SAME edges learned on the training split; binning only the training column
# would leave the tree comparing bin indices against raw continuous test values.
last_bin = len(bin_edges) - 2
binned_feature = np.clip(binned_feature, 0, last_bin)
binned_test_feature = np.clip(
    np.digitize(test_features[:, best_binning_feature_index], bins=bin_edges) - 1,
    0,
    last_bin,
)

# Work on copies so the extracted feature arrays stay untouched and this cell
# gives the same result when it is re-run.
features_binned = features.copy()
features_binned[:, best_binning_feature_index] = binned_feature
test_features_binned = test_features.copy()
test_features_binned[:, best_binning_feature_index] = binned_test_feature

# Train the custom decision tree with the binned feature
custom_tree = DecisionTree()
custom_tree.fit(features_binned, labels)
predictions = custom_tree.predict(test_features_binned)

# Evaluation metrics
accuracy, f1, precision, recall, conf_matrix = evaluate_predictions(test_labels, predictions)

Accuracy: 0.9395833333333333
F1 Score: 0.9376344086021505
Precision: 1.0
Recall: 0.8825910931174089
Confusion Matrix:
[[233   0]
 [ 29 218]]
Accuracy: 0.9375
F1 Score: 0.9356223175965664
Precision: 0.9954337899543378
Recall: 0.8825910931174089
Confusion Matrix:
[[232   1]
 [ 29 218]]
