# COMP4030 - Data Science and Machine Learning - Coursework 2

Firstly we ensure that the raw data provided by Phyphox is unzipped and ready for import.

In [None]:
# Utility functions - Ashley Hunt - psyah10
import os

def get_gestures():
    """Return a new list with the names of the gesture classes.

    A fresh list is built on every call, so callers may append to it.
    """
    gesture_names = ('circle', 'come', 'go', 'wave')
    return list(gesture_names)

def get_columns():
    """Return a new list with the column names given to every imported recording."""
    column_names = ('time', 'accel_x', 'accel_y', 'accel_z', 'accel_abs')
    return list(column_names)

def get_gesture_csvs(gesture_dir):
    """List the CSV recordings stored in a gesture folder.

    The folder is created when it does not exist yet, so a missing gesture
    yields an empty list.

    Args:
        gesture_dir: Path of the folder holding one gesture's recordings.

    Returns:
        The names (not full paths) of the ``.csv`` files in ``gesture_dir``,
        sorted alphabetically.
    """
    if not os.path.exists(gesture_dir):
        os.makedirs(gesture_dir)
    # os.listdir returns entries in arbitrary order; sorting keeps any
    # numbering derived from this list reproducible between runs and machines.
    return sorted(name for name in os.listdir(gesture_dir) if name.endswith('.csv'))

Next we read the raw data from the CSV files and place the data inside a pandas DataFrame. We also use this opportunity to normalise our data using a MinMaxScaler, ensuring that all of our data lies between 0 and 1. This will ensure that no single field plays a more important role than it should.

In [None]:
# Data importing - Ashley Hunt - psyah10
import pandas as pd

## Simple function to read a csv file and return a dataframe - If scaler is provided, it will scale the data
def get_df(path, scaler=None, trim=True):
    """Load one recording into a DataFrame that uses the notebook's column names.

    Args:
        path: Location of the file; anything not ending in ``.csv`` is skipped.
        scaler: Optional object exposing ``fit_transform``. When given, it is
            fitted on this file alone and its output replaces the raw values.
        trim: When true, the frame is passed through ``trim_recording``.

    Returns:
        The loaded DataFrame, or an empty list for a non-CSV path.
    """
    if not path.endswith('.csv'):
        return []

    recording = pd.read_csv(path)
    recording.columns = get_columns()

    if scaler:
        values = scaler.fit_transform(recording)
    else:
        values = recording
    frame = pd.DataFrame(values, columns=recording.columns)

    if trim:
        return trim_recording(frame)
    return frame

## Function to trim the recording to the first and last significant movement
def trim_recording(df, window_size=20, threshold=0.3, padding=90):
    """Trim a recording to the span between its first and last significant movement.

    A row counts as movement when the rolling mean of ``accel_abs`` over
    ``window_size`` rows reaches ``threshold``. The kept span runs from the
    first such row to the last one, widened by ``padding`` rows on each side.

    Args:
        df: Recording with an ``accel_abs`` column and an integer index.
        window_size: Number of rows in the rolling-mean window.
        threshold: Minimum rolling mean that counts as movement.
        padding: Number of index steps kept before the first and after the
            last movement row.

    Returns:
        A new DataFrame holding the trimmed rows with their original index
        labels. The input frame is not modified. When no row reaches the
        threshold, a copy of the whole recording is returned.
    """
    # Keep the helper signal in its own Series rather than adding a temporary
    # column to the caller's frame.
    activity = df['accel_abs'].rolling(window=window_size, min_periods=1).mean()
    active_labels = activity.index[(activity >= threshold).to_numpy()]
    if len(active_labels) == 0:
        return df.copy()

    # Clamp against the real index bounds. Both limits are index labels, so
    # they must be compared with labels and not with a row count.
    start_label = max(active_labels.min() - padding, df.index.min())
    end_label = min(active_labels.max() + padding, df.index.max())
    return df.loc[start_label:end_label].copy()

## Function to get all the data from the files in the data folder
def get_data_from_files(scaler=None, trim=True, test=False):
    """Load every gesture recording into one DataFrame indexed by (gesture, file_number).

    Training recordings are read from ``data/<gesture>``. With ``test=True``
    they are read from ``data/test/<gesture>/`` and an extra ``unknown``
    gesture folder is included.

    Args:
        scaler: Optional scaler forwarded to ``get_df``.
        trim: Forwarded to ``get_df``; trims each recording when true.
        test: Selects the test folders instead of the training folders.

    Returns:
        A DataFrame with a sorted ``(gesture, file_number)`` MultiIndex, where
        ``file_number`` is the position of the file in its folder listing.

    Raises:
        ValueError: If no recording could be loaded from any folder.
    """
    gestures = get_gestures()
    if test:
        gestures.append('unknown')

    dfs = []
    for gesture in gestures:
        folder_path = f'data/{gesture}' if not test else f'data/test/{gesture}/'
        for file_index, file_name in enumerate(get_gesture_csvs(folder_path)):
            df = get_df(os.path.join(folder_path, file_name), scaler, trim)
            if len(df) == 0:
                continue

            df['file_number'] = int(file_index)
            df['gesture'] = str(gesture)
            dfs.append(df)

    if not dfs:
        # Fail with a readable message instead of an IndexError on dfs[0]
        root = 'data/test' if test else 'data'
        raise ValueError(f"No gesture recordings could be loaded from '{root}'")

    # pd.concat also handles a single frame, so no special case is needed
    df = pd.concat(dfs, ignore_index=True)
    df.set_index(['gesture', 'file_number'], inplace=True)
    df.sort_index(inplace=True)
    return df

In [None]:
# Data importing - Ashley Hunt - psyah10
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# One scaler instance is passed to both the training and the test import;
# get_df calls fit_transform on it for every file it loads.
scaler = MinMaxScaler() ## StandardScaler() or MinMaxScaler()

# Load the untrimmed training recordings, indexed by (gesture, file_number)
df = get_data_from_files(scaler, trim=False)
# Display the loaded frame
df

In [None]:
# Balance distribution - Ashley Hunt - psyah10

def balance_files(df):
    """Drop surplus recordings so that no gesture has far more files than the others.

    Prints the number of files per gesture and their average, then keeps only
    rows whose ``file_number`` is below the cutoff. The cutoff is the distinct
    file number found at position ``int(average)``.

    Args:
        df: DataFrame indexed by a ``(gesture, file_number)`` MultiIndex.

    Returns:
        The rows below the cutoff. When there is no file number at the cutoff
        position (for example when every gesture already has the same number
        of files), ``df`` itself is returned unchanged.
    """
    files_per_gesture = df.reset_index().groupby('gesture')['file_number'].nunique()
    if files_per_gesture.empty:
        print('No files to balance')
        return df

    for gesture in df.index.get_level_values('gesture').unique():
        num_files = files_per_gesture[gesture]
        print(f'{gesture} has {num_files} files (approx. {num_files * 8} gestures)')

    average_num_files = files_per_gesture.mean()
    print(f'\nAverage number of files per gesture: {average_num_files}')

    file_numbers = df.index.get_level_values('file_number')
    distinct_numbers = file_numbers.unique()
    cutoff_position = int(average_num_files)
    if cutoff_position >= len(distinct_numbers):
        # Already balanced: indexing past the end used to raise IndexError
        print('\nRemoved 0 rows from dataset: no file_number lies beyond the average')
        return df

    remove = distinct_numbers[cutoff_position]
    balanced_df = df[file_numbers < remove]

    print(f'\nRemoved {len(df) - len(balanced_df)} rows from dataset where file_number >= {remove}')
    return balanced_df

In [None]:
# Balance distribution - Ashley Hunt - psyah10
# Replace the training frame with its balanced subset, then display it
df = balance_files(df)
df

In [None]:
# Data importing - Ashley Hunt - psyah10

# Load the recordings under data/test/<gesture>/; test=True also looks for an 'unknown' gesture folder
test_df = get_data_from_files(scaler, trim=False, test=True)
# Show which gesture labels were actually found in the test folders
print(test_df.index.unique(level='gesture'))
test_df

Next we visualise our raw data for exploratory analysis

In [None]:
# Visualising data - Ashley Hunt - psyah10
import matplotlib.pyplot as plt

def visualise_df(dataframe, files_per_gesture=2):
    """Plot the x/y/z acceleration of the first recordings of every gesture.

    The figure has one row per gesture returned by ``get_gestures`` and one
    column per recording.

    Args:
        dataframe: Frame indexed by a ``(gesture, file_number)`` MultiIndex.
        files_per_gesture: Number of recordings (file numbers ``0..n-1``)
            drawn for each gesture.
    """
    gesture_names = get_gestures()
    axis_columns = get_columns()[1:4]

    # Size the grid from the arguments (it used to be fixed at 4x2).
    # squeeze=False keeps axs two-dimensional even with a single column.
    fig, axs = plt.subplots(
        len(gesture_names),
        files_per_gesture,
        figsize=(8 * files_per_gesture, 12),
        squeeze=False,
    )

    selection = dataframe.loc[(slice(None), range(0, files_per_gesture)), :]
    for (gesture, file_number), group in selection.groupby(level=['gesture', 'file_number']):
        if gesture not in gesture_names:
            # Labels without a row in the grid (e.g. 'unknown') are skipped
            continue

        ax = axs[gesture_names.index(gesture), file_number]
        for col in axis_columns:
            ax.plot(range(0, len(group)), group[col], label=col)

        # Axis decorations only need to be applied once per subplot
        ax.set_title("{gesture} {file_number}".format(gesture=gesture, file_number=file_number))
        ax.set_xlabel('Index')
        ax.set_ylabel('Acceleration')
        ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Preview the first two recordings of each gesture in the training data
visualise_df(df, files_per_gesture=2)

Next we apply a low-pass filter to reduce noise from the data and make our model more robust

In [None]:
# Low-pass filtering - Ashley Hunt - psyah10
from scipy.signal import butter, filtfilt

def butter_lowpass_filter(data, cutoff_freq, fs, order=5):
    """Low-pass filter ``data`` with a Butterworth filter applied through ``filtfilt``.

    Args:
        data: One-dimensional signal to smooth.
        cutoff_freq: Cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Order of the Butterworth filter.

    Returns:
        The filtered signal, with the same length as ``data``.
    """
    # butter takes the cutoff as a fraction of the Nyquist frequency (fs / 2)
    relative_cutoff = cutoff_freq / (0.5 * fs)
    numerator, denominator = butter(order, relative_cutoff, btype='low', analog=False)
    return filtfilt(numerator, denominator, data)

def apply_filter(df, cutoff_freq=2, fs=50, order=5):
    """Return a copy of ``df`` whose acceleration columns are low-pass filtered.

    Every column named by ``get_columns()`` after the first is filtered. Each
    column is passed to the filter as one signal spanning all rows of the
    frame.

    Args:
        df: Frame containing the acceleration columns.
        cutoff_freq: Cutoff frequency forwarded to ``butter_lowpass_filter``.
        fs: Sampling rate forwarded to ``butter_lowpass_filter``.
        order: Filter order forwarded to ``butter_lowpass_filter``.

    Returns:
        A filtered copy; ``df`` itself is left unchanged.
    """
    filtered = df.copy()
    signal_columns = get_columns()[1:]
    for name in signal_columns:
        filtered[name] = butter_lowpass_filter(filtered[name], cutoff_freq, fs, order)
    return filtered

Now we visualise our data again and, using interactive widgets, visually evaluate how different parameter settings affect the filter.

In [None]:
# Visualising data - Ashley Hunt - psyah10
import matplotlib.pyplot as plt

def df_preview(dataframe, files_per_gesture=2):
    """Plot the x/y/z acceleration of the first recordings of every gesture.

    Used by the interactive filter-tuning widget. The figure has one row per
    gesture returned by ``get_gestures`` and one column per recording.

    Args:
        dataframe: Frame indexed by a ``(gesture, file_number)`` MultiIndex.
        files_per_gesture: Number of recordings (file numbers ``0..n-1``)
            drawn for each gesture.
    """
    gesture_names = get_gestures()
    axis_columns = get_columns()[1:4]

    # plt.subplots always opens a new figure, so no plt.clf() is needed here;
    # calling it first only left an extra, empty figure behind.
    # squeeze=False keeps axs two-dimensional when files_per_gesture is 1.
    fig, axs = plt.subplots(
        len(gesture_names),
        files_per_gesture,
        figsize=(15, files_per_gesture * 4),
        squeeze=False,
    )

    selection = dataframe.loc[(slice(None), range(0, files_per_gesture)), :]
    for (gesture, file_number), group in selection.groupby(level=['gesture', 'file_number']):
        if gesture not in gesture_names:
            # Labels without a row in the grid (e.g. 'unknown') are skipped
            continue

        ax = axs[gesture_names.index(gesture), file_number]
        for col in axis_columns:
            ax.plot(range(0, len(group)), group[col], label=col)

        # Axis decorations only need to be applied once per subplot
        ax.set_title("{gesture} {file_number}".format(gesture=gesture, file_number=file_number))
        ax.set_xlabel('Index')
        ax.set_ylabel('Acceleration')
        ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Filter parameter tuning - Ashley Hunt - psyah10
%matplotlib inline
import ipywidgets as widgets
from IPython.display import display

# One integer slider per argument of update_signal (defined just below)
cutoff_frequency_slider = widgets.IntSlider(value=2, min=1, max=15, step=1, description='cutoff_frequency')
sampling_rate_slider = widgets.IntSlider(value=70, min=25, max=150, step=1, description='sampling_rate')
filter_order_slider = widgets.IntSlider(value=5, min=1, max=100, step=1, description='filter_order')

def update_signal(cutoff_frequency, sampling_rate, filter_order):
    """Filter the global ``df`` with the given settings and redraw the preview.

    Args:
        cutoff_frequency: Cutoff frequency passed to ``apply_filter``.
        sampling_rate: Sampling rate passed to ``apply_filter``.
        filter_order: Filter order passed to ``apply_filter``.
    """
    preview_source = apply_filter(
        df,
        cutoff_freq=cutoff_frequency,
        fs=sampling_rate,
        order=filter_order,
    )
    df_preview(preview_source, files_per_gesture=3)

# Bind the sliders to update_signal so the preview is redrawn whenever a value changes
interactive_plot = widgets.interactive(update_signal, cutoff_frequency=cutoff_frequency_slider, sampling_rate=sampling_rate_slider, filter_order=filter_order_slider)
display(interactive_plot)


Using the best parameters we then apply these settings to our data

In [None]:
# Filter parameter tuning - Ashley Hunt - psyah10

cutoff_frequency = 2  # Cutoff frequency in Hz - Higher = less smoothing
sampling_rate = 70  # Sampling rate in Hz - Higher = more smoothing (the cutoff becomes a smaller fraction of the Nyquist frequency)
filter_order = 5  # Butterworth filter order (original note: higher = less smoothing - TODO confirm)

# Apply the same settings to the training and the test recordings
df = apply_filter(df, cutoff_frequency, sampling_rate, filter_order)
test_df = apply_filter(test_df, cutoff_frequency, sampling_rate, filter_order)

In [None]:
# Display the filtered training data
df

Now we split the data from distinct files into distinct gestures. We do this by using the natural peaks and troughs in absolute acceleration.

In [None]:
# Splitting gestures - Ashley Hunt - psyah10
from scipy.signal import find_peaks

def split_file_to_gestures(df, threshold=0.03, padding=10):
    """Cut every recording into individual gestures around the peaks of ``accel_abs``.

    For each peak found by ``find_peaks`` the segment runs from the last row
    at or before the peak whose ``accel_abs`` is below ``threshold`` to the
    first such row after the peak, widened by ``padding`` rows on both sides.
    When no row is below the threshold on one side, the segment extends to
    that end of the recording.

    A recording with fewer than 6 or more than 9 peaks is reported and plotted
    for inspection, but is still split.

    Args:
        df: Frame indexed by a ``(gesture, file_number)`` MultiIndex that has
            ``time`` and ``accel_abs`` columns.
        threshold: ``accel_abs`` value below which the signal counts as rest.
        padding: Extra rows kept on either side of each segment.

    Returns:
        A DataFrame indexed by ``(gesture, gesture_number)`` without the
        ``time`` column, where ``gesture_number`` counts segments across all
        recordings.
    """
    gesture_data = []
    for (gesture, file_number), group in df.groupby(level=['gesture', 'file_number']):
        # Work on a positionally indexed copy so the peak positions returned
        # by find_peaks can be used directly as labels with .loc
        group = group.reset_index(drop=True).drop(['time'], axis=1)
        signal = group['accel_abs']

        peaks, _ = find_peaks(signal, height=0.2, distance=20, width=10, prominence=0.1)

        if len(peaks) < 6 or len(peaks) > 9:
            print(f"Incorrect peaks in {gesture} {file_number} - {len(peaks)}")
            plt.plot(signal)
            plt.plot(peaks, signal[peaks], "x")
            plt.title(f'{gesture} {file_number}')
            plt.show()

        last_row = len(group) - 1
        # Row labels at which the signal is at rest, computed once per recording
        quiet_rows = signal.index[(signal < threshold).to_numpy()]

        for peak in peaks:
            before = quiet_rows[quiet_rows <= peak]
            after = quiet_rows[quiet_rows > peak]

            # Fall back to the recording's edges when the signal never drops
            # below the threshold on one side (this used to raise TypeError)
            start_index = max(0, before[-1] - padding) if len(before) else 0
            end_index = min(last_row, after[0] + padding) if len(after) else last_row

            data = group.loc[start_index:end_index].copy()
            data['gesture_number'] = len(gesture_data)
            data['gesture'] = gesture
            gesture_data.append(data)

    gesture_df = pd.concat(gesture_data, ignore_index=True)
    gesture_df.set_index(['gesture', 'gesture_number'], inplace=True)
    return gesture_df

In [None]:
# Splitting gestures - Ashley Hunt - psyah10
# Split the filtered training recordings into individual gestures and display them
gesture_df = split_file_to_gestures(df, threshold=0.03, padding=10)
gesture_df

In [None]:
# Splitting gestures - Ashley Hunt - psyah10

# Split the test recordings with the same threshold and padding as the training data
test_gesture_df = split_file_to_gestures(test_df, threshold=0.03, padding=10)

Now we visualise these gestures.

In [None]:
# Visualising data - Ashley Hunt - psyah10

def visualise_gestures(dataframe, n):
    """Plot the x/y/z acceleration of the first ``n`` gestures of every class.

    The figure has one row per gesture returned by ``get_gestures`` and ``n``
    columns. The y axis of every subplot is fixed to the range 0..1.

    Args:
        dataframe: Frame indexed by a ``(gesture, gesture_number)`` MultiIndex.
        n: Number of gestures drawn for each class.
    """
    gesture_names = get_gestures()
    axis_columns = get_columns()[1:4]
    # Number of charts already drawn per gesture (replaces repeated list.count)
    drawn = {}

    plt.figure(figsize=(n * 4, 10))
    for (gesture, gesture_number), group in dataframe.groupby(level=['gesture', 'gesture_number']):
        if gesture not in gesture_names:
            # Labels without a row in the grid (e.g. 'unknown') are skipped
            continue

        position = drawn.get(gesture, 0) + 1
        if position > n:
            continue
        drawn[gesture] = position

        plt.subplot(len(gesture_names), n, (n * gesture_names.index(gesture)) + position)
        for col in axis_columns:
            plt.plot(range(len(group)), group[col], label=col)

        # Title and limits only need to be set once per subplot
        plt.title("{gesture} {gesture_number}".format(gesture=gesture, gesture_number=gesture_number))
        plt.ylim(0, 1)

    plt.subplots_adjust(hspace=0.4)
    plt.show()

In [None]:
# Visualising data - Ashley Hunt - psyah10

# Show the first six extracted gestures of each class
visualise_gestures(gesture_df, 6)

If, due to slicing errors, there are duplicate gestures we will remove them

In [None]:
# Removing duplicate slices - Ashley Hunt - psyah10

def remove_duplicate_gestures(dataframe):
    """Return a copy of ``dataframe`` without gestures whose data repeats an earlier one.

    Two gestures count as identical when their rows, restricted to the
    acceleration columns and sorted by value, are the same. The first
    occurrence is kept and every later one is dropped and reported.

    Args:
        dataframe: Frame indexed by a ``(gesture, gesture_number)`` MultiIndex.

    Returns:
        A new DataFrame without the duplicate gestures.
    """
    data_columns = get_columns()[1:]
    group_columns = ['gesture', 'gesture_number']

    first_seen = {}
    duplicate_names = []
    for name, group in dataframe.groupby(group_columns):
        fingerprint = tuple(
            group.sort_values(by=data_columns)[data_columns].itertuples(index=False, name=None)
        )

        if fingerprint in first_seen:
            duplicate_names.append(name)
            print("Removing identical gestures", name, "and", first_seen[fingerprint])
        else:
            # Only the kept gesture is remembered, so later duplicates are
            # always reported against a gesture that is still in the data
            first_seen[fingerprint] = name

    if not duplicate_names:
        print("No duplicate gestures found")
        return dataframe.copy()

    # Drop all duplicates in one pass instead of rebuilding the frame each time
    return dataframe.drop(index=duplicate_names)

In [None]:
# Removing duplicate slices - Ashley Hunt - psyah10

# De-duplicate the training and the test gestures independently of each other
gesture_df = remove_duplicate_gestures(gesture_df)
test_gesture_df = remove_duplicate_gestures(test_gesture_df)

Next we apply a Fourier transformation on the data and filter out the frequencies in the data that are not adding useful information. This is achieved using trial and error through visualisation of the wave before and after transformation.

In [None]:
# Fourier Transformation - Ashley Hunt - psyah10
import numpy as np

def fft_filter(data, sample_rate, cutoff_freq_low, cutoff_freq_high):
    """Band-pass filter a real signal by zeroing FFT bins outside a frequency band.

    Only components with ``cutoff_freq_low < f < cutoff_freq_high`` are kept;
    both bounds are exclusive, so the constant (0 Hz) component is removed
    whenever ``cutoff_freq_low`` is not negative.

    Args:
        data: One-dimensional real-valued signal.
        sample_rate: Sampling rate of ``data`` in Hz.
        cutoff_freq_low: Lower edge of the pass band in Hz.
        cutoff_freq_high: Upper edge of the pass band in Hz.

    Returns:
        A float NumPy array of the same length as ``data``.
    """
    samples = np.asarray(data, dtype=float)
    if samples.size == 0:
        return samples.copy()

    # The real-input transform keeps each positive-frequency bin paired with
    # its negative counterpart. Masking a full FFT on positive frequencies
    # alone halves the amplitude of the reconstructed signal.
    spectrum = np.fft.rfft(samples)
    freqs = np.fft.rfftfreq(samples.size, 1 / sample_rate)

    keep = (freqs > cutoff_freq_low) & (freqs < cutoff_freq_high)
    spectrum[~keep] = 0

    return np.fft.irfft(spectrum, n=samples.size)

def apply_fft_filter(data, sample_rate, cutoff_freq_low, cutoff_freq_high):
    """Return a copy of ``data`` with the x/y/z acceleration columns band-pass filtered.

    Each column is handed to ``fft_filter`` as one signal spanning all rows of
    the frame.

    Args:
        data: Frame containing the acceleration columns.
        sample_rate: Sampling rate forwarded to ``fft_filter``.
        cutoff_freq_low: Lower band edge forwarded to ``fft_filter``.
        cutoff_freq_high: Upper band edge forwarded to ``fft_filter``.

    Returns:
        A filtered copy; ``data`` itself is left unchanged.
    """
    filtered = data.copy()
    axis_columns = get_columns()[1:4]
    for name in axis_columns:
        filtered[name] = fft_filter(filtered[name], sample_rate, cutoff_freq_low, cutoff_freq_high)
    return filtered

def visually_compare_fft(original_df, filtered_df, visualise_n=3):
    """Draw original and FFT-filtered gestures side by side, one figure per gesture.

    Args:
        original_df: Unfiltered gestures indexed by ``(gesture, gesture_number)``.
        filtered_df: Filtered gestures carrying the same index.
        visualise_n: Number of gestures shown for each gesture class.
    """
    axis_columns = get_columns()[1:4]

    for name, group in original_df.groupby(level='gesture'):
        # Restrict this class to its first visualise_n gesture numbers
        gesture_numbers = group.index.get_level_values('gesture_number')
        chosen_numbers = gesture_numbers.unique()[:visualise_n]
        chosen_rows = group[gesture_numbers.isin(chosen_numbers)]

        for n, g_data in chosen_rows.groupby(level='gesture_number'):
            plt.figure(figsize=(10, 2))

            # Left panel: the gesture before filtering
            plt.subplot(1, 2, 1)
            original_positions = range(len(g_data))
            for col in axis_columns:
                plt.plot(original_positions, g_data[col], label=f'Original {col}', alpha=0.5)
            plt.title(f"{name}, {n}")

            # Right panel: the same gesture looked up in the filtered frame
            plt.subplot(1, 2, 2)
            filtered_data = filtered_df.loc[(name, n)]
            filtered_positions = range(len(filtered_data))
            for col in axis_columns:
                plt.plot(filtered_positions, filtered_data[col], label=f'Filtered {col}', linestyle='--')
            plt.title(f'Filtered {name}')

            plt.tight_layout()
            plt.show()

In [None]:
# Fourier Transformation - Ashley Hunt - psyah10
 
# NOTE(review): the low-pass step earlier in this notebook uses sampling_rate = 70 - confirm which value matches the recordings
sample_rate = 1000  # Sampling rate (Hz)
cutoff_low = 1  # Low cutoff frequency (Hz)
cutoff_high = 20  # High cutoff frequency (Hz) 

# Filter the training and the test gestures with identical settings
fft_filtered_gestures_df = apply_fft_filter(gesture_df, sample_rate, cutoff_low, cutoff_high)
fft_test_filtered_gestures_df = apply_fft_filter(test_gesture_df, sample_rate, cutoff_low, cutoff_high)

In [None]:
# Visualise the FFT-filtered data alongside the original data (one gesture per class)
visually_compare_fft(gesture_df, fft_filtered_gestures_df, 1)

Next we can begin to extract our features for the models. These functions split the data into a fixed number of segments with some overlap and then take some key statistical values from these slices.

In [None]:
# Feature extraction - Ashley Hunt - psyah10

def extract_segments(df, num_segments, overlap_fraction):
    """Split ``df`` row-wise into ``num_segments`` overlapping slices.

    The base segment length is ``len(df) // num_segments``. Every segment
    after the first starts ``overlap`` rows early, and every segment before
    the last also runs ``overlap`` rows past its base length, where
    ``overlap = int(base * overlap_fraction)``. The last segment always
    extends to the final row.

    Args:
        df: Object supporting ``len`` and positional slicing through ``.iloc``.
        num_segments: Number of slices to produce.
        overlap_fraction: Overlap expressed as a fraction of the base length.

    Returns:
        A list of ``num_segments`` slices of ``df``.
    """
    total_rows = len(df)
    base_length = total_rows // num_segments
    overlap = int(base_length * overlap_fraction)
    last_position = num_segments - 1

    def bounds(position):
        # The first segment has no predecessor to overlap with
        start = position * base_length - (overlap if position > 0 else 0)
        stop = total_rows if position == last_position else start + base_length + overlap
        return start, stop

    return [df.iloc[slice(*bounds(position))] for position in range(num_segments)]

def extract_features_from_df(df, feature_functions, num_segments = 4, overlap_fraction = 0.1):
    """Build one feature row per gesture from per-segment aggregates.

    Every gesture is cut into segments with ``extract_segments``. For each
    segment, each x/y/z acceleration column and each aggregate in
    ``feature_functions``, one value is stored under the column name
    ``<column>_<aggregate>_<segment number>`` (segments are numbered from 1).

    Args:
        df: Gestures indexed by a ``(gesture, gesture_number)`` MultiIndex.
        feature_functions: Aggregates accepted by ``Series.agg``.
        num_segments: Number of segments per gesture.
        overlap_fraction: Overlap between neighbouring segments.

    Returns:
        A DataFrame with a ``gesture`` column followed by the feature columns,
        one row per gesture.
    """
    axis_columns = get_columns()[1:4]
    rows = []

    for (gesture, gesture_number), group in df.groupby(level=['gesture', 'gesture_number']):
        samples = group.reset_index(drop=True)
        segments = extract_segments(samples, num_segments, overlap_fraction)

        # 'gesture' goes in first so it stays the leading column
        features = {'gesture': gesture}
        for position, segment in enumerate(segments, start=1):
            for col in axis_columns:
                features.update(
                    {f'{col}_{f}_{position}': segment[col].agg(f) for f in feature_functions}
                )
        rows.append(features)

    return pd.DataFrame(rows)

In [None]:
# Feature extraction - Ashley Hunt - psyah10

# Function options 'mean', 'min', 'max', 'median', 'std', 'skew', 'kurtosis', 'quantile'
# Aggregates computed for every segment and acceleration column
feature_functions = ['mean', 'min', 'max', 'std', 'kurtosis', 'skew'] 
# Number of segments per gesture and the overlap between neighbouring segments;
# show_trends and show_trend also read num_segments and feature_functions as globals
num_segments = 8
overlap_fraction = 0.2

In [None]:
# Feature extraction - Ashley Hunt - psyah10

# Extract the same features from the FFT-filtered training and test gestures
extracted_features = extract_features_from_df(fft_filtered_gestures_df, feature_functions, num_segments, overlap_fraction)
extracted_test_features = extract_features_from_df(fft_test_filtered_gestures_df, feature_functions, num_segments, overlap_fraction)

In [None]:
# Visualising feature extraction for feature selection - Ashley Hunt - psyah10

def show_trends(dataframe):
    """Plot, for every aggregate, the per-gesture mean feature value across segments.

    One figure is drawn per entry of the global ``feature_functions``, with
    one panel per x/y/z acceleration column and one line per gesture. The
    number of segments comes from the global ``num_segments``.

    Args:
        dataframe: Feature table with a ``gesture`` column and columns named
            ``<column>_<aggregate>_<segment number>``.
    """
    segment_numbers = range(1, num_segments + 1)

    for f in feature_functions:
        plt.figure(figsize=(15, 3))
        for panel, col in enumerate(get_columns()[1:4], start=1):
            plt.subplot(1, 3, panel)
            plt.title(col)
            for g in get_gestures():
                gesture_rows = dataframe.loc[dataframe["gesture"] == g]
                # Average each segment's feature over all gestures of this class
                points = [gesture_rows[f'{col}_{f}_{s}'].mean() for s in segment_numbers]
                plt.plot(segment_numbers, points, marker='o', linestyle='-', label=g)
                plt.xlabel('Segment')
                plt.ylabel(f)
                plt.legend()
                plt.xticks(segment_numbers)
        plt.suptitle(f"{f} by segment")
        plt.tight_layout()
        plt.show()

In [None]:
# Visualising feature extraction for feature selection - Ashley Hunt - psyah10

# Plot the per-segment averages of every extracted feature for the training gestures
show_trends(extracted_features)

In [None]:
# Visualising feature extraction for feature selection - Ashley Hunt - psyah10

def show_trend(dataframe, function, charts_per_gesture=4):
    """Plot one aggregate across segments for the first few gestures of each class.

    One figure is drawn per gesture class that has rows in ``dataframe``.
    Each panel shows a single gesture, with one line per x/y/z acceleration
    column. The number of segments comes from the global ``num_segments``.

    Args:
        dataframe: Feature table with a ``gesture`` column and columns named
            ``<column>_<aggregate>_<segment number>``.
        function: Name of the aggregate to plot (e.g. ``'mean'``).
        charts_per_gesture: Maximum number of gestures drawn per class.
    """
    axis_columns = get_columns()[1:4]
    segment_numbers = range(1, num_segments + 1)

    for g in get_gestures():
        gesture_rows = dataframe.loc[dataframe["gesture"] == g]
        if len(gesture_rows) == 0:
            continue

        plt.figure(figsize=(charts_per_gesture * 4, 3))
        # A figure-level title: plt.title here would create an extra axes
        # underneath the subplots added below
        plt.suptitle(f"{charts_per_gesture} {g}'s {function}")

        # Never index past the rows that exist for this gesture
        chart_count = min(charts_per_gesture, len(gesture_rows))
        for graphs in range(chart_count):
            plt.subplot(1, charts_per_gesture, graphs + 1)

            data = gesture_rows.iloc[graphs]
            for col in axis_columns:
                points = [data[f'{col}_{function}_{s}'] for s in segment_numbers]
                plt.plot(segment_numbers, points, marker='o', linestyle='-', label=col)
            plt.legend()
            plt.xticks(segment_numbers)
        plt.show()

In [None]:
# Visualisting Feature extraction for feature selection - Ashley Hunt - psyah10

# for f in feature_functions:
#       show_trend(extracted_features, f, 3)

In [None]:
# Balance data - Ashley Hunt - psyah10

# Find the number of gestures in the least-represented class
min_gestures = extracted_features['gesture'].value_counts().min()

# Keep only the first min_gestures rows of every class so all classes have the same size
extracted_features = extracted_features.groupby('gesture').head(min_gestures)

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

def extract_principle_components(dataframe, components=2):
    """Project the feature table onto its principal components and plot them pairwise.

    Args:
        dataframe: Feature table with a ``gesture`` column; every other column
            is used as PCA input.
        components: Number of principal components to compute.

    Returns:
        A tuple ``(principle_df, gestures)``: the component scores with columns
        ``principal component 1..n``, and the gesture labels after the index
        has been reset.
    """
    flat = dataframe.reset_index()
    gestures = flat['gesture']
    # Everything except the label and the column left behind by reset_index
    features = flat.drop(columns=['gesture', 'index'], errors='ignore')

    pca = PCA(n_components=components)
    scores = pca.fit_transform(features)
    component_names = [f'principal component {i}' for i in range(1, components + 1)]
    principle_df = pd.DataFrame(data=scores, columns=component_names)

    # Visualising the PCA results: a grid of scatter plots, diagonal left empty
    plt.figure(figsize=(components * 4, components * 4))
    for i in range(1, components + 1):
        for j in range(1, components + 1):
            if i != j:
                plt.subplot(components, components, (i - 1) * components + j)
                sns.scatterplot(x=f"principal component {i}", y=f"principal component {j}", data=principle_df, hue=gestures, palette="deep")
                plt.title(f"Principle Component Extraction {i} vs {j}")
    plt.tight_layout()
    plt.show()
    return principle_df, gestures

# Compute and plot the first four principal components of the training features
principle_features, principle_gestures = extract_principle_components(extracted_features, components=4)

In [None]:
# Label encoding - Ashley Hunt - psyah10
from sklearn.preprocessing import LabelEncoder

# Module-level encoder shared by encode_labels and decode_labels
label_encoder = LabelEncoder()

def encode_labels(labels):
    """Fit the shared ``label_encoder`` on ``labels`` and return their encoded form.

    The encoder is re-fitted on every call.
    """
    encoded = label_encoder.fit_transform(labels)
    return encoded

def decode_labels(encoded_labels):
    """Map encoded labels back to the original labels using the shared ``label_encoder``."""
    decoded = label_encoder.inverse_transform(encoded_labels)
    return decoded

In [None]:
# Data splitting - Ashley Hunt - psyah10
from sklearn.model_selection import train_test_split

# Features: every column except the gesture label. Labels: the encoded gesture names.
X = extracted_features.loc[:, extracted_features.columns != 'gesture']
Y = encode_labels(extracted_features['gesture'])

# Alternative input kept for reference (not executed):
# X = principalDf
# Y = encode_labels(gestures)

# Features of the separately recorded test gestures, in the row order of extracted_test_features
UNSEEN_FEATURES = extracted_test_features.loc[:, extracted_test_features.columns != 'gesture']

# Hold out 30% of the training gestures for evaluation; random_state fixes the shuffle
train_features, test_features, train_labels, test_labels = train_test_split(X, Y, test_size=0.3, random_state=42)

In [None]:
# Random Forest Training - Ashley Hunt - psyah10
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; random_state fixes the seed
rfc_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_model.fit(train_features, train_labels)

In [None]:
# Random Forest Evaluation - Ashley Hunt - psyah10
from sklearn.metrics import accuracy_score, classification_report

# Evaluate on the 30% hold-out split of the training gestures
rfc_prediction = rfc_model.predict(test_features)

print(f"Accuracy: {accuracy_score(test_labels, rfc_prediction)}")
print(classification_report(test_labels, rfc_prediction))

# Predict the separately recorded gestures and decode the class ids once
unseen_feature_prediction = rfc_model.predict(UNSEEN_FEATURES)
predicted_unseen_data = decode_labels(unseen_feature_prediction)
print("Prediction on unseen data:", predicted_unseen_data)

# Take the true labels from the rows UNSEEN_FEATURES was built from, so they
# always line up with the predictions whatever the number or order of test
# gestures. Rows labelled 'unknown' can never match a trained class and count
# as misses.
actual_unseen_data = list(extracted_test_features['gesture'])
print("Actual unseen data:", actual_unseen_data)
unseen_results = [a == b for a, b in zip(actual_unseen_data, predicted_unseen_data)]
if unseen_results:
    print("Accuracy:", unseen_results.count(True) / len(unseen_results))
else:
    print("Accuracy: no unseen gestures to evaluate")


In [None]:
# Support Vector Machines - Ashley Hunt - psyah10
from sklearn import svm

# Support vector classifier trained on the same split as the random forest
svm_model = svm.SVC(kernel='linear') # Linear Kernel

svm_model.fit(train_features, train_labels)

In [None]:
# Evaluate the SVM on the hold-out split
svm_y_pred = svm_model.predict(test_features)

print(f"Accuracy: {accuracy_score(test_labels, svm_y_pred)}")
print(classification_report(test_labels, svm_y_pred))

In [None]:
# Ashley Hunt - psyah10
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Candidate values for the random forest hyper-parameter search
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# NOTE(review): max_features is defined but is not part of random_grid below,
# so it is never searched - confirm whether it should be added.
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap
               }
rf = RandomForestClassifier()

# The search is slow, so it only runs when explicitly enabled
DO_MODEL_TUNING = False

if DO_MODEL_TUNING:
    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1, error_score='raise')
    rf_random.fit(train_features, train_labels)

    # A bare expression inside an if-block is not displayed by the notebook,
    # so the result is printed explicitly
    print("Best parameters:", rf_random.best_params_)

In [None]:
# def evaluate(model, test_features, test_labels):
#     predictions = model.predict(test_features)
#     errors = abs(predictions - test_labels)
#     mape = 100 * np.mean(errors / test_labels)
#     accuracy = 100 - mape
#     print('Model Performance')
#     print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
#     print('Accuracy = {:0.2f}%.'.format(accuracy))
#     return accuracy

# base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
# base_model.fit(features, encoded_labels)
# base_accuracy = evaluate(base_model, test_features, test_labels)

# best_random = rf_random.best_estimator_
# random_accuracy = evaluate(best_random, test_features, test_labels)

# print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
# Amit Kumar
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

# Splitter configured for 5 shuffles with a 20% test share; in this cell it is only referenced by the commented-out code below
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# for train_index, test_index in sss.split(features, encoded_labels):
#     train_features, test_features = features[train_index], features[test_index]
#     train_labels, test_labels = encoded_labels[train_index], encoded_labels[test_index]
#     print(f"  Test:  index={test_index}")
#     print(f"  Train: index={train_index}")
    
# rfc = RandomForestClassifier(random_state=42)
 
# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }
 
# grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring='balanced_accuracy')
# grid_search.fit(train_features, train_labels)

# best_params = grid_search.best_params_
# best_estimator = grid_search.best_estimator_
 
# y_pred = best_estimator.predict(test_features)
# balanced_accuracy = balanced_accuracy_score(test_labels, y_pred)
# print("Best Parameters:", best_params)
# print("Balanced Accuracy:", balanced_accuracy)

In [None]:
# Amit Kumar
# from xgboost import XGBClassifier
 
# sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
 
# for train_index, test_index in sss.split(X, y):
#     train_features, test_features = X[train_index], X[test_index]
#     train_labels, test_labels = y[train_index], y[test_index]
    
# xgb_classifier = XGBClassifier(random_state=42)
 
# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.1, 0.01, 0.001],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0]
# }
 
# grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, scoring='balanced_accuracy', cv=5)
# grid_search.fit(train_features, train_labels)
 
# best_params = grid_search.best_params_
# best_estimator = grid_search.best_estimator_
 
# y_pred = best_estimator.predict(test_features)
 
# balanced_accuracy = balanced_accuracy_score(test_labels, y_pred)
# print("Best Parameters:", best_params)
# print("Balanced Accuracy:", balanced_accuracy)