In [15]:
import pandas as pd
import numpy as np
from scipy.fft import fft
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('Dataset/dataset_no_outlier.csv')

# Print first few rows and column names to understand the format
print(data.head())
print(data.columns)

# Assuming the class label column is named differently, let's identify it
class_column = 'ClassLabel'  # Replace with the actual column name if different

# If necessary, rename the class column to 'ClassLabel'
data.rename(columns={class_column: 'ClassLabel'}, inplace=True)

# Check the data types of the columns
print(data.dtypes)

# Aggregate every 10 rows into a sequence
def aggregate_data(df, window_size=10):
    aggregated_data = {'X': [], 'Y': [], 'Z': [], 'ClassLabel': []}
    
    for i in range(0, len(df), window_size):
        if i + window_size <= len(df):
            window = df.iloc[i:i+window_size]
            aggregated_data['X'].append(window['X'].values)
            aggregated_data['Y'].append(window['Y'].values)
            aggregated_data['Z'].append(window['Z'].values)
            # Assuming all rows in the window belong to the same class
            aggregated_data['ClassLabel'].append(window['ClassLabel'].mode()[0])
    
    return pd.DataFrame(aggregated_data)

# Aggregate data
agg_data = aggregate_data(data, window_size=10)

# Function to apply FFT and extract features
def extract_fft_features(row):
    # Apply FFT to each of the signal columns
    fft_x = np.abs(fft(row['X']))
    fft_y = np.abs(fft(row['Y']))
    fft_z = np.abs(fft(row['Z']))
    
    # Take the first few coefficients as features
    features = np.concatenate([fft_x[:5], fft_y[:5], fft_z[:5]])
    return features

# Apply the function to each row in the DataFrame
fft_features = agg_data.apply(extract_fft_features, axis=1)

# Convert to a DataFrame
fft_features_df = pd.DataFrame(fft_features.tolist())

# Combine FFT features with the class label
fft_features_df['ClassLabel'] = agg_data['ClassLabel']

# Export FFT features to CSV
fft_features_df.to_csv('fft_features.csv', index=False)

# Features and labels
X = fft_features_df.drop(columns=['ClassLabel'])
y = fft_features_df['ClassLabel']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the Naive Bayes classifier
nb_clf = GaussianNB()

# Apply cross-validation
nb_cv_scores = cross_val_score(nb_clf, X, y, cv=10)  # 10-fold cross-validation

# Print cross-validation scores
print(f'Naive Bayes Cross-validation scores: {nb_cv_scores}')
print(f'Naive Bayes Mean cross-validation score: {nb_cv_scores.mean()}')

# Train the Naive Bayes classifier
nb_clf.fit(X_train, y_train)

# Make predictions
nb_y_pred = nb_clf.predict(X_test)

# Calculate accuracy
nb_accuracy = accuracy_score(y_test, nb_y_pred)
print(f'Naive Bayes Accuracy: {nb_accuracy}')

# Print classification report
print('Naive Bayes Classification Report:')
print(classification_report(y_test, nb_y_pred))

# Initialize the Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Apply cross-validation
rf_cv_scores = cross_val_score(rf_clf, X, y, cv=10)  # 10-fold cross-validation

# Print cross-validation scores
print(f'Random Forest Cross-validation scores: {rf_cv_scores}')
print(f'Random Forest Mean cross-validation score: {rf_cv_scores.mean()}')

# Train the Random Forest classifier
rf_clf.fit(X_train, y_train)

# Make predictions
rf_y_pred = rf_clf.predict(X_test)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')

# Print classification report
print('Random Forest Classification Report:')
print(classification_report(y_test, rf_y_pred))

# Hyperparameter tuning for Random Forest (Optional)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_rf_clf = grid_search.best_estimator_

# Make predictions with the best estimator
best_rf_y_pred = best_rf_clf.predict(X_test)

# Calculate accuracy
best_rf_accuracy = accuracy_score(y_test, best_rf_y_pred)
print(f'Best Random Forest Accuracy after Hyperparameter Tuning: {best_rf_accuracy}')

# Print classification report
print('Best Random Forest Classification Report:')
print(classification_report(y_test, best_rf_y_pred))


          X         Y         Z     Mixed  ClassLabel
0  0.125022  0.094986  0.001297  0.157018         1.0
1  0.150710  0.083282 -0.023514  0.173788         1.0
2  0.102941  0.111084  0.010075  0.151782         1.0
3  0.038450  0.049911  0.007511  0.063451         1.0
4 -0.029148 -0.105423  0.017124  0.110711         1.0
Index(['X', 'Y', 'Z', 'Mixed', 'ClassLabel'], dtype='object')
X             float64
Y             float64
Z             float64
Mixed         float64
ClassLabel    float64
dtype: object
Naive Bayes Cross-validation scores: [0.57142857 0.51020408 0.59183673 0.63265306 0.55102041 0.69387755
 0.67346939 0.66666667 0.6875     0.625     ]
Naive Bayes Mean cross-validation score: 0.6203656462585034
Naive Bayes Accuracy: 0.5510204081632653
Naive Bayes Classification Report:
              precision    recall  f1-score   support

         1.0       0.64      0.47      0.54        15
         2.0       1.00      1.00      1.00         4
         3.0       0.45      0.62      0.

120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
78 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Python311\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Invalid

Best Random Forest Accuracy after Hyperparameter Tuning: 0.4489795918367347
Best Random Forest Classification Report:
              precision    recall  f1-score   support

         1.0       0.50      0.33      0.40        15
         2.0       1.00      1.00      1.00         4
         3.0       0.31      0.62      0.42         8
         4.0       0.36      0.83      0.50         6
         5.0       0.60      0.19      0.29        16

    accuracy                           0.45        49
   macro avg       0.55      0.60      0.52        49
weighted avg       0.53      0.45      0.43        49

