<a href="https://colab.research.google.com/github/aadityane93/SortingLine-ml/blob/main/Feature_Selection_and_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Python Libraries

In [1]:
!pip install tsfresh
from google.colab import files
import numpy as np
import pandas as pd
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA



# Importing Files from Google Drive

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# folder_id = '1fX9iKRJcP8UwyI_xhI-TlJdxQj1NzvBz' # 20% files
folder_id ='1WFFJcTKOLcoFFzzm29Ch33qotT2h5gX3' #50% files
# folder_id ='1lcYHGesfAhSlpaJu3CwDWo65R6bnaqCo' #80% files

folder_list = drive.ListFile({'q': "'{}' in parents and mimeType='application/vnd.google-apps.folder' and trashed=false".format(folder_id)}).GetList()


for folder in folder_list:
    folder_title = folder['title']
    print(f"Processing folder: {folder_title}")
    file_list = drive.ListFile({'q': "'{}' in parents and trashed=false".format(folder['id'])}).GetList()

    for index, file in enumerate(file_list):
        file_title = file['title']
        print(f"Downloading {file_title}")
        file.GetContentFile(file_title)



Processing folder: 800hz extracted
Downloading extracted-800hz-test-feature-50percent.csv
Downloading extracted-800hz-test-target-50percent.csv
Downloading extracted-800hz-train-target-50percent.csv
Downloading extracted-800hz-train-feature-50percent.csv
Processing folder: 400hz extracted
Processing folder: 100hz extracted
Downloading extracted-100hz-train-feature-50percent-1000files.csv
Downloading extracted-100hz-test-feature-50percent-1000files.csv
Downloading extracted-100hz-test-target-50percent-1000files.csv
Downloading extracted-100hz-train-target-50percent-1000files.csv


# Reading CSV files

In [3]:
hz = 800
percent = 50

def read_file(filename):
  return pd.read_csv(filename, sep=',')

extracted_train_feature = pd.DataFrame(read_file(f'extracted-{hz}hz-train-feature-{percent}percent.csv'))
extracted_test_feature = pd.DataFrame(read_file(f'extracted-{hz}hz-test-feature-{percent}percent.csv'))
extracted_train_target = pd.DataFrame(read_file(f'extracted-{hz}hz-train-target-{percent}percent.csv'))
extracted_test_target = pd.DataFrame(read_file(f'extracted-{hz}hz-test-target-{percent}percent.csv'))

extracted_train_target = extracted_train_target.drop(columns=['Unnamed: 0'])
extracted_test_target = extracted_test_target.drop(columns=['Unnamed: 0'])
extracted_train_feature = extracted_train_feature.drop(columns=['Unnamed: 0'])
extracted_test_feature = extracted_test_feature.drop(columns=['Unnamed: 0'])

# Feature Selecion

In [4]:
selected_features_train = select_features(extracted_train_feature, extracted_train_target['color'])
selected_features_test = extracted_test_feature[selected_features_train.columns]

In [5]:
extracted_train_feature.head()

Unnamed: 0,I_In__variance_larger_than_standard_deviation,I_In__has_duplicate_max,I_In__has_duplicate_min,I_In__has_duplicate,I_In__sum_values,I_In__abs_energy,I_In__mean_abs_change,I_In__mean_change,I_In__mean_second_derivative_central,I_In__median,...,Temp(°C)__fourier_entropy__bins_5,Temp(°C)__fourier_entropy__bins_10,Temp(°C)__fourier_entropy__bins_100,Temp(°C)__permutation_entropy__dimension_3__tau_1,Temp(°C)__permutation_entropy__dimension_4__tau_1,Temp(°C)__permutation_entropy__dimension_5__tau_1,Temp(°C)__permutation_entropy__dimension_6__tau_1,Temp(°C)__permutation_entropy__dimension_7__tau_1,Temp(°C)__query_similarity_count__query_None__threshold_0.0,Temp(°C)__mean_n_absolute_max__number_of_maxima_7
0,0.0,1.0,1.0,1.0,1977.74,798.2818,0.009838,6e-05,-4e-06,0.37,...,0.882111,1.239139,2.862287,0.171539,0.256905,0.341994,0.426801,0.530712,0.0,35.38
1,0.0,1.0,1.0,1.0,1774.69,727.4351,0.009924,7.2e-05,0.0,0.37,...,0.859734,1.117167,2.746032,0.173866,0.260386,0.346621,0.432567,0.538297,0.0,35.38
2,0.0,1.0,1.0,1.0,1939.15,783.9183,0.00966,4.6e-05,4e-06,0.37,...,0.815593,1.154067,2.682073,0.178493,0.267291,0.355781,0.443956,0.554093,0.0,35.38
3,0.0,1.0,1.0,1.0,1887.17,759.3447,0.009686,5.1e-05,0.0,0.37,...,0.852957,1.209221,2.800608,0.165534,0.247936,0.330088,0.411985,0.515183,0.0,35.38
4,0.0,1.0,1.0,1.0,2002.71,810.5145,0.009848,5.6e-05,0.0,0.37,...,0.887858,1.249438,2.911501,0.186467,0.279194,0.371573,0.463597,0.580967,0.0,35.38


In [6]:
selected_features_train.head()

Unnamed: 0,I_Out(A)__abs_energy,I_In__abs_energy,I_Out(A)__energy_ratio_by_chunks__num_segments_10__segment_focus_7,I_Out(A)__sum_values,I_Out(A)__sum_of_reoccurring_data_points,"I_Out(A)__fft_coefficient__attr_""abs""__coeff_0","I_Out(A)__fft_coefficient__attr_""real""__coeff_0",I_In__sum_values,I_In__sum_of_reoccurring_data_points,"I_In__fft_coefficient__attr_""real""__coeff_0",...,Temp(°C)__energy_ratio_by_chunks__num_segments_10__segment_focus_3,"I_Out(A)__fft_coefficient__attr_""real""__coeff_93",Temp(°C)__partial_autocorrelation__lag_7,Temp(°C)__binned_entropy__max_bins_10,"I_In__fft_coefficient__attr_""real""__coeff_83","V_In__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max""","V_In__fft_coefficient__attr_""real""__coeff_64","I_Out(A)__fft_coefficient__attr_""real""__coeff_36","I_Out(A)__fft_coefficient__attr_""abs""__coeff_72","I_In__fft_coefficient__attr_""imag""__coeff_86"
0,517.6085,798.2818,0.080636,1569.77,1569.77,1569.77,1569.77,1977.74,1977.74,1977.74,...,0.100135,13.975952,-0.01985,0.372093,-10.874078,8.801458,0.959434,-23.014467,15.120978,-0.438043
1,476.2405,727.4351,0.089917,1422.53,1422.53,1422.53,1422.53,1774.69,1774.69,1774.69,...,0.100081,-8.854706,-0.009602,0.376753,15.844641,8.78695,1.612304,-6.815895,10.774303,10.148619
2,510.4352,783.9183,0.077622,1548.96,1548.96,1548.96,1548.96,1939.15,1939.15,1939.15,...,0.100156,5.801261,0.0302,0.390436,-12.606943,8.784169,8.786931,-13.247585,11.377868,-8.839917
3,504.3916,759.3447,0.076024,1519.94,1519.94,1519.94,1519.94,1887.17,1887.17,1887.17,...,0.100064,6.822597,-0.018227,0.362265,-11.177412,8.788103,0.231034,-12.002991,19.726945,-13.229477
4,524.137,810.5145,0.077922,1590.7,1590.7,1590.7,1590.7,2002.71,2002.71,2002.71,...,0.100088,6.004394,-0.045281,0.393642,-10.804195,8.791819,-3.753572,-21.752296,15.105383,0.285768


In [7]:
extracted_train_target.head()

Unnamed: 0,color
0,0
1,0
2,0
3,0
4,0


# Training Models

## Random Forest

In [8]:
model = RandomForestClassifier()
model.fit(selected_features_train,extracted_train_target)
y_pred = model.predict(selected_features_test)
accuracy = accuracy_score(extracted_test_target, y_pred)
precision = precision_score(extracted_test_target, y_pred, average='weighted')
print(classification_report(extracted_test_target, y_pred))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(confusion_matrix(extracted_test_target, y_pred))

  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       500
           1       0.55      1.00      0.71       500
           2       1.00      0.35      0.52       500
           3       1.00      0.80      0.89       500

    accuracy                           0.79      2000
   macro avg       0.88      0.79      0.78      2000
weighted avg       0.88      0.79      0.78      2000

Accuracy: 0.7880
Precision: 0.8822
[[500   0   0   0]
 [  0 500   0   0]
 [  9 315 176   0]
 [  0 100   0 400]]


## XG Boost

In [9]:
final_model = xgb.XGBClassifier(
    alpha=10,
    reg_lambda=1,
    min_child_weight=10,
    colsample_bytree=0.8
)

final_model.fit(selected_features_train,extracted_train_target)
y_pred = final_model.predict(selected_features_test)
accuracy = accuracy_score(extracted_test_target, y_pred)
precision = precision_score(extracted_test_target, y_pred, average='weighted')
print(classification_report(extracted_test_target, y_pred))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(confusion_matrix(extracted_test_target, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      0.99      0.99       500
           2       1.00      0.99      1.00       500
           3       0.99      1.00      0.99       500

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Accuracy: 0.9955
Precision: 0.9955
[[499   0   1   0]
 [  0 495   0   5]
 [  2   0 497   1]
 [  0   0   0 500]]


# Feature Reduction

### Using RandomForest

In [10]:
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100),
    threshold="mean"
)
selector.fit(selected_features_train,extracted_train_target)
reduced_selected_features_train = selector.transform(selected_features_train)
reduced_selected_features_test = selector.transform(selected_features_test)

print(selected_features_train.shape)
print(reduced_selected_features_train.shape)

final_model.fit(reduced_selected_features_train,extracted_train_target)
y_pred = final_model.predict(reduced_selected_features_test)
accuracy = accuracy_score(extracted_test_target, y_pred)
precision = precision_score(extracted_test_target, y_pred, average='weighted')
print(classification_report(extracted_test_target, y_pred))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(confusion_matrix(extracted_test_target, y_pred))

  return fit_method(estimator, *args, **kwargs)


(2000, 2122)
(2000, 187)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      0.99      0.99       500
           2       1.00      0.99      0.99       500
           3       0.99      1.00      1.00       500

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Accuracy: 0.9955
Precision: 0.9955
[[499   0   1   0]
 [  0 496   0   4]
 [  2   1 496   1]
 [  0   0   0 500]]


### Using the XG Boost

In [11]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(selected_features_train,extracted_train_target)


selector = SelectFromModel(
    xgb,
    threshold="mean"
)
X_train_reduced = selector.fit_transform(selected_features_train,extracted_train_target)
X_test_reduced = selector.transform(selected_features_test)

print(selected_features_train.shape)
print(X_train_reduced.shape)

final_model.fit(X_train_reduced,extracted_train_target)
y_pred = final_model.predict(X_test_reduced)
accuracy = accuracy_score(extracted_test_target, y_pred)
precision = precision_score(extracted_test_target, y_pred, average='weighted')
print(classification_report(extracted_test_target, y_pred))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(confusion_matrix(extracted_test_target, y_pred))

(2000, 2122)
(2000, 76)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      0.99      0.99       500
           2       0.99      0.99      0.99       500
           3       0.99      1.00      0.99       500

    accuracy                           0.99      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      0.99      1.00      2000

Accuracy: 0.9950
Precision: 0.9950
[[499   0   1   0]
 [  0 495   1   4]
 [  2   0 497   1]
 [  0   0   1 499]]


# Further Feature Reduction

## Reducton to 50 features

In [16]:
xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(selected_features_train,extracted_train_target)


selector = SelectFromModel(
    xgb,
    threshold="mean",
    max_features=50
)
X_train_reduced = selector.fit_transform(selected_features_train,extracted_train_target)
X_test_reduced = selector.transform(selected_features_test)

print(selected_features_train.shape)
print(X_train_reduced.shape)

final_model.fit(X_train_reduced,extracted_train_target)
y_pred = final_model.predict(X_test_reduced)
accuracy = accuracy_score(extracted_test_target, y_pred)
precision = precision_score(extracted_test_target, y_pred, average='weighted')
print(classification_report(extracted_test_target, y_pred))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(confusion_matrix(extracted_test_target, y_pred))

(2000, 2122)
(2000, 50)
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       500
           1       1.00      0.98      0.99       500
           2       0.99      0.99      0.99       500
           3       0.98      1.00      0.99       500

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

Accuracy: 0.9930
Precision: 0.9931
[[499   0   1   0]
 [  1 490   2   7]
 [  2   0 497   1]
 [  0   0   0 500]]


## Reducton to 1 feature

In [18]:
xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(selected_features_train,extracted_train_target)


selector = SelectFromModel(
    xgb,
    threshold="mean",
    max_features=1
)
X_train_reduced = selector.fit_transform(selected_features_train,extracted_train_target)
X_test_reduced = selector.transform(selected_features_test)

print(selected_features_train.shape)
print(X_train_reduced.shape)

final_model.fit(X_train_reduced,extracted_train_target)
y_pred = final_model.predict(X_test_reduced)
accuracy = accuracy_score(extracted_test_target, y_pred)
precision = precision_score(extracted_test_target, y_pred, average='weighted')
print(classification_report(extracted_test_target, y_pred))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(confusion_matrix(extracted_test_target, y_pred))

(2000, 2122)
(2000, 1)
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       500
           1       0.93      0.84      0.89       500
           2       0.89      0.96      0.92       500
           3       0.96      0.96      0.96       500

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000

Accuracy: 0.9395
Precision: 0.9403
[[498   0   1   1]
 [  0 421  61  18]
 [  9  11 479   1]
 [  0  19   0 481]]


In [19]:
pd.DataFrame(X_test_reduced)

Unnamed: 0,0
0,506.7549
1,492.7285
2,492.3093
3,479.3532
4,486.3565
...,...
1995,696.5408
1996,692.3252
1997,712.1299
1998,676.3045
