# First test within Behacom dataset.
## Description:
### 1. Use all features without PCA and without balancing.
### 2. Study correlation using Tableau.
### 3. Combine the files into a single dataframe.
### 4. Define features and target, split, and apply shuffling for good measure.
### 5. Train Gradient Boosting (boosting algorithm) regarding the complex patterns that may be in the data.
### 6. Use metrics compatible with imbalanced datasets (F1-score, recall, precision).
### 7. Test the model manually (not accurate, of course).
### 8. Test the model using cross-validation on the train data.
### 9. Test the model using cross-validation on the test data.
### 10. No need to fine-tune.
### 11. Use metrics compatible with imbalanced datasets (F1-score, recall, precision).


#laod the data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import pandas as pd
import glob

In [None]:
def combine_csv_files(root_dir):
    csv_files = glob.glob(root_dir + '/**/*.csv', recursive=True)
    # print(csv_files)
    dfs = [pd.read_csv(file, encoding='ISO-8859-1') for file in csv_files[:5]]
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

In [None]:
root_directory = '/content/drive/MyDrive/BEHACOM'
combined_data_frame = combine_csv_files(root_directory)

In [None]:
combined_data_frame.head()

Unnamed: 0,timestamp,keystroke_counter,erase_keys_counter,erase_keys_percentage,press_press_average_interval,press_press_stddev_interval,press_release_average_interval,press_release_stddev_interval,word_counter,word_average_length,...,current_app_stddev_cpu,system_average_cpu,system_stddev_cpu,current_app_average_mem,current_app_stddev_mem,system_average_mem,system_stddev_mem,received_bytes,sent_bytes,USER
0,1575903000000.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,10.03,5.29,1.5,0.0,17.84,0.01,16583.0,16833.0,3
1,1575903000000.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.58,9.3,4.87,0.94,0.37,16.31,1.93,32693.0,27653.0,3
2,1575903000000.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,8.1,0.0,0.76,0.0,6.16,0.0,33846.0,81040.0,3
3,1575903000000.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.55,3.59,2.8,0.43,0.0,4.65,0.0,163729.0,119197.0,3
4,1575903000000.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,16.3,17.82,0.43,0.0,4.65,0.0,170175.0,173653.0,3


In [None]:
combined_data_frame.shape

(22218, 12051)

In [None]:
combined_data_frame.dtypes

timestamp                       float64
keystroke_counter                 int64
erase_keys_counter                int64
erase_keys_percentage           float64
press_press_average_interval    float64
                                 ...   
system_average_mem              float64
system_stddev_mem               float64
received_bytes                  float64
sent_bytes                      float64
USER                              int64
Length: 12051, dtype: object

In [None]:
combined_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22218 entries, 0 to 22217
Columns: 12051 entries, timestamp to USER
dtypes: float64(6023), int64(6026), object(2)
memory usage: 2.0+ GB


In [None]:
combined_data_frame.columns

Index(['timestamp', 'keystroke_counter', 'erase_keys_counter',
       'erase_keys_percentage', 'press_press_average_interval',
       'press_press_stddev_interval', 'press_release_average_interval',
       'press_release_stddev_interval', 'word_counter', 'word_average_length',
       ...
       'current_app_stddev_cpu', 'system_average_cpu', 'system_stddev_cpu',
       'current_app_average_mem', 'current_app_stddev_mem',
       'system_average_mem', 'system_stddev_mem', 'received_bytes',
       'sent_bytes', 'USER'],
      dtype='object', length=12051)

In [None]:
combined_data_frame.to_csv('combined_data_frame.csv', index=False, encoding='utf-8')

In [None]:
from sklearn.model_selection import train_test_split

combined_data_frame=pd.read_csv('/content/drive/MyDrive/combined_data_frame.csv')

X = combined_data_frame.drop('USER', axis=1)  # Features
y = combined_data_frame['USER']  # Target variable

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)



In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)


In [None]:
# Train the model
gb_model.fit(X_train, y_train)

In [None]:
import joblib
joblib.dump(gb_model, 'gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']

In [None]:
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9921242124212422


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall:", recall)

# F1 Score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)


Precision: 0.9921359667122669
Recall: 0.9921242124212422
F1 Score: 0.9920721732899684


In [None]:
# Select a single sample from the test dataset
sample_index = 150
single_sample = X_test.iloc[[sample_index]]

prediction = gb_model.predict(single_sample)

print("Actual Label:", y_test.iloc[sample_index])
print("Predicted Label:", prediction[0])


Actual Label: 3
Predicted Label: 3


In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier
num_folds = 3
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=kfold, scoring='accuracy')
print("Cross-validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


Cross-validation Scores: [0.99037975 0.99308017 0.99189737]
Mean Accuracy: 0.9917857607519912
Standard Deviation: 0.0011052636400837093


In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier
num_folds = 3
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
cv_scores = cross_val_score(gb_model, X_test, y_test, cv=kfold, scoring='accuracy')
print("Cross-validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


Cross-validation Scores: [0.9831309  0.9790682  0.98582039]
Mean Accuracy: 0.9826731643249643
Standard Deviation: 0.002775509184613724


In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[  33    0    0    0    0]
 [   0  598    0    0   27]
 [   0    0  391    0    1]
 [   0    2    0  256    0]
 [   0    2    2    1 3131]]


#End simple test-1-(5 users test).

#first feed back:the data so clean.