In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the dataset
df = pd.read_excel("AnomaData (1).xlsx", sheet_name=0)

# EDA - Data quality check
print(df.describe())

# Treat missing values, outliers, etc. if any
# For simplicity, let's assume data is clean

                  y            x1            x2            x3            x4  \
count  18398.000000  18398.000000  18398.000000  18398.000000  18398.000000   
mean       0.006740      0.011824      0.157986      0.569300     -9.958345   
std        0.081822      0.742875      4.939762      5.937178    131.033712   
min        0.000000     -3.787279    -17.316550    -18.198509   -322.781610   
25%        0.000000     -0.405681     -2.158235     -3.537054   -111.378372   
50%        0.000000      0.128245     -0.075505     -0.190683    -14.881585   
75%        0.000000      0.421222      2.319297      3.421223     92.199134   
max        1.000000      3.054156     16.742105     15.900116    334.694098   

                 x5            x6            x7            x8            x9  \
count  18398.000000  18398.000000  18398.000000  18398.000000  18398.000000   
mean       0.006518      2.387533      0.001647     -0.004125     -0.003056   
std        0.634054     37.104012      0.108870    

In [5]:
# Get the correct datatype for date
df['time'] = pd.to_datetime(df['time'])

In [6]:
# Feature Engineering and feature selection
X = df.drop(['time', 'y', 'y.1'], axis=1)
y = df['y']

In [7]:
# Train/Test Split - Apply a sampling distribution to find the best split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Model Selection, Training, Predicting and Assessment
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
# Choose the metrics for the model evaluation
accuracy = accuracy_score(y_test, y_pred)

In [11]:
import json
import pandas as pd
from sklearn.model_selection import GridSearchCV

In [13]:
# Hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)


In [14]:
# Perform model validation
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
result = f"Best parameters: {best_params}, Best model: {best_model}"
result

"Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}, Best model: RandomForestClassifier(max_depth=20)"

In [15]:
# Model deployment plan
result = f"Model accuracy: {accuracy}"
result

'Model accuracy: 0.9972826086956522'