In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='final_direct_train.csv')

In [3]:
# Show how many training rows carry each label value.
df.loc[:, 'label'].value_counts()

label
2    235251
5    162839
0     69216
1     30780
4     12968
3      5016
Name: count, dtype: int64

In [4]:
# Remove every training row whose label equals 4, printing the frame's shape
# before and after the filter so the number of dropped rows is visible.
print(df.shape)
df = df.loc[df['label'].ne(4)]
print(df.shape)

(516070, 25)
(503102, 25)


In [5]:
from sklearn.cluster import KMeans

# Fit an 8-cluster KMeans on the (lon, lat) columns of the training frame and
# store each row's cluster id in a new 'location_cluster' column.
kmeans = KMeans(random_state=42, n_clusters=8)
df['location_cluster'] = kmeans.fit(df[['lon', 'lat']]).labels_



In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
import numpy as np
import copy

# Fraction of the current feature list (ranked by importance) that is carried
# over to the next fold. 0.85 means the least important 15% are dropped.
KEEP_FRACTION = 0.85

# Convert 'create_dt' to datetime if not already in datetime format
df['create_dt'] = pd.to_datetime(df['create_dt'])

# Sort chronologically so the positional folds produced by TimeSeriesSplit
# follow time order.
df = df.sort_values(by='create_dt')

# Define features and labels. 'create_dt' is used only for ordering above and
# is NOT part of the feature matrix.
features = df.drop(columns=["label", "create_dt"])
labels = df["label"]

# Hyper-parameters carried over from an earlier search (not shown in this notebook).
best_params = {
    'bootstrap': False,
    'class_weight': "balanced",
    'criterion': 'entropy',
    'max_depth': 10,
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 500
}

# n_jobs=-1 lets scikit-learn build and evaluate the 500 trees on all cores.
# With a fixed random_state this is intended to leave the fitted trees unchanged.
rf = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)

# TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Track the best weighted-F1 seen so far, the feature list that produced it,
# and a snapshot of the model fitted on that fold.
best_f1 = 0
best_feature_set = list(features.columns)  # Start with all features
current_feature_set = list(features.columns)
best_rf = None

# NOTE(review): each fold uses both a different train/test window AND a
# different (shrinking) feature list, so the per-fold F1 scores are not a
# like-for-like comparison of feature sets. `best_rf` is also the model fitted
# on that single fold's training window, not on all rows - consider refitting
# on the full data with `best_feature_set` before predicting. TODO confirm intent.
for train_index, test_index in tqdm(tscv.split(features), total=tscv.get_n_splits(), desc="Training Progress"):
    # Slice the currently selected columns once, then take the fold's rows.
    fold_features = features[current_feature_set]
    X_train, X_test = fold_features.iloc[train_index], fold_features.iloc[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    # Train the model on the currently selected features
    rf.fit(X_train, y_train)

    # Predict and evaluate the weighted F1 score on the fold's test window
    y_pred = rf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"F1 score: {f1:.4f}")

    # Keep a snapshot of the best fold. The `best_rf is None` guard guarantees
    # a model is always retained, even if every fold scores 0.
    if best_rf is None or f1 > best_f1:
        best_f1 = f1
        best_feature_set = current_feature_set.copy()
        best_rf = copy.deepcopy(rf)  # deepcopy: `rf` is refitted in the next fold

    # Rank the current features by importance, most important first
    feature_importances = rf.feature_importances_
    sorted_idx = np.argsort(feature_importances)[::-1]

    # Keep the top KEEP_FRACTION (85%) of the current features for the next fold
    num_features_to_keep = int(len(current_feature_set) * KEEP_FRACTION)
    current_feature_set = [current_feature_set[i] for i in sorted_idx[:num_features_to_keep]]

    print(f"Current feature set after update: {current_feature_set}")

print("Best F1 score:", best_f1)
print("Best feature set:", best_feature_set)


Training Progress:  20%|██        | 1/5 [00:54<03:38, 54.57s/it]

F1 score: 0.6933
Current feature set after update: ['speed_gps', 'lon', 'lat', 'direction', 'alt', 'accel_angular_nn', 'location_cluster', 'accel_vertical_nn', 'accel_forward_nn', 'accel_braking_nn', 'hour_cos', 'hour', 'hour_sin', 'day_of_week_sin', 'day', 'minute', 'day_of_week', 'day_of_week_cos', 'month', 'mdm_object_name']


Training Progress:  40%|████      | 2/5 [02:58<04:45, 95.10s/it]

F1 score: 0.7463
Current feature set after update: ['speed_gps', 'lon', 'lat', 'direction', 'alt', 'accel_angular_nn', 'accel_vertical_nn', 'location_cluster', 'accel_forward_nn', 'accel_braking_nn', 'day', 'day_of_week', 'day_of_week_cos', 'hour_cos', 'hour', 'mdm_object_name', 'day_of_week_sin']


Training Progress:  60%|██████    | 3/5 [06:29<04:56, 148.29s/it]

F1 score: 0.8355
Current feature set after update: ['speed_gps', 'lon', 'lat', 'direction', 'alt', 'accel_angular_nn', 'accel_vertical_nn', 'location_cluster', 'accel_forward_nn', 'accel_braking_nn', 'day', 'day_of_week', 'mdm_object_name', 'hour_cos']


Training Progress:  80%|████████  | 4/5 [10:43<03:10, 190.15s/it]

F1 score: 0.7668
Current feature set after update: ['speed_gps', 'lon', 'lat', 'direction', 'alt', 'accel_angular_nn', 'location_cluster', 'accel_vertical_nn', 'accel_braking_nn', 'accel_forward_nn', 'day']


Training Progress: 100%|██████████| 5/5 [16:30<00:00, 198.05s/it]

F1 score: 0.8043
Current feature set after update: ['speed_gps', 'lon', 'lat', 'alt', 'direction', 'accel_angular_nn', 'accel_vertical_nn', 'location_cluster', 'accel_braking_nn']
Best F1 score: 0.8355459197882839
Best feature set: ['speed_gps', 'lon', 'lat', 'direction', 'alt', 'accel_angular_nn', 'accel_vertical_nn', 'location_cluster', 'accel_forward_nn', 'accel_braking_nn', 'day', 'day_of_week', 'day_of_week_cos', 'hour_cos', 'hour', 'mdm_object_name', 'day_of_week_sin']





In [16]:
# Display the feature list that the training loop recorded for its best-scoring fold.
best_feature_set

['speed_gps',
 'lon',
 'lat',
 'direction',
 'alt',
 'accel_angular_nn',
 'accel_vertical_nn',
 'location_cluster',
 'accel_forward_nn',
 'accel_braking_nn',
 'day',
 'day_of_week',
 'day_of_week_cos',
 'hour_cos',
 'hour',
 'mdm_object_name',
 'day_of_week_sin']

In [21]:
# Disabled manual override of `best_feature_set` with 11 hand-picked columns.
# NOTE(review): the DataFrame output saved further down in this notebook shows
# exactly these 11 columns in this order, so it was presumably produced while
# this override was active - confirm which feature list the submitted model
# actually used, then either delete this cell or re-enable it deliberately.
# best_feature_set = ['speed_gps',
#  'direction',
#  'lon',
#  'lat',
#  'alt',
#  'accel_angular_nn',
#  'location_cluster',
#  'accel_vertical_nn',
#  'accel_braking_nn',
#  'accel_forward_nn',
#  'day_of_week']

In [19]:
# Narrow the training frame to the selected feature columns.
# NOTE(review): this rebinds `df` and discards 'label' and 'create_dt', so the
# training cell cannot be re-run afterwards without reloading the data.
df = df.loc[:, best_feature_set]

In [20]:
# Display the training frame after it was narrowed to the selected feature columns.
# NOTE(review): the saved output lists 11 feature columns, while the
# `best_feature_set` shown earlier in this notebook has 17 entries, so this
# output appears to come from a different run - re-run the notebook top to
# bottom to refresh it.
df

Unnamed: 0,speed_gps,direction,lon,lat,alt,accel_angular_nn,location_cluster,accel_vertical_nn,accel_braking_nn,accel_forward_nn,day_of_week
278478,0.0,326.0,0.005753,0.004778,-50,0.000,2,0.00,0.000,0.000,2
278479,0.0,326.0,0.005753,0.004778,-50,0.000,2,0.00,0.000,0.000,2
278480,0.0,326.0,0.005753,0.004778,-50,0.000,2,0.00,0.000,0.000,2
278481,0.0,326.0,0.005753,0.004778,-50,0.000,2,0.00,0.000,0.000,2
278482,0.0,326.0,0.005753,0.004778,-50,0.000,2,0.00,0.000,0.000,2
...,...,...,...,...,...,...,...,...,...,...,...
278122,21.3,110.0,0.013819,0.002965,-51,0.098,5,0.98,0.000,0.000,4
278123,14.2,122.0,0.013979,0.002917,-51,0.000,5,0.00,0.686,0.000,4
278124,17.1,135.0,0.014139,0.002829,-51,0.000,5,0.00,0.784,0.000,4
278125,18.6,154.0,0.014171,0.002785,-50,0.294,5,0.98,0.000,0.392,4


In [17]:
# Load the validation table from the CSV in the working directory.
valid = pd.read_csv(filepath_or_buffer='final_direct_valid.csv')

In [18]:
# Parse the validation timestamps into pandas datetimes.
valid['create_dt'] = valid['create_dt'].pipe(pd.to_datetime)


In [19]:
# Assign each validation row to one of the location clusters learned on the
# TRAINING data by reusing the already-fitted `kmeans` model.
# Fitting a fresh KMeans on the validation coordinates (as this cell did
# before) yields clusters whose numbering and boundaries are unrelated to the
# 'location_cluster' values the random forest was trained on, so the feature
# would carry a different meaning at prediction time.
valid['location_cluster'] = kmeans.predict(valid[['lon', 'lat']])



In [20]:
# Take the model's feature columns from the validation frame, in the order
# stored in `best_feature_set`, and predict a label for every row with the
# saved best model.
X_valid = valid.loc[:, best_feature_set]
y_valid_pred = best_rf.predict(X=X_valid)

In [21]:
# Step 3: Prepare the submission DataFrame - the two key columns plus the
# predicted label stored as 'operation_kind_id'.
submission_df = valid[['create_dt', 'mdm_object_name']].copy()
submission_df['operation_kind_id'] = y_valid_pred

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")

# Step 4: Merge the predictions onto the original validation telemetry file.
sub1 = pd.read_csv('dataset/telemetry_for_operations_validation.csv')
# Re-reading the file just written turns 'create_dt' back into plain strings,
# the same dtype read_csv gives `sub1`, so the merge keys are comparable.
# NOTE(review): this relies on both files rendering timestamps in the same
# text format - verify, otherwise no keys match and every row falls back to 0.
sub = pd.read_csv('submission.csv')

# Left merge on ('create_dt', 'mdm_object_name') keeps every row of the
# original telemetry file, with NaN where no prediction exists.
final_sub = pd.merge(sub1, sub[['create_dt', 'mdm_object_name', 'operation_kind_id']],
                     on=['create_dt', 'mdm_object_name'],
                     how='left')

# Report how many rows received no prediction instead of silently defaulting them.
n_unmatched = int(final_sub['operation_kind_id'].isna().sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} of {len(final_sub)} rows had no prediction and were set to 0.")

# Fill missing predictions with 0 and store the column as integers.
# Assigning the result back (rather than calling fillna(..., inplace=True) on
# the selected column) avoids the chained in-place update, which warns on
# pandas 2.x and has no effect under Copy-on-Write.
final_sub['operation_kind_id'] = final_sub['operation_kind_id'].fillna(0).astype(int)

# Save the final result as a CSV file for submission
final_sub.to_csv('final_submission.csv', index=False)

print("Final submission saved as 'final_submission.csv'")

Submission file 'submission.csv' created successfully.
Final submission saved as 'final_submission.csv'
