In [1]:
import os
import re
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [69]:
# Location of the raw system-log CSV. The original local path remains the
# default; set the SYSTEM_LOGS_CSV environment variable to run the notebook on
# another machine without editing this cell.
SYSTEM_LOGS_CSV = os.environ.get(
    'SYSTEM_LOGS_CSV',
    r'C:\Users\Akash tiwari\log prediction model\Logs\system_logs.csv',
)
df = pd.read_csv(SYSTEM_LOGS_CSV)

# Show how many rows carry each value of the raw 'server-up' status column.
print(df['server-up'].value_counts())

server-up
2    1121680
1     172841
Name: count, dtype: int64


In [70]:
# Preview the first rows of the loaded frame to sanity-check the columns and values.
df.head()

Unnamed: 0,timestamp,load-1m,load-5m,load-15m,sys-mem-swap-total,sys-mem-swap-free,sys-mem-free,sys-mem-cache,sys-mem-buffered,sys-mem-available,...,disk-io-time,disk-bytes-read,disk-bytes-written,disk-io-read,disk-io-write,cpu-iowait,cpu-system,cpu-user,server-up,source_file
0,0,0.22,0.18,0.18,16953372672,16953372672,2071302144,10307330048,1937584128,15328792576,...,0.0076,0.0,16588.8,0.0,2.1,0.0095,0.024,0.0405,2,system-1.csv
1,30,0.26,0.19,0.18,16953372672,16953372672,2072969216,10307371008,1937584128,15330496512,...,0.006,0.0,14745.6,0.0,1.5,0.0125,0.027,0.0475,2,system-1.csv
2,60,0.16,0.17,0.18,16953372672,16953372672,2071818240,10307375104,1937584128,15329353728,...,0.0134,0.0,15564.8,0.0,1.65,0.0135,0.0255,0.026,2,system-1.csv
3,90,0.16,0.17,0.18,16953372672,16953372672,2071830528,10307379200,1937584128,15329370112,...,0.0078,0.0,16179.2,0.0,1.8,0.016,0.0305,0.0315,2,system-1.csv
4,120,0.1,0.15,0.17,16953372672,16953372672,2071859200,10307379200,1937584128,15329398784,...,0.0066,0.0,13721.6,0.0,1.7,0.008,0.0225,0.027,2,system-1.csv


In [71]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294521 entries, 0 to 1294520
Data columns (total 25 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   timestamp                1294521 non-null  int64  
 1   load-1m                  1294521 non-null  float64
 2   load-5m                  1294521 non-null  float64
 3   load-15m                 1294521 non-null  float64
 4   sys-mem-swap-total       1294521 non-null  int64  
 5   sys-mem-swap-free        1294521 non-null  int64  
 6   sys-mem-free             1294521 non-null  int64  
 7   sys-mem-cache            1294521 non-null  int64  
 8   sys-mem-buffered         1294521 non-null  int64  
 9   sys-mem-available        1294521 non-null  int64  
 10  sys-mem-total            1294521 non-null  int64  
 11  sys-fork-rate            1294521 non-null  float64
 12  sys-interrupt-rate       1294521 non-null  float64
 13  sys-context-switch-rate  1294521 non-null 

In [72]:
# Discard the two columns that are not carried forward into modelling.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# raises KeyError because the columns are already gone.
unused_columns = ['source_file', 'sys-thermal']
df = df.drop(unused_columns, axis=1)

In [73]:
# Binary target: 1 where 'server-up' equals 1, otherwise 0.
# Vectorized comparison instead of a per-row Python lambda; the result is the
# same int64 0/1 column, computed without calling Python once per row.
df['is_error'] = df['server-up'].eq(1).astype('int64')

In [74]:
# Target vector: the binary is_error label.
y = df['is_error']

# Feature matrix: every remaining column except the timestamp, the raw status
# column the label was derived from, and the label itself.
non_feature_columns = ['timestamp', 'server-up', 'is_error']
X = df.drop(non_feature_columns, axis=1)

In [75]:
# Train-test split: hold out 20% of the rows for evaluation. Stratifying on y
# keeps the is_error class ratio the same in both partitions; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [76]:
# Define preprocessing: every column of X is passed through StandardScaler.
numeric_features = list(X.columns)
scaling_step = ('num', StandardScaler(), numeric_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

In [77]:
# ========== 1️⃣ SMOTE-Based Model ==========
print("\n🔁 Training SMOTE-based Model...")

# Oversampler for the minority class; seeded so the synthetic rows are repeatable.
smote = SMOTE(random_state=42)

# Classifier settings kept in one place for readability.
# NOTE(review): the training log reports equal positive/negative counts after
# SMOTE, so class_weight='balanced' presumably adds nothing on top of the
# resampling — confirm before removing it.
lgbm_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42,
)

# imblearn's Pipeline is used (rather than sklearn's) because it accepts a
# sampler step: scale -> oversample -> classify.
model_smote = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', LGBMClassifier(**lgbm_params)),
])

# Fit on the training partition, then predict labels for the held-out rows.
y_pred_smote = model_smote.fit(X_train, y_train).predict(X_test)


🔁 Training SMOTE-based Model...
[LightGBM] [Info] Number of positive: 897343, number of negative: 897343
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.142172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5005
[LightGBM] [Info] Number of data points in the train set: 1794686, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [78]:
# Evaluate the SMOTE pipeline's predictions against the held-out labels:
# per-class precision/recall/F1 first, then the raw confusion matrix.
print("\n📊 SMOTE Model Report:")
smote_report = classification_report(y_test, y_pred_smote)
print(smote_report)

smote_confusion = confusion_matrix(y_test, y_pred_smote)
print("Confusion Matrix:\n", smote_confusion)


📊 SMOTE Model Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    224337
           1       1.00      1.00      1.00     34568

    accuracy                           1.00    258905
   macro avg       1.00      1.00      1.00    258905
weighted avg       1.00      1.00      1.00    258905

Confusion Matrix:
 [[224337      0]
 [     2  34566]]


In [79]:
# Save the fitted pipeline (scaler + SMOTE step + classifier) to disk.
# The original local path remains the default; set the MODEL_SMOTE_PATH
# environment variable to write somewhere else without editing this cell.
# NOTE: joblib files are pickle-based — only load them from trusted sources.
MODEL_SMOTE_PATH = os.environ.get(
    'MODEL_SMOTE_PATH',
    r'C:\Users\Akash tiwari\log prediction model\model_smote.pkl',
)
joblib.dump(model_smote, MODEL_SMOTE_PATH)

['C:\\Users\\Akash tiwari\\log prediction model\\model_smote.pkl']