In [68]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier

In [69]:
# Load the raw system-metrics log export into a DataFrame.
logs_csv_path = r'C:\Users\Akash tiwari\log prediction model\Logs\system_logs.csv'
df = pd.read_csv(logs_csv_path)

In [70]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,timestamp,load-1m,load-5m,load-15m,sys-mem-swap-total,sys-mem-swap-free,sys-mem-free,sys-mem-cache,sys-mem-buffered,sys-mem-available,...,disk-io-time,disk-bytes-read,disk-bytes-written,disk-io-read,disk-io-write,cpu-iowait,cpu-system,cpu-user,server-up,source_file
0,0,0.22,0.18,0.18,16953372672,16953372672,2071302144,10307330048,1937584128,15328792576,...,0.0076,0.0,16588.8,0.0,2.1,0.0095,0.024,0.0405,2,system-1.csv
1,30,0.26,0.19,0.18,16953372672,16953372672,2072969216,10307371008,1937584128,15330496512,...,0.006,0.0,14745.6,0.0,1.5,0.0125,0.027,0.0475,2,system-1.csv
2,60,0.16,0.17,0.18,16953372672,16953372672,2071818240,10307375104,1937584128,15329353728,...,0.0134,0.0,15564.8,0.0,1.65,0.0135,0.0255,0.026,2,system-1.csv
3,90,0.16,0.17,0.18,16953372672,16953372672,2071830528,10307379200,1937584128,15329370112,...,0.0078,0.0,16179.2,0.0,1.8,0.016,0.0305,0.0315,2,system-1.csv
4,120,0.1,0.15,0.17,16953372672,16953372672,2071859200,10307379200,1937584128,15329398784,...,0.0066,0.0,13721.6,0.0,1.7,0.008,0.0225,0.027,2,system-1.csv


In [71]:
# Remove columns that are not used for modelling.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError for columns
# that were already removed by the first run.
df = df.drop(columns=['source_file', 'sys-thermal'], errors='ignore')

In [72]:
# Preview the first five rows again after the column drop.
df.head(5)

Unnamed: 0,timestamp,load-1m,load-5m,load-15m,sys-mem-swap-total,sys-mem-swap-free,sys-mem-free,sys-mem-cache,sys-mem-buffered,sys-mem-available,...,sys-context-switch-rate,disk-io-time,disk-bytes-read,disk-bytes-written,disk-io-read,disk-io-write,cpu-iowait,cpu-system,cpu-user,server-up
0,0,0.22,0.18,0.18,16953372672,16953372672,2071302144,10307330048,1937584128,15328792576,...,2217.2,0.0076,0.0,16588.8,0.0,2.1,0.0095,0.024,0.0405,2
1,30,0.26,0.19,0.18,16953372672,16953372672,2072969216,10307371008,1937584128,15330496512,...,2693.7,0.006,0.0,14745.6,0.0,1.5,0.0125,0.027,0.0475,2
2,60,0.16,0.17,0.18,16953372672,16953372672,2071818240,10307375104,1937584128,15329353728,...,2275.35,0.0134,0.0,15564.8,0.0,1.65,0.0135,0.0255,0.026,2
3,90,0.16,0.17,0.18,16953372672,16953372672,2071830528,10307379200,1937584128,15329370112,...,2302.55,0.0078,0.0,16179.2,0.0,1.8,0.016,0.0305,0.0315,2
4,120,0.1,0.15,0.17,16953372672,16953372672,2071859200,10307379200,1937584128,15329398784,...,2180.8,0.0066,0.0,13721.6,0.0,1.7,0.008,0.0225,0.027,2


In [73]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [74]:
# Show how many rows fall under each `server-up` value.
server_up_counts = df['server-up'].value_counts()
print(server_up_counts)

server-up
2    1121680
1     172841
Name: count, dtype: int64


In [75]:
# Binary target: 1 where `server-up` equals 1, otherwise 0.
# A vectorized comparison replaces the per-row Python lambda, which is much
# slower on a frame of this size; the dtype is pinned to int64 so the column
# matches what the element-wise version produced on every platform.
df['is_error'] = (df['server-up'] == 1).astype('int64')


In [76]:
# Feature matrix: everything except the time index, the raw status column and
# the derived target.
X = df.drop(columns=['timestamp', 'server-up', 'is_error'])

# `Unnamed: 0` (visible in the df.head() output) appears to be the row counter
# that was written into the CSV. Like `timestamp` it only encodes row position,
# so leaving it in lets the model learn "where in the file" a row sits rather
# than anything about the metrics. errors='ignore' keeps the cell working if
# the CSV is ever loaded without that column.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Target vector.
y = df['is_error']

In [77]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [78]:
# Preprocessing: every column of X is passed through a StandardScaler.
numeric_features = list(X.columns)
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)],
)

In [82]:
# ========== 2️⃣ Class-Weighted Model ==========
print("\n🔁 Training Class-Weighted Model...")

# Compute class weights manually.
# The label set is taken from y_train (the same data the weights are computed
# on) and kept as a sorted NumPy array:
#   * compute_class_weight documents `classes` as an ndarray, and recent
#     scikit-learn releases reject a plain Python list;
#   * deriving the labels from the full `y` could name a class that is absent
#     from y_train, which compute_class_weight refuses.
classes = y_train.drop_duplicates().sort_values().to_numpy()
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

# Plain Python ints/floats as keys/values keep the mapping easy to inspect
# and to serialise together with the model.
class_weight_dict = dict(zip(classes.tolist(), weights.tolist()))

# Scaling and the classifier are wrapped in one pipeline so the scaler is
# fitted on the training split only and re-applied at prediction time.
model_weighted = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
        class_weight=class_weight_dict
    ))
])

model_weighted.fit(X_train, y_train)
y_pred_weighted = model_weighted.predict(X_test)


🔁 Training Class-Weighted Model...
[LightGBM] [Info] Number of positive: 138273, number of negative: 897343
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 1035616, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [83]:
# Evaluate the class-weighted model on the held-out split.
report_weighted = classification_report(y_test, y_pred_weighted)
cm_weighted = confusion_matrix(y_test, y_pred_weighted)

print("\n📊 Class Weight Model Report:")
print(report_weighted)
print("Confusion Matrix:\n", cm_weighted)


📊 Class Weight Model Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    224337
           1       1.00      1.00      1.00     34568

    accuracy                           1.00    258905
   macro avg       1.00      1.00      1.00    258905
weighted avg       1.00      1.00      1.00    258905

Confusion Matrix:
 [[224337      0]
 [     2  34566]]


In [84]:
# Persist the fitted class-weight pipeline (scaler + classifier) to disk.
model_output_path = r'C:\Users\Akash tiwari\log prediction model\model_class_weight.pkl'
joblib.dump(model_weighted, model_output_path)

['C:\\Users\\Akash tiwari\\log prediction model\\model_class_weight.pkl']