In [1]:
!pip install hyperopt





## Importing the modules

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, auc, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, roc_curve, precision_recall_curve, balanced_accuracy_score, log_loss
from lightgbm import LGBMClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

In [3]:
# Load the data
features_train = pd.read_csv("../GSTIN dataset/Train_60/Train_60/Train_60/X_Train_Data_Input.csv")
labels_train = pd.read_csv("../GSTIN dataset/Train_60/Train_60/Train_60/Y_Train_Data_Target.csv")
features_test = pd.read_csv("../GSTIN dataset/Test_20/Test_20/Test_20/X_Test_Data_Input.csv")
labels_test = pd.read_csv("../GSTIN dataset/Test_20/Test_20/Test_20/Y_Test_Data_Target.csv")

In [4]:
# Combine features and labels
df_train = pd.concat([features_train, labels_train], axis=1)
df_test = pd.concat([features_test, labels_test], axis=1)

In [5]:
df_train.head()

Unnamed: 0,ID,Column0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,...,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Column21,ID.1,target
0,ad1a67e4cbddc767a3456b0d94299b9e,2.0,2495,3726.0,0.678139,0.701403,-0.007468,0.43419,-0.015603,0.606265,...,0.001351,0.00339,0.0,0,0.0,0,0,0,ad1a67e4cbddc767a3456b0d94299b9e,0
1,7246d2f76ac0c217ec25e72ea5f014cb,0.0,2495,3454.0,0.45258,0.701403,-0.007468,1.554998,-0.015574,0.329946,...,0.001351,0.00339,0.0,0,0.0,0,0,0,7246d2f76ac0c217ec25e72ea5f014cb,0
2,22ba388e7dd14c13342c49e75fc29dda,2.0,2495,4543.0,-1.577453,-1.42954,-0.007469,-0.407939,-0.015607,-0.774979,...,0.001351,0.00339,0.0,0,0.0,0,0,0,22ba388e7dd14c13342c49e75fc29dda,0
3,59f9b981472d97342587fb3e6392aeb1,0.0,211,59.0,,,,-0.407939,-0.015607,-0.774979,...,,0.00339,0.0,0,1.0,0,0,0,59f9b981472d97342587fb3e6392aeb1,1
4,f6317cf7ecf126859804eddff279aead,0.0,718,950.0,-2.028572,-1.855728,,-0.407939,-0.015607,-0.774979,...,,0.00339,0.0,0,0.0,0,0,0,f6317cf7ecf126859804eddff279aead,0


In [6]:
df_test.head()

Unnamed: 0,ID,Column0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,...,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Column21,ID.1,target
0,07cf2025382f6325b316e128b1b90999,1.0,1986,53.0,0.678139,0.701403,-0.007469,-0.407939,-0.015607,0.55486,...,0.001351,0.00339,0.0,0,0.0,0,0,0,07cf2025382f6325b316e128b1b90999,0
1,eb972eb3a1f8d0d1a13f45e7c07d37d4,2.0,1579,12.0,0.678139,0.701403,-0.007468,-0.407939,-0.015607,0.142149,...,0.001351,0.00339,0.0,0,0.0,0,0,0,eb972eb3a1f8d0d1a13f45e7c07d37d4,0
2,ee35e164b3ddc25a9f40243b81ad290d,0.0,898,3817.0,-2.028572,-1.855728,,-0.407939,-0.015607,-0.774979,...,,0.00339,0.0,0,0.0,0,0,0,ee35e164b3ddc25a9f40243b81ad290d,0
3,28229ccd7bad7dd83324a4175a7e0531,0.0,79,3449.0,-0.675216,-0.577162,-0.007469,0.00402,-0.015607,0.635264,...,0.001351,0.00339,0.0,0,0.0,0,0,0,28229ccd7bad7dd83324a4175a7e0531,0
4,2f94873da2c332d28f111742818e0fbb,1.0,646,6510.0,-2.028572,-1.855728,,-0.407939,-0.015434,-0.774979,...,,0.00339,0.0,0,0.0,0,0,0,2f94873da2c332d28f111742818e0fbb,0


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785133 entries, 0 to 785132
Data columns (total 25 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   ID        785133 non-null  object 
 1   Column0   785124 non-null  float64
 2   Column1   785133 non-null  int64  
 3   Column2   785133 non-null  float64
 4   Column3   658830 non-null  float64
 5   Column4   657423 non-null  float64
 6   Column5   617953 non-null  float64
 7   Column6   781283 non-null  float64
 8   Column7   785133 non-null  float64
 9   Column8   781283 non-null  float64
 10  Column9   52996 non-null   float64
 11  Column10  785133 non-null  int64  
 12  Column11  785133 non-null  int64  
 13  Column12  785133 non-null  int64  
 14  Column13  785133 non-null  int64  
 15  Column14  419430 non-null  float64
 16  Column15  768677 non-null  float64
 17  Column16  785133 non-null  float64
 18  Column17  785133 non-null  int64  
 19  Column18  785133 non-null  float64
 20  Colu

In [8]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261712 entries, 0 to 261711
Data columns (total 25 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   ID        261712 non-null  object 
 1   Column0   261710 non-null  float64
 2   Column1   261712 non-null  int64  
 3   Column2   261712 non-null  float64
 4   Column3   219478 non-null  float64
 5   Column4   219002 non-null  float64
 6   Column5   206053 non-null  float64
 7   Column6   260478 non-null  float64
 8   Column7   261712 non-null  float64
 9   Column8   260478 non-null  float64
 10  Column9   17859 non-null   float64
 11  Column10  261712 non-null  int64  
 12  Column11  261712 non-null  int64  
 13  Column12  261712 non-null  int64  
 14  Column13  261712 non-null  int64  
 15  Column14  140033 non-null  float64
 16  Column15  256227 non-null  float64
 17  Column16  261712 non-null  float64
 18  Column17  261712 non-null  int64  
 19  Column18  261712 non-null  float64
 20  Colu

In [9]:
df_train.shape

(785133, 25)

In [10]:
df_test.shape

(261712, 25)

## Data preprocessing

In [11]:
# Drop unnecessary columns
columns_to_drop = ["ID", "Column9"]
df_train.drop(columns=columns_to_drop, axis=1, inplace=True)
df_test.drop(columns=columns_to_drop, axis=1, inplace=True)

In [12]:
# Handle missing values
def handle_missing_values(df):
    imputers = {
        'Column0': SimpleImputer(strategy='mean'),
        'Column3': SimpleImputer(strategy='median'),
        'Column4': SimpleImputer(strategy='median'),
        'Column5': IterativeImputer(),
        'Column6': SimpleImputer(strategy='median'),
        'Column8': SimpleImputer(strategy='median'),
        'Column14': IterativeImputer(),
        'Column15': SimpleImputer(strategy='median')
    }
    
    for column, imputer in imputers.items():
        if column in df.columns:
            df[column] = imputer.fit_transform(df[[column]])
    
    return df

In [13]:
df_train = handle_missing_values(df_train)
df_test = handle_missing_values(df_test)

In [14]:
# Feature reduction
features_to_drop = ['Column10', 'Column11', 'Column12', 'Column13', 'Column15', 'Column16', 'Column19', 'Column20', 'Column21']
df_train.drop(columns=features_to_drop, axis=1, inplace=True)
df_test.drop(columns=features_to_drop, axis=1, inplace=True)

In [15]:
# Outlier detection and removal (only for training data)
def remove_outliers(df):
    z_scores = np.abs(stats.zscore(df.drop(columns=['target'])))
    threshold = 3
    df_clean = df[(z_scores < threshold).all(axis=1)]
    return df_clean

In [16]:
df_train_clean = remove_outliers(df_train)

In [17]:
# Separate features and target
X_train = df_train_clean.drop(columns=['target'])
y_train = df_train_clean['target']
X_test = df_test.drop(columns=['target'])
y_test = df_test['target']

In [18]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## Training the model

In [19]:
# Define the search space for Hyperopt
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 1),
    'min_child_samples': hp.quniform('min_child_samples', 10, 100, 1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(1e-8), np.log(1.0)),
    'reg_lambda': hp.loguniform('reg_lambda', np.log(1e-8), np.log(1.0))
}

In [20]:
# Objective function for Hyperopt
def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['min_child_samples'] = int(params['min_child_samples'])
    
    lgbm = LGBMClassifier(**params, random_state=42)
    lgbm.fit(X_train_scaled, y_train)
    y_pred = lgbm.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# Run the Hyperopt optimization
trials = Trials()
best = fmin(fn=objective, 
            space=space, 
            algo=tpe.suggest, 
            max_evals=50, 
            trials=trials)

  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]

[WinError 2] The system cannot find the file specified

  File "C:\Users\TSA\anaconda3\envs\tf\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(

  File "C:\Users\TSA\anaconda3\envs\tf\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:

  File "C:\Users\TSA\anaconda3\envs\tf\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,

  File "C:\Users\TSA\anaconda3\envs\tf\lib\subprocess.py", line 1436, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,



[LightGBM] [Info] Number of positive: 61092, number of negative: 644029                                                
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016142 seconds.                
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1899                                                                                      
[LightGBM] [Info] Number of data points in the train set: 705121, number of used features: 12                          
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086640 -> initscore=-2.355363                                        
[LightGBM] [Info] Start training from score -2.355363                                                                  
[LightGBM] [Info] Number of positive: 61092, number of negative: 644029                                                
[LightGBM] [Info] Auto-choosing row-wi

In [None]:
# Print the best parameters and accuracy
print(f"Best Hyperparameters: {best}")

In [None]:
# Train the final model with the best hyperparameters
best_params = {
    'n_estimators': int(best['n_estimators']),
    'learning_rate': best['learning_rate'],
    'max_depth': int(best['max_depth']),
    'num_leaves': int(best['num_leaves']),
    'min_child_samples': int(best['min_child_samples']),
    'subsample': best['subsample'],
    'colsample_bytree': best['colsample_bytree'],
    'reg_alpha': best['reg_alpha'],
    'reg_lambda': best['reg_lambda']
}

In [None]:
final_model = LGBMClassifier(**best_params, random_state=42)
final_model.fit(X_train_scaled, y_train)

In [None]:
# Make predictions on the test data
y_pred = final_model.predict(X_test_scaled)
y_prob = final_model.predict_proba(X_test_scaled)[:, 1]  # For ROC and Precision-Recall curves

## Evaluation metrics

In [None]:
# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
log_loss_value = log_loss(y_test, y_prob)

In [None]:
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"AUC-ROC: {roc_auc:.2f}")
print(f"Balanced Accuracy: {balanced_accuracy:.2f}")
print(f"Log Loss: {log_loss_value:.2f}")

In [None]:
# Confusion Matrix Visualization
ConfusionMatrixDisplay.from_estimator(final_model, X_test_scaled, y_test, cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_prob)
average_precision = average_precision_score(y_test, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(recall_vals, precision_vals, color='b', lw=2, label=f'AP = {average_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()

In [None]:
# Final Model Accuracy Plot
train_accuracy = accuracy_score(y_train, final_model.predict(X_train_scaled))
test_accuracy = accuracy_score(y_test, y_pred)

plt.figure(figsize=(6, 4))
plt.bar(['Train Accuracy', 'Test Accuracy'], [train_accuracy * 100, accuracy * 100], color=['green', 'blue'])
plt.title('Train vs Test Accuracy')
plt.ylabel('Accuracy (%)')
plt.show()