In [None]:
import pandas as pd
import re

# Load the query dataset from CSV into a DataFrame.
# The code below reads a 'Query' column and, when present, a 'Label' column.
# NOTE(review): the path is a hard-coded absolute location; adjust it when
# running in a different environment.
df = pd.read_csv('/content/ds.csv')

# Define rule-based detection function
def is_malicious(query):
    """Flag a query as malicious when it matches any SQL-injection signature.

    The input is coerced with ``str()`` before matching, so non-string
    values (for example the float NaN that pandas yields for an empty CSV
    field, or ``None``) are classified instead of raising
    ``AttributeError`` on ``.lower()``.

    Args:
        query: The query text to inspect. Any object is accepted and is
            converted to its string form first.

    Returns:
        int: 1 if at least one signature is found in the lower-cased,
        stripped text, otherwise 0.
    """
    # Normalize: str() guards against non-string cells; lower() lets the
    # lowercase signatures below match regardless of the input's casing.
    q = str(query).lower().strip()

    # Suspicious patterns. re.search is used, so each one may match anywhere
    # in the text -- the bare ";", "--" and "#" entries in particular will
    # flag any string that merely contains those characters.
    patterns = [
        r"or\s+1\s*=\s*1",                     # Tautology
        r"and\s+1\s*=\s*1",
        r"union\s+select",                    # Union attacks
        r"select\s+.*\s+from\s+.*",           # Generic select
        r";",                                 # Piggybacked query
        r"--",                                # SQL comment
        r"#",                                 # SQL comment
        r"sleep\s*\(",                        # Time delay
        r"benchmark\s*\(",                    # Time-based blind
        r"load_file\s*\(",                    # File access
        r"into\s+outfile",                    # File write
        r"xp_cmdshell",                       # Remote command
        r"waitfor\s+delay",                   # SQL Server time delay
        r"exec\s+",                           # Execute
        r"information_schema",                # DB discovery
        r"@@version",                         # Environment info
    ]

    # any() short-circuits on the first matching signature.
    return int(any(re.search(pattern, q) for pattern in patterns))

# Score every query with the rule-based classifier
df['Predicted_Label'] = df['Query'].apply(is_malicious)

# Without ground-truth labels there is nothing to score against, so only
# preview a few predictions; otherwise print per-class metrics.
if 'Label' not in df.columns:
    print(df[['Query', 'Predicted_Label']].head())
else:
    from sklearn.metrics import classification_report
    print(classification_report(df['Label'], df['Predicted_Label']))


              precision    recall  f1-score   support

           0       0.76      0.47      0.58     19537
           1       0.45      0.74      0.56     11382

    accuracy                           0.57     30919
   macro avg       0.60      0.61      0.57     30919
weighted avg       0.64      0.57      0.57     30919



**Dataset 1**

In [None]:
import pandas as pd
import re
from sklearn.metrics import accuracy_score, classification_report

# Load the labelled query dataset from CSV into a DataFrame.
# The code below reads its 'Query' and 'Label' columns.
# NOTE(review): the path is a hard-coded absolute location; adjust it when
# running in a different environment.
df = pd.read_csv('/content/ds.csv')

# Rule-based detection function
def is_malicious(query):
    """Return 1 when the query matches any injection signature, else 0.

    The argument is converted with ``str()``, lower-cased and stripped
    before the signatures are searched for anywhere in the text.
    """
    normalized = str(query).lower().strip()

    signatures = (
        # Tautologies
        r"or\s+1\s*=\s*1",
        r"and\s+1\s*=\s*1",
        # UNION-based and generic SELECT
        r"union\s+select",
        r"select\s+.*\s+from\s+.*",
        # Statement terminator and comment markers
        r";",
        r"--",
        r"#",
        # Time-based probes
        r"sleep\s*\(",
        r"benchmark\s*\(",
        # File access
        r"load_file\s*\(",
        r"into\s+outfile",
        # Command execution
        r"xp_cmdshell",
        r"waitfor\s+delay",
        r"exec\s+",
        # Schema / environment discovery
        r"information_schema",
        r"@@version",
    )

    # Stop at the first signature that is found.
    for signature in signatures:
        if re.search(signature, normalized):
            return 1
    return 0

# Run the rule-based detector over every query
df['Predicted_Label'] = df['Query'].apply(is_malicious)

# Cast the ground-truth labels to int before comparing with the predictions
df['Label'] = df['Label'].astype(int)

# Overall accuracy of the rules against the ground truth
accuracy = accuracy_score(y_true=df['Label'], y_pred=df['Predicted_Label'])
print(f"Test Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 breakdown
print("\nClassification Report:")
print(classification_report(y_true=df['Label'], y_pred=df['Predicted_Label']))


Test Accuracy: 0.5694

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.47      0.58     19537
           1       0.45      0.74      0.56     11382

    accuracy                           0.57     30919
   macro avg       0.60      0.61      0.57     30919
weighted avg       0.64      0.57      0.57     30919



# Dataset 2 (largest)

In [None]:
import pandas as pd
import re
from sklearn.metrics import accuracy_score, classification_report

# Load the labelled query dataset from CSV into a DataFrame.
# The code below reads its 'Query' and 'Label' columns.
# NOTE(review): the path is a hard-coded absolute location; adjust it when
# running in a different environment.
df = pd.read_csv('/content/big.csv')

# Rule-based detection function
def is_malicious(query):
    """Classify a query as malicious (1) or benign (0) using regex rules.

    The value is stringified, lower-cased and stripped; it is malicious as
    soon as any one of the rules is found somewhere in that text.
    """
    text = str(query).lower().strip()
    rules = [
        r"or\s+1\s*=\s*1", r"and\s+1\s*=\s*1",          # tautologies
        r"union\s+select", r"select\s+.*\s+from\s+.*",  # select-based
        r";", r"--", r"#",                              # terminator / comments
        r"sleep\s*\(", r"benchmark\s*\(",               # time-based
        r"load_file\s*\(", r"into\s+outfile",           # file read / write
        r"xp_cmdshell", r"waitfor\s+delay", r"exec\s+", # execution
        r"information_schema", r"@@version",            # discovery
    ]
    # The generator yields 1 at the first hit; the default 0 covers no hits.
    return next((1 for rule in rules if re.search(rule, text)), 0)

# Label each query with the rule-based prediction
df['Predicted_Label'] = df['Query'].apply(is_malicious)

# Ground-truth labels are cast to int before scoring
df['Label'] = df['Label'].astype(int)

# Fraction of queries whose prediction equals the ground-truth label
accuracy = accuracy_score(y_true=df['Label'], y_pred=df['Predicted_Label'])
print(f"Test Accuracy: {accuracy:.4f}")

# Detailed per-class metrics
print("\nClassification Report:")
print(classification_report(y_true=df['Label'], y_pred=df['Predicted_Label']))


Test Accuracy: 0.7953

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.78      0.78     70576
           1       0.80      0.81      0.81     77750

    accuracy                           0.80    148326
   macro avg       0.79      0.79      0.79    148326
weighted avg       0.80      0.80      0.80    148326

