In [None]:
import pandas as pd
from geoip2.database import Reader

# Load the dataset
# NOTE(review): the paths in this cell are absolute and user-specific; point
# them at your own copies before running.
blacklisted_ips = "/Users/devvratsolanki/Downloads/bad_ip_dataset.txt"

# Parse IPs into a DataFrame: one address per line, blank lines skipped.
with open(blacklisted_ips, "r") as file:
    ips = [line.strip() for line in file if line.strip()]
df = pd.DataFrame(ips, columns=["IP"])

# Path of the GeoLite2 City database used for the lookups
geoip_db_path = '/Users/devvratsolanki/Downloads/GeoLite2-City_20241210/GeoLite2-City.mmdb'

# Extract features: one dict per IP, always with the same five keys so the
# resulting frame has a stable schema even when a lookup fails.
features = []

# The context manager closes the database on every exit path (including an
# interrupted run); previously the reader was only closed after a fully
# successful pass through the cell.
with Reader(geoip_db_path) as reader:
    for i, ip in enumerate(df["IP"]):
        try:
            response = reader.city(ip)
            features.append({
                "IP": ip,
                "Country": response.country.name if response.country else None,
                "City": response.city.name if response.city else None,
                "Latitude": response.location.latitude if response.location else None,
                "Longitude": response.location.longitude if response.location else None
            })
        except Exception as e:
            # Best effort: report the failure and keep a placeholder row so the
            # IP is not silently lost from the output.
            print(f"Error processing IP {ip}: {e}")
            features.append({"IP": ip, "Country": None, "City": None, "Latitude": None, "Longitude": None})

        # Optional: Print progress every 10 IPs
        if i % 10 == 0:
            print(f"Processed {i}/{len(df)} IPs...")

# Create a DataFrame from the features
feature_df = pd.DataFrame(features)

# Add labels
feature_df["Label"] = 1  # Since these are blacklisted, label them as malicious

# Save to CSV
output_path = "/Users/devvratsolanki/Downloads/processed_features.csv"
feature_df.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")


In [None]:
# Write the feature table to the notebook's working directory as well; the
# later read_csv calls load it from this relative path.
relative_csv = "processed_features.csv"
feature_df.to_csv(relative_csv, index=False)
print("Feature extraction completed. Data saved to 'processed_features.csv'.")


In [50]:
# Reload the saved feature table from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="processed_features.csv")


In [51]:
# Add the 'Label' column indicating malicious IPs.
# Every row comes from the blacklist file, so the label is the constant 1.
# NOTE(review): this relies on `feature_df` already existing in the kernel
# (it is built in the feature-extraction cell, which sets the same column).
feature_df["Label"] = 1  # Label all blacklisted IPs as malicious


In [52]:
from sklearn.model_selection import train_test_split

# Drop IP column for model training.
# X keeps the remaining feature columns (Country, City, Latitude, Longitude);
# y is the constant "malicious" label set on feature_df.
X = feature_df.drop(columns=["IP", "Label"])
y = feature_df["Label"]

# Handle missing values: every missing entry in X becomes 0, in the text
# columns (Country, City) as well as in the coordinates.
# NOTE(review): y holds a single class at this point, so any model fitted on
# this split cannot learn to separate malicious from benign addresses.
X.fillna(0, inplace=True)

# Split the data: 80% train / 20% test, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [53]:
# Drop unnecessary columns and split into features (X) and labels (y).
# NOTE(review): this cell repeats the drop / fillna / split steps of the
# preceding cell and overwrites the same variables.
X = feature_df.drop(columns=["IP", "Label"])
y = feature_df["Label"]

# Handle missing values: replace every missing entry in X with 0.
X.fillna(0, inplace=True)

# Split the data: 80% train / 20% test with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [54]:
# Fill any remaining missing values in X with 0 (in place), then let pandas
# pick more specific dtypes for columns currently stored as `object`.
X.fillna(0, inplace=True)
X = X.infer_objects()  # Infer better data types for object columns


In [55]:
# Set the pandas option 'future.no_silent_downcasting' for this session.
# NOTE(review): presumably added to silence the downcasting FutureWarning
# raised by fillna on object columns; confirm against the pandas version used.
pd.set_option('future.no_silent_downcasting', True)
# Fill missing values in X with 0 again, now under the option set above.
X.fillna(0, inplace=True)


In [56]:
# Replace missing values in X with 0 before fixing the column dtypes.
X.fillna(0, inplace=True)

# Example: Cast object columns to strings, numerical columns to float.
# NOTE(review): rows whose Country/City were missing hold the fill value 0 at
# this point, so they presumably become the text "0" after the cast; verify.
X = X.astype({"Country": "string", "City": "string", "Latitude": "float", "Longitude": "float"})


In [57]:
import ipaddress

def is_valid_ip(ip):
    """Return True if ``ip`` parses as an IP address and is globally routable.

    Values that ``ipaddress.ip_address`` rejects with ``ValueError`` yield
    False, as do addresses whose ``is_global`` flag is False (for example
    private or loopback addresses).
    """
    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        # Not a syntactically valid IPv4/IPv6 address.
        return False
    return parsed.is_global

# Keep only the rows whose IP passes is_valid_ip.
routable_mask = df["IP"].apply(is_valid_ip)
df = df[routable_mask]


In [None]:
from geoip2.database import Reader

# Open the GeoLite2 database. Using the reader as a context manager closes it
# on every exit path; previously a failing lookup skipped reader.close().
with Reader('/Users/devvratsolanki/Downloads/GeoLite2-City_20241210/GeoLite2-City.mmdb') as reader:  # Replace with your actual path
    # Perform geolocation lookup
    response = reader.city("8.8.8.8")  # Google's public DNS IP
    print(response.country.name, response.city.name, response.location.latitude, response.location.longitude)


In [None]:
from geoip2.database import Reader

# Location of the GeoLite2 City database on disk (replace with your actual path)
db_path = '/Users/devvratsolanki/Downloads/GeoLite2-City_20241210/GeoLite2-City.mmdb'

# Sample address to look up
ip_address = '1.23.253.10'

# The reader is closed automatically when the `with` block exits
with Reader(db_path) as reader:
    response = reader.city(ip_address)

    # Pull out the four reported fields first, then print them in order
    country_name = response.country.name
    city_name = response.city.name
    latitude = response.location.latitude
    longitude = response.location.longitude

    print(f"Country: {country_name}")
    print(f"City: {city_name}")
    print(f"Latitude: {latitude}")
    print(f"Longitude: {longitude}")


In [58]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'Country' and 'City' columns as integer codes.
# Each column gets its own encoder: refitting a single LabelEncoder on the
# second column discards the first column's class mapping, so the Country codes
# could no longer be mapped back to names with inverse_transform.
label_encoders = {}
for column in ("Country", "City"):
    encoder = LabelEncoder()
    X[column] = encoder.fit_transform(X[column])
    label_encoders[column] = encoder

# Kept for backward compatibility: this name previously ended up bound to the
# encoder that was last fitted, i.e. the one for 'City'.
label_encoder = label_encoders["City"]


In [None]:
import pandas as pd
from geoip2.database import Reader

# Load the dataset of malicious IPs: one address per line; surrounding
# whitespace is removed and lines that end up empty are skipped.
file_path = "bad_ip_dataset.txt"
malicious_ips = []
with open(file_path, "r") as file:
    for raw_line in file:
        entry = raw_line.strip()
        if entry:
            malicious_ips.append(entry)

# Function to check if an IP is malicious
def check_ip(ip_address, ip_list):
    """Return True when ``ip_address`` is contained in ``ip_list``, else False.

    Membership is tested with ``in``, so the comparison is an exact match
    against the entries of the container that is passed in.
    """
    if ip_address in ip_list:
        return True
    return False

# Function to extract features of an IP using the GeoLite2 database
def get_ip_features(ip_address, reader):
    """Look up ``ip_address`` with ``reader`` and return its location features.

    Args:
        ip_address: The address to look up; echoed back under the "IP" key.
        reader: Object exposing ``city(ip_address)`` whose result provides
            ``country.name``, ``city.name``, ``location.latitude`` and
            ``location.longitude``.

    Returns:
        A dict with the keys "IP", "Country", "City", "Latitude" and
        "Longitude". If the lookup or any attribute access raises, all fields
        except "IP" are None.
    """
    record = {
        "IP": ip_address,
        "Country": None,
        "City": None,
        "Latitude": None,
        "Longitude": None,
    }
    try:
        response = reader.city(ip_address)
        located = {
            "Country": response.country.name,
            "City": response.city.name,
            "Latitude": response.location.latitude,
            "Longitude": response.location.longitude,
        }
    except Exception:
        # Best effort: any failure yields the all-None placeholder record.
        return record
    record.update(located)
    return record

# Path of the GeoLite2 database used for the feature lookup below
geoip_db_path = "/Users/devvratsolanki/Downloads/GeoLite2-City_20241210/GeoLite2-City.mmdb"

# User input for IP address. Surrounding whitespace is stripped so that an
# accidental space or newline does not defeat the exact-match blocklist check
# (the blocklist entries themselves are stripped when loaded).
user_ip = input("Enter an IP address to check: ").strip()

# Check if the entered IP is malicious
if check_ip(user_ip, malicious_ips):
    print("This is a malicious IP address!")
else:
    print("This IP address is not malicious.")

# Extract and display IP features. The context manager closes the database on
# every exit path instead of relying on a trailing reader.close().
with Reader(geoip_db_path) as reader:
    features = get_ip_features(user_ip, reader)
print("Extracted Features:")
print(features)

In [None]:
import pandas as pd
import numpy as np
from geoip2.database import Reader
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# -----------------------
# Step 1: Load Processed IP Dataset
# -----------------------

# Read the feature table and keep only rows that have both coordinates.
df = pd.read_csv("processed_features.csv").dropna(subset=["Latitude", "Longitude"])

# Every row loaded from the feature table is treated as malicious.
df["Label"] = 1

# Simulate benign data for training: as many synthetic rows as malicious ones,
# with coordinates drawn uniformly over the whole latitude/longitude range.
num_samples = len(df)
np.random.seed(42)
benign_latitudes = np.random.uniform(-90, 90, num_samples)     # drawn first
benign_longitudes = np.random.uniform(-180, 180, num_samples)  # drawn second
benign_df = pd.DataFrame({
    "IP": [f"192.0.2.{i}" for i in range(num_samples)],
    "Country": ["BenignCountry"] * num_samples,
    "City": ["BenignCity"] * num_samples,
    "Latitude": benign_latitudes,
    "Longitude": benign_longitudes,
    "Label": 0
})

# Combine malicious and simulated benign rows into one training table.
full_df = pd.concat([df, benign_df], ignore_index=True)

# -----------------------
# Step 2: Train a Decision Tree
# -----------------------

# Features are the two coordinates (in this column order); target is the label.
X = full_df[["Latitude", "Longitude"]]
y = full_df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the tree as well as the split: the estimator has its own source of
# randomness, so without random_state the fitted tree (and the reported
# accuracy) can change from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate accuracy on the held-out 30% split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Model trained. Accuracy on test data: {accuracy * 100:.2f}%")

# -----------------------
# Step 3: Visualize the Training Data
# -----------------------

plt.figure(figsize=(10, 6))
# One scatter layer per class, benign first, so the legend order is unchanged.
for class_label, legend_text in ((0, 'Benign'), (1, 'Malicious')):
    class_rows = full_df[full_df['Label'] == class_label]
    sns.scatterplot(x=class_rows['Longitude'], y=class_rows['Latitude'], label=legend_text, alpha=0.6)
plt.title('Distribution of Benign and Malicious IP Locations (Simulated)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()
plt.grid(True)
plt.show()

# -----------------------
# Step 4: Visualize Decision Boundaries (Simplified for 2D)
# -----------------------

# Grid extent: the observed coordinate range padded by one degree on each side.
lon_values, lat_values = X['Longitude'], X['Latitude']
x_min, x_max = lon_values.min() - 1, lon_values.max() + 1
y_min, y_max = lat_values.min() - 1, lat_values.max() + 1

# Build the grid with a 0.1-degree step (xx = longitude, yy = latitude).
lon_axis = np.arange(x_min, x_max, 0.1)
lat_axis = np.arange(y_min, y_max, 0.1)
xx, yy = np.meshgrid(lon_axis, lat_axis)

# Classify every grid point. Columns are ordered (latitude, longitude) to match
# the feature order the classifier was fitted with.
grid_points = np.column_stack((yy.ravel(), xx.ravel()))
Z = clf.predict(grid_points).reshape(xx.shape)

# Draw the predicted regions with the training points on top.
boundary_cmap = plt.cm.RdYlBu
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.4, cmap=boundary_cmap)
sns.scatterplot(x=X_train['Longitude'], y=X_train['Latitude'], c=y_train, s=20, edgecolor='k', cmap=boundary_cmap, label='Training Data')
plt.title('Decision Boundary of the Decision Tree')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()
plt.grid(True)
plt.show()

# -----------------------
# Step 5: Ask for IP Address and Predict
# -----------------------

user_ip = input("\n🔍 Enter an IP address to analyze: ").strip()

# Load GeoIP DB
geoip_path = "/Users/devvratsolanki/Downloads/GeoLite2-City_20241210/GeoLite2-City.mmdb"  # Adjust this path if needed

# The context manager closes the database on every exit path, including a
# kernel interrupt, instead of relying on a trailing reader.close().
with Reader(geoip_path) as reader:
    try:
        response = reader.city(user_ip)
        lat = response.location.latitude
        lon = response.location.longitude
        country = response.country.name
        city = response.city.name

        print(f"\n🌍 Location Info for {user_ip}:")
        print(f"  - Country : {country}")
        print(f"  - City    : {city}")
        print(f"  - Latitude: {lat}")
        print(f"  - Longitude: {lon}")

        # A record may come back without coordinates (the training data drops
        # such rows); the model cannot score it, so fail with a clear reason
        # rather than an opaque estimator error.
        if lat is None or lon is None:
            raise ValueError("no latitude/longitude available for this address")

        # Predict using Decision Tree; feature order is (latitude, longitude)
        prediction = clf.predict([[lat, lon]])[0]
        result = "⚠️ Malicious" if prediction == 1 else "✅ Benign"
        print(f"\n📊 Prediction based on location: {result}")
        # `accuracy` was computed on the held-out test split, not on the
        # training rows, so the message now says so.
        print(f"🔁 Model Accuracy: {accuracy * 100:.2f}% (based on held-out test data)")

        # -----------------------
        # Step 6: Visualize the Predicted IP on the Decision Boundary
        # -----------------------
        plt.figure(figsize=(10, 6))
        plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
        sns.scatterplot(x=X_train['Longitude'], y=X_train['Latitude'], c=y_train, s=20, edgecolor='k', cmap=plt.cm.RdYlBu, label='Training Data')
        plt.scatter(lon, lat, color='lime' if prediction == 0 else 'red', s=100, marker='*', label=f'Predicted IP ({result})')
        plt.title('Decision Boundary with Predicted IP Location')
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.legend()
        plt.grid(True)
        plt.show()

    except Exception as e:
        # Best effort: report the problem instead of aborting the cell.
        print(f"\n❌ Could not fetch location info for IP {user_ip}: {e}")

In [None]:
import pandas as pd
import numpy as np
from geoip2.database import Reader
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# -----------------------
# Step 1: Load Processed IP Dataset and Malicious IP List
# -----------------------

# Feature table restricted to rows that have both coordinates.
df = pd.read_csv("processed_features.csv").dropna(subset=["Latitude", "Longitude"])

# Addresses already labelled malicious in the feature table; collected before
# the label column is overwritten further down.
malicious_rows = df['Label'] == 1
malicious_ips_db = set(df.loc[malicious_rows, 'IP'])

# Load the dataset of external malicious IPs (one per line, blanks skipped).
# A missing file is tolerated: a warning is printed and the set stays empty.
file_path = "bad_ip_dataset.txt"
external_malicious_ips = set()
try:
    with open(file_path, "r") as file:
        for raw_line in file:
            entry = raw_line.strip()
            if entry:
                external_malicious_ips.add(entry)
except FileNotFoundError:
    print(f"⚠️ Warning: Malicious IP list file not found at '{file_path}'.")

# Simulate benign data for training: same row count as the malicious table,
# coordinates drawn uniformly (latitudes first, then longitudes).
num_samples = len(df)
np.random.seed(42)
benign_latitudes = np.random.uniform(-90, 90, num_samples)
benign_longitudes = np.random.uniform(-180, 180, num_samples)
benign_df = pd.DataFrame({
    "IP": [f"192.0.2.{i}" for i in range(num_samples)],
    "Country": ["BenignCountry"] * num_samples,
    "City": ["BenignCity"] * num_samples,
    "Latitude": benign_latitudes,
    "Longitude": benign_longitudes,
    "Label": 0
})

# Combine with malicious data for the model
df["Label"] = 1
full_df = pd.concat([df, benign_df], ignore_index=True)

# -----------------------
# Step 2: Train a Decision Tree
# -----------------------

# Features are the two coordinates (in this column order); target is the label.
X = full_df[["Latitude", "Longitude"]]
y = full_df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the estimator too, so that the fitted tree and the accuracy printed
# below are reproducible across runs (the split alone was already seeded).
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate accuracy on the held-out 30% split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Model trained. Accuracy on test data: {accuracy * 100:.2f}%")

# -----------------------
# Step 3: Ask for IP Address and Analyze
# -----------------------

user_ip = input("\n🔍 Enter an IP address to analyze: ").strip()

# Path of the GeoIP DB; it is only opened when a lookup is actually needed
geoip_path = "/Users/devvratsolanki/Downloads/GeoLite2-City_20241210/GeoLite2-City.mmdb"  # Adjust this path if needed

# Check against your database and the external list
if user_ip in malicious_ips_db:
    print(f"\n🚨 The IP address '{user_ip}' exists in your primary database and is labeled as Malicious.")
elif user_ip in external_malicious_ips:
    print(f"\n🚨 The IP address '{user_ip}' was found in the external list of malicious IPs.")
else:
    print(f"\n✅ The IP address '{user_ip}' was not found in your databases of malicious IPs. Proceeding with location-based analysis.")

    # The context manager closes the database on every exit path.
    with Reader(geoip_path) as reader:
        try:
            response = reader.city(user_ip)
            lat = response.location.latitude
            lon = response.location.longitude
            country = response.country.name
            city = response.city.name

            print(f"\n🌍 Location Info for {user_ip}:")
            print(f"  - Country : {country}")
            print(f"  - City    : {city}")
            print(f"  - Latitude: {lat}")
            print(f"  - Longitude: {lon}")

            # A record may come back without coordinates (the training data
            # drops such rows); the model cannot score it.
            if lat is None or lon is None:
                raise ValueError("no latitude/longitude available for this address")

            # Predict using Decision Tree; feature order is (latitude, longitude)
            prediction = clf.predict([[lat, lon]])[0]
            result = "⚠️ Malicious (Location-based)" if prediction == 1 else "✅ Benign (Location-based)"
            print(f"\n📊 Prediction based on location: {result}")
            # `accuracy` was computed on the held-out test split.
            print(f"🔁 Model Accuracy: {accuracy * 100:.2f}% (based on held-out test data)")

            # -----------------------
            # Step 4: Visualize the Predicted IP on the Decision Boundary
            # -----------------------
            # Build the decision-surface grid here, from this cell's own
            # classifier and data. The grid variables were previously never
            # defined in this cell, so on a fresh kernel the plot failed with a
            # NameError (reported below as a lookup failure), and otherwise it
            # silently reused a grid computed for a different model.
            x_min, x_max = X['Longitude'].min() - 1, X['Longitude'].max() + 1
            y_min, y_max = X['Latitude'].min() - 1, X['Latitude'].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                                 np.arange(y_min, y_max, 0.1))
            # Columns are (latitude, longitude), matching the fitted features.
            Z = clf.predict(np.c_[yy.ravel(), xx.ravel()]).reshape(xx.shape)

            plt.figure(figsize=(10, 6))
            plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
            sns.scatterplot(x=X_train['Longitude'], y=X_train['Latitude'], c=y_train, s=20, edgecolor='k', cmap=plt.cm.RdYlBu, label='Training Data')
            plt.scatter(lon, lat, color='lime' if prediction == 0 else 'red', s=100, marker='*', label=f'Predicted IP ({result})')
            plt.title('Decision Boundary with Predicted IP Location')
            plt.xlabel('Longitude')
            plt.ylabel('Latitude')
            plt.legend()
            plt.grid(True)
            plt.show()

        except Exception as e:
            # Best effort: report the problem instead of aborting the cell.
            print(f"\n❌ Could not fetch location info for IP {user_ip}: {e}")