In [3]:
import pandas as pd
import os                                                                                       # TO CREATE A LIST OF SUSPICIOUS IPs USING AI MODEL 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib

# === User Input - Case Folder ===
# Every artefact of a case (input log, trained model, reports) lives under
# base_path/<case_name>.
base_path = "/home/jovyan/my_android_logs/CASE_FILES_raw_logs"

# Strip surrounding whitespace so a stray space typed at the prompt does not
# silently point at a non-existent folder.
case_name = input("📂 Enter the case folder name: ").strip()
if not case_name:
    # An empty name would resolve to base_path itself, and every output file
    # written later would land in the shared parent directory.
    raise ValueError("❌ Case folder name must not be empty.")

case_folder = os.path.join(base_path, case_name)
log_file = os.path.join(case_folder, "resolved_dns_log.csv")

# Fail fast with a clear message instead of a pandas error further down.
if not os.path.exists(log_file):
    raise FileNotFoundError(f"❌ Log file not found at: {log_file}")

print(f"✅ Using log file: {log_file}")

# === Load the Log Data ===
df = pd.read_csv(log_file)

# === Initialize Feature List ===
features = [
    'flag_uncommon_tld',
    'domain_count',
    'ip_count',
    'flag_foreign_ip',
    'abuse_score'
]
# Feature columns missing from the log are created as all-NaN (a numeric
# dtype, unlike None) so the selection below works; they become 0 in X.
missing_features = [col for col in features if col not in df.columns]
for col in missing_features:
    df[col] = float("nan")
if missing_features:
    # Make it visible that the model is about to train on constant columns.
    print(f"⚠️ Feature columns missing from log (treated as 0): {missing_features}")

# ===Timestamp-Based Features ===
if 'timestamp' in df.columns:
    # Unparseable timestamps become NaT; derived features are NaN for them.
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df['hour'] = df['timestamp'].dt.hour
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    # Hours run 0-23, so the previous `h > 23` test could never match and the
    # late-night half of the flag was dead; `>= 23` restores it. Comparisons
    # against NaN are False, so rows without a timestamp get 0.
    df['flag_odd_hour'] = ((df['hour'] < 5) | (df['hour'] >= 23)).astype(int)

    features += ['hour', 'dayofweek', 'is_weekend', 'flag_odd_hour']
    print("⏱️ Timestamp-based features enabled.")
else:
    print("⚠️ No 'timestamp' column found — skipping time-based features.")

# === Select Features and Target ===
# Remaining gaps (missing columns, unparseable timestamps) are encoded as 0.
X = df[features].fillna(0)

# === Ensure 'is_suspicious' column exists ===
# When the log carries no ground-truth label, derive one from the case's
# master list of known suspicious IPs (or fall back to an all-zero label).
if 'is_suspicious' not in df.columns:
    ip_available = 'ip' in df.columns and df['ip'].notna().any()
    if not ip_available:
        print("⚠️ No IPs found — creating dummy 'is_suspicious' column (all 0).")
        df["is_suspicious"] = 0
    else:
        # Load master list of known suspicious IPs
        master_list_path = os.path.join(case_folder, "master_list.csv")
        if not os.path.exists(master_list_path):
            master_ips = set()
            print("⚠️ No master list found — assuming empty list.")
        else:
            master_df = pd.read_csv(master_list_path)
            master_ips = set(master_df["ip"].dropna())
            print(f"✅ Loaded master IP list with {len(master_ips)} entries.")

        # Membership test; rows with a missing IP are labelled False.
        df["is_suspicious"] = df["ip"].isin(master_ips)
        print("✅ 'is_suspicious' column created based on master list.")

# Target vector as 0/1 integers.
y = df['is_suspicious'].astype(int)

# === Train-Test Split ===
# Split the raw features first so the scaler below never sees held-out rows
# (fitting it on the full matrix leaks test-set statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# === Scale the Features ===
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Every log row, scaled with the training statistics, for the final scoring.
X_scaled = scaler.transform(X)

# === Train the Random Forest Model ===
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# === Evaluate the Model ===
y_pred = model.predict(X_test)
print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

# === Predict on All Logs ===
# NOTE: this scores the training rows too, so these predictions are not an
# out-of-sample estimate; the report above is the held-out evaluation.
df['predicted_suspicious'] = model.predict(X_scaled)

# predict_proba has one column per class seen during fit. If the training
# labels held a single class (e.g. the dummy all-0 label), a hard-coded
# column 1 raises IndexError, so locate the positive class explicitly.
fitted_classes = list(model.classes_)
if 1 in fitted_classes:
    positive_col = fitted_classes.index(1)
    df['suspicion_probability'] = model.predict_proba(X_scaled)[:, positive_col]
else:
    # The model never saw a suspicious example, so it cannot score one.
    df['suspicion_probability'] = 0.0

# === Export Flagged Entries to Case Folder ===
# Only rows the model predicted as suspicious are written out.
flagged_output = os.path.join(case_folder, "ml_flagged_suspicious.csv")
flagged_rows = df.loc[df['predicted_suspicious'] == 1]
flagged_rows.to_csv(flagged_output, index=False)
print(f"📁 Saved flagged logs → {flagged_output}")

# === Save Model and Scaler to Case Folder ===
# The scaler is stored next to the model so both can be reloaded together.
model_path = os.path.join(case_folder, "suspicious_model.pkl")  # TODO: change model name
scaler_path = os.path.join(case_folder, "scaler.pkl")
for artifact, destination in ((model, model_path), (scaler, scaler_path)):
    joblib.dump(artifact, destination)
print(f"📁 Model saved at → {model_path}")
print(f"📁 Scaler saved at → {scaler_path}")

# === Rank IPs by Risk (and Timestamp if available) ===
suspicious_df = df[df['predicted_suspicious'] == 1]

# Per IP: highest suspicion probability and, when available, earliest sighting.
agg_dict = {'suspicion_probability': 'max'}
if 'timestamp' in df.columns:
    agg_dict['timestamp'] = 'min'

if 'ip' in suspicious_df.columns:
    ip_risk_scores = (
        suspicious_df.groupby('ip')
        .agg(agg_dict)
        # Without this sort the output is in IP order, so the "ranked" file
        # and the head(10) preview would not be ordered by risk. The stable
        # sort keeps tied scores in IP order.
        .sort_values('suspicion_probability', ascending=False, kind='stable')
        .reset_index()
    )
else:
    # Logs without an 'ip' column would make groupby raise KeyError; produce
    # an empty ranking with the expected columns instead.
    ip_risk_scores = pd.DataFrame(columns=['ip', *agg_dict])

# Risk Level Assignment
def risk_level(score, high=0.9, medium=0.7):
    """Map a suspicion probability to a qualitative risk band.

    Args:
        score: Suspicion probability to classify.
        high: Inclusive lower bound of the "High" band (default 0.9).
        medium: Inclusive lower bound of the "Medium" band (default 0.7).

    Returns:
        "High" if score >= high, "Medium" if score >= medium, otherwise
        "Low". A NaN score fails both comparisons and is reported as "Low".
    """
    if score >= high:
        return "High"
    if score >= medium:
        return "Medium"
    return "Low"

# Label each IP with the qualitative band of its suspicion probability.
probabilities = ip_risk_scores["suspicion_probability"]
ip_risk_scores["risk_level"] = probabilities.map(risk_level)

# Save Ranked IPs
ranked_ip_path = os.path.join(case_folder, "ranked_suspicious_ips.csv")
ip_risk_scores.to_csv(ranked_ip_path, index=False)
print(f"📁 Ranked suspicious IPs saved → {ranked_ip_path}")

# Preview the first ten rows of the table.
print("🔝 Top 10 Most Suspicious IPs:")
top_preview = ip_risk_scores.head(10)
print(top_preview)


📂 Enter the case folder name:  trainning_case


✅ Using log file: /home/jovyan/my_android_logs/CASE_FILES_raw_logs/trainning_case/resolved_dns_log.csv
⚠️ No 'timestamp' column found — skipping time-based features.
📊 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       203
           1       1.00      1.00      1.00        97

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

📁 Saved flagged logs → /home/jovyan/my_android_logs/CASE_FILES_raw_logs/trainning_case/ml_flagged_suspicious.csv
📁 Model saved at → /home/jovyan/my_android_logs/CASE_FILES_raw_logs/trainning_case/suspicious_model.pkl
📁 Scaler saved at → /home/jovyan/my_android_logs/CASE_FILES_raw_logs/trainning_case/scaler.pkl
📁 Ranked suspicious IPs saved → /home/jovyan/my_android_logs/CASE_FILES_raw_logs/trainning_case/ranked_suspicious_ips.csv
🔝 Top 10 Most Suspicious IPs:
                ip

In [1]:
!pip install ipwhois

Collecting ipwhois
  Downloading ipwhois-1.3.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting dnspython (from ipwhois)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading ipwhois-1.3.0-py2.py3-none-any.whl (70 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: dnspython, ipwhois
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [ipwhois]m1/2[0m [ipwhois]
[1A[2KSuccessfully installed dnspython-2.7.0 ipwhois-1.3.0


In [4]:
import os, json, requests
import pandas as pd                                                                 # TO GET DATA ABOUT SUSPICIOUS IP
from ipwhois import IPWhois
from tqdm import tqdm
from datetime import datetime, timedelta

# To open the case folder — remove it later
# base_path = "/home/jovyan/my_android_logs/CASE_FILES_raw_logs"
# case_name = input("📂 Enter the case folder name: ")
# case_folder = os.path.join(base_path, case_name)

# === Extract case name
# The case name is the last path component of the case folder.
case_name = os.path.basename(case_folder)

# === Paths ===
# Per-case caches live inside the case folder.
case_cache_geo_path, case_cache_whois_path = (
    os.path.join(case_folder, cache_file)
    for cache_file in ("geo_cache.json", "whois_cache.json")
)
# Master caches and the master report use fixed locations outside any case.
master_geo_path, master_whois_path, master_report_path = (
    "/home/jovyan/my_android_logs/master_suspicious_geo_cache.json",
    "/home/jovyan/my_android_logs/master_suspicious_whois_cache.json",
    "/home/jovyan/my_android_logs/master_suspicious_ip_report.csv",
)

# === Load Cache Safely ===
def safe_load_json(path):
    """Load a JSON cache file, returning an empty dict when it is unusable.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded object if it is a dict; otherwise ``{}``. ``{}`` is also
        returned when the file is missing, unreadable, or not valid JSON.
    """
    try:
        # Opening directly (rather than checking existence first) avoids a
        # race between the check and the read; a missing file is an OSError.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Unlike a bare `except`, this lets KeyboardInterrupt propagate.
        return {}
    return data if isinstance(data, dict) else {}

# Load the per-case caches first, then the master caches; a missing or
# unreadable file yields an empty dict.
geo_map, whois_map, master_geo_map, master_whois_map = (
    safe_load_json(cache_path)
    for cache_path in (
        case_cache_geo_path,
        case_cache_whois_path,
        master_geo_path,
        master_whois_path,
    )
)

# === Check and reset corrupted entries
def clean_cache(cache_name, cache_dict):
    """Remove every entry whose value is not a dict from a cache, in place.

    Args:
        cache_name: Label used in the printed messages.
        cache_dict: Cache mapping to clean; it is mutated.

    Returns:
        The same ``cache_dict`` object, after removal of the bad entries.
    """
    bad_keys = [key for key in cache_dict if not isinstance(cache_dict[key], dict)]
    if not bad_keys:
        return cache_dict

    print(f"❌ Corrupt entries found in {cache_name}")
    for key in bad_keys:
        cache_dict.pop(key)
    print(f"✅ Removed {len(bad_keys)} corrupt entries from {cache_name}")
    return cache_dict

# Clean each loaded cache of non-dict entries; the order fixes the order of
# any printed messages.
geo_map, master_geo_map, whois_map, master_whois_map = (
    clean_cache(label, cache)
    for label, cache in (
        ("geo_map", geo_map),
        ("master_geo_map", master_geo_map),
        ("whois_map", whois_map),
        ("master_whois_map", master_whois_map),
    )
)

# === Expiry Logic
def is_expired(ts_str, days=365):
    """Report whether an ISO-8601 timestamp is older than ``days`` days.

    Args:
        ts_str: Timestamp string as produced by ``datetime.isoformat()``.
        days: Maximum age in days before the timestamp counts as expired.

    Returns:
        True if the timestamp is older than ``days`` days, or if it is
        missing or cannot be parsed; False otherwise.
    """
    try:
        stamp = datetime.fromisoformat(ts_str)
        # Compare like with like: an aware timestamp needs an aware "now",
        # otherwise the subtraction raises TypeError and the entry would
        # always look expired.
        age = datetime.now(stamp.tzinfo) - stamp
        return age > timedelta(days=days)
    except (TypeError, ValueError, OverflowError):
        # A non-string or malformed timestamp is treated as expired.
        return True

# === GeoIP Lookup
def geoip_lookup(ip):
    """Return geolocation data for ``ip``, using the per-case cache when fresh.

    Args:
        ip: IP address to look up.

    Returns:
        A dict with keys ``lat``, ``lon``, ``country`` and ``timestamp``.
        When the lookup fails, ``lat``/``lon`` are None and ``country`` is
        "Unknown".

    Side effects:
        A successful lookup is stored in both ``geo_map`` and
        ``master_geo_map``. Failed lookups are NOT stored: caching them with
        a fresh timestamp would make a transient network error look like a
        valid answer until the entry expires.
    """
    cached = geo_map.get(ip)
    if isinstance(cached, dict) and not is_expired(cached.get("timestamp", "")):
        return cached

    try:
        response = requests.get(f"http://ip-api.com/json/{ip}", timeout=5)
        response.raise_for_status()
        payload = response.json()
        # ip-api reports lookup failures in the body via the "status" field.
        if not isinstance(payload, dict) or payload.get("status") != "success":
            raise ValueError("GeoIP lookup did not succeed")
        result = {
            "lat": payload.get("lat"),
            "lon": payload.get("lon"),
            "country": payload.get("countryCode"),
            "timestamp": datetime.now().isoformat()
        }
    except (requests.RequestException, ValueError):
        # Narrow on purpose: a bare `except` would also swallow
        # KeyboardInterrupt and make the enrichment loop uninterruptible.
        return {
            "lat": None,
            "lon": None,
            "country": "Unknown",
            "timestamp": datetime.now().isoformat()
        }

    geo_map[ip] = result
    master_geo_map[ip] = result
    return result

# === WHOIS Lookup
def get_whois_info(ip):
    """Return RDAP/WHOIS details for ``ip``, using the per-case cache when fresh.

    Args:
        ip: IP address to look up.

    Returns:
        A dict with keys ``asn``, ``asn_description``, ``org_name``,
        ``cidr``, ``start_address``, ``end_address``, ``created``,
        ``updated`` and ``timestamp``. When the lookup fails, every field
        except ``timestamp`` is None.

    Side effects:
        A successful lookup is stored in both ``whois_map`` and
        ``master_whois_map``. Failed lookups are NOT stored, so a transient
        error is retried on the next run instead of being cached as a valid
        answer.
    """
    cached = whois_map.get(ip)
    if isinstance(cached, dict) and not is_expired(cached.get("timestamp", "")):
        return cached

    try:
        result = IPWhois(ip).lookup_rdap()
        # Tolerate a missing/empty "network" section rather than discarding
        # the ASN fields that were returned.
        network = result.get("network") or {}
        info = {
            "asn": result.get("asn"),
            "asn_description": result.get("asn_description"),
            "org_name": network.get("name"),
            "cidr": network.get("cidr"),
            "start_address": network.get("start_address"),
            "end_address": network.get("end_address"),
            "created": str(network.get("created")),
            "updated": str(network.get("updated")),
            "timestamp": datetime.now().isoformat()
        }
    except Exception:
        # Best-effort: any lookup error yields a placeholder. `Exception`
        # (not a bare `except`) still lets KeyboardInterrupt stop the loop.
        placeholder = {field: None for field in [
            "asn", "asn_description", "org_name", "cidr",
            "start_address", "end_address", "created", "updated"
        ]}
        placeholder["timestamp"] = datetime.now().isoformat()
        return placeholder

    whois_map[ip] = info
    master_whois_map[ip] = info
    return info

# === Process Only New Suspicious IPs
# An IP counts as "new" when it has no entry in the master geo cache.
flagged_mask = df['predicted_suspicious'] == 1
suspicious_ips = df.loc[flagged_mask, 'ip'].dropna().unique()
existing_master_ips = set(master_geo_map.keys())

new_ips = [ip for ip in suspicious_ips if ip not in existing_master_ips]

if new_ips:
    rows = []
    for ip in tqdm(new_ips, desc="🔍 Enriching New Suspicious IPs"):
        geo = geoip_lookup(ip)
        whois = get_whois_info(ip)
        record = {
            "ip": ip,
            "country": geo.get("country"),
            "latitude": geo.get("lat"),
            "longitude": geo.get("lon"),
            "case_name": case_name,
        }
        # WHOIS fields are merged last, as in a trailing ** expansion.
        record.update(whois)
        rows.append(record)

    new_report = pd.DataFrame(rows)

    # Save per-case report
    case_report_path = os.path.join(case_folder, "suspicious_ip_geo_whois.csv")
    new_report.to_csv(case_report_path, index=False)

    # Append to master report
    if os.path.exists(master_report_path):
        master_df = pd.read_csv(master_report_path)
        updated_df = pd.concat([master_df, new_report], ignore_index=True)
    else:
        updated_df = new_report

    # Keep only the most recent row per IP.
    updated_df = updated_df.drop_duplicates(subset='ip', keep='last')
    updated_df.to_csv(master_report_path, index=False)

    print(f"✅ Master list updated with {len(new_ips)} new IP(s) → master_suspicious_ip_report.csv")
else:
    print("✅ No new suspicious IPs found. Master list unchanged.")

# === Save linkage log if timestamp+IP info is available
# Written whenever at least one of the linkage columns exists; rows with a
# missing value in any selected column are dropped.
link_cols = ['timestamp', 'app_name', 'domain', 'ip', 'suspicion_probability']
available_cols = []
for col in link_cols:
    if col in df.columns:
        available_cols.append(col)

if len(available_cols) > 0:
    linked_logs_path = os.path.join(case_folder, "linked_logs.csv")
    linked_logs = df[available_cols].dropna()
    linked_logs.to_csv(linked_logs_path, index=False)

# === Save all cache files
# Write each cache back to its file: per-case caches first, then masters.
cache_outputs = (
    (case_cache_geo_path, geo_map),
    (case_cache_whois_path, whois_map),
    (master_geo_path, master_geo_map),
    (master_whois_path, master_whois_map),
)
for cache_path, cache_data in cache_outputs:
    with open(cache_path, 'w') as f:
        json.dump(cache_data, f)


✅ No new suspicious IPs found. Master list unchanged.
