In [None]:

import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.svm import SVC, OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


import os

# Label column name kept as a constant.
# NOTE(review): this constant is not referenced by the visible cells, and the
# cells below index the column as ' Label' (leading space) — confirm which
# spelling is intended.
target_column = 'Label'

# Location of the combined dataset on the local machine
csv_file_path = r"C:\Users\abett\Downloads\csci\combined_file.csv"

# Load the whole CSV into memory as a DataFrame
tf_df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Preview the first five rows to confirm the load worked
print(tf_df.head(n=5))


In [None]:
# Keep a handle on the column index of the loaded frame
columns = tf_df.columns

# Print the column names exactly as read from the CSV, so stray whitespace
# (e.g. the leading space in ' Label' used by later cells) is visible
print(columns)

In [None]:
# Print the frame summary produced by DataFrame.info()
# (column dtypes, non-null counts and memory usage)
tf_df.info()


In [None]:
# Summary statistics as computed by DataFrame.describe() with its defaults
summary_statistics = tf_df.describe()
print(summary_statistics)


In [None]:
# Count the missing (NaN/None) entries per column
missing_per_column = tf_df.isna().sum()
print(missing_per_column)


In [None]:
# Number of rows for each distinct value of the ' Label' column
label_series = tf_df[' Label']
label_counts = label_series.value_counts()
print(label_counts)


In [None]:
# Downcast 64-bit numeric columns of tf_df to 32-bit to save memory.

# Identify the 64-bit integer and float columns
integer_columns = tf_df.select_dtypes(include=['int64']).columns
float_columns = tf_df.select_dtypes(include=['float64']).columns

# A plain astype('int32') can wrap values that do not fit in 32 bits without
# raising, silently corrupting the data. Only convert the integer columns
# whose observed range fits; any other column is left as int64.
INT32_MIN, INT32_MAX = -2**31, 2**31 - 1
int32_safe_columns = [
    col for col in integer_columns
    if tf_df[col].min() >= INT32_MIN and tf_df[col].max() <= INT32_MAX
]
if int32_safe_columns:
    tf_df[int32_safe_columns] = tf_df[int32_safe_columns].astype('int32')

# Convert float columns to float32 (reduced precision is the accepted
# trade-off for the smaller footprint)
tf_df[float_columns] = tf_df[float_columns].astype('float32')

# Display updated DataFrame information to confirm the new dtypes and memory use
tf_df.info()


In [None]:
# Bar chart of the number of rows per ' Label' value, most frequent first
label_order = tf_df[' Label'].value_counts().index
plt.figure(figsize=(10, 6))
sns.countplot(data=tf_df, x=' Label', order=label_order)
plt.title('Distribution of Attack Types')
plt.xticks(rotation=45)  # rotate the tick labels by 45 degrees
plt.show()



In [None]:
# Box plot of ' Flow Duration' grouped by ' Label', one box per label value
plt.figure(figsize=(12, 6))
sns.boxplot(data=tf_df, x=' Label', y=' Flow Duration')
plt.title('Distribution of Flow Duration for Different Attack Types')
plt.xticks(rotation=45)  # rotate the tick labels by 45 degrees
plt.show()




In [None]:


# Location of the second dataset (tdf.csv) on the local machine
file_path = r"C:\Users\abett\Downloads\csci\tdf.csv"

# Load it into its own DataFrame
tdf = pd.read_csv(filepath_or_buffer=file_path)

# Show the first 20 rows to verify the import
print(tdf.head(n=20))


In [None]:
# Stack the two frames row-wise and renumber the index from 0.
# Columns present in only one of the frames are filled with NaN by concat.
cdf = pd.concat(objs=(tf_df, tdf), axis=0, ignore_index=True)

# Print the combined frame
print(cdf)

In [None]:
# Report the (rows, columns) size of the combined frame
row_count, column_count = cdf.shape
print((row_count, column_count))


In [None]:
# Missing-value count per column of the combined frame
print(cdf.isna().sum())


In [None]:
# Remove the 'ipv4', 'Label' and 'date' columns from the combined frame.
# Note that 'Label' (no leading space) is a different column from ' Label',
# which later cells still read.
cdf = cdf.drop(columns=['ipv4', 'Label', 'date'])


In [None]:
# Missing-value count per column before imputation
print(cdf.isna().sum())

# Replace every remaining missing value, in every column, with the constant 22.
# NOTE(review): 22 is an unexplained sentinel and is also written into any
# non-numeric column that has gaps — confirm this is intended.
cdf = cdf.fillna(value=22)


In [None]:
# Encode the values of the ' Label' column as integer class ids
label_encoder = LabelEncoder()
cdf['Label_encoded'] = label_encoder.fit_transform(cdf[' Label'])

# Impute-then-classify pipeline: mean imputation followed by a random forest.
# Built with the Pipeline class that the notebook imports; the previous
# make_pipeline(...) call raised NameError because make_pipeline is never
# imported. Step names match the ones make_pipeline would have generated.
# NOTE(review): `pipeline` is not used by the other visible cells.
pipeline = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='mean')),
    ('randomforestclassifier', RandomForestClassifier(random_state=42)),
])


# Define the features (X) and target variable (y): every column except the
# raw and encoded label is treated as a feature
X = cdf.drop([' Label', 'Label_encoded'], axis=1)
y = cdf['Label_encoded']

# Perform an 80/20 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Signature-based detection models: supervised classifiers trained on labels
signature_models = [
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('GradientBoosting', GradientBoostingClassifier(random_state=42)),
    ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ('KNeighbors', KNeighborsClassifier()),
    # probability=True makes SVC expose predict_proba, needed for soft voting
    ('SVM', SVC(probability=True)),
]

# Anomaly-based detection models: unsupervised outlier detectors fitted on
# features only; their predict() returns 1 for inliers and -1 for outliers
anomaly_models = [
    ('OneClassSVM', OneClassSVM()),
    ('IsolationForest', IsolationForest(random_state=42)),
    ('EllipticEnvelope', EllipticEnvelope()),
    # novelty=True is required for LocalOutlierFactor to provide predict() on
    # unseen data; with the default novelty=False only fit_predict() on the
    # training data exists and predict(X_test) raises AttributeError
    ('LocalOutlierFactor', LocalOutlierFactor(novelty=True)),
    # Add more anomaly-based models as needed
]

# Hybrid model (Voting Classifier).
# Soft voting averages predict_proba across its members and VotingClassifier
# only accepts classifiers, so the outlier detectors above cannot be members:
# including them makes fit() fail. The ensemble is therefore built from the
# supervised models only; VotingClassifier fits its own clones of them.
hybrid_model = VotingClassifier(estimators=signature_models, voting='soft')

# Supervised models: fit on the labelled training split, then print a
# per-class report for the held-out split
for name, model in signature_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"\n{name}:\n{report}")

# Outlier detectors: fit on features only, then recode their +1 (inlier) /
# -1 (outlier) predictions as 0 (normal) / 1 (anomaly) before reporting.
# NOTE(review): y_test holds the label-encoded classes from the split above,
# while these predictions are binary — confirm the comparison is meaningful.
for name, model in anomaly_models:
    model.fit(X_train)
    raw_pred = model.predict(X_test)
    y_pred = (raw_pred == -1).astype(raw_pred.dtype)
    report = classification_report(y_test, y_pred)
    print(f"\n{name}:\n{report}")

# Voting ensemble: fit on the labelled training split and report on the
# held-out split
hybrid_model.fit(X_train, y_train)
y_pred_hybrid = hybrid_model.predict(X_test)
hybrid_report = classification_report(y_test, y_pred_hybrid)
print(f"\nHybrid Model:\n{hybrid_report}")

In [None]:
# Downcast 64-bit numeric columns of cdf to 32-bit to save memory.
float_columns = cdf.select_dtypes(include=['float64']).columns
int_columns = cdf.select_dtypes(include=['int64']).columns

# float64 -> float32 (reduced precision is the accepted trade-off)
cdf[float_columns] = cdf[float_columns].astype('float32')

# A plain astype('int32') can wrap values that do not fit in 32 bits without
# raising, silently corrupting the data. Only convert the integer columns
# whose observed range fits; any other column is left as int64.
INT32_MIN, INT32_MAX = -2**31, 2**31 - 1
int32_safe_columns = [
    col for col in int_columns
    if cdf[col].min() >= INT32_MIN and cdf[col].max() <= INT32_MAX
]
if int32_safe_columns:
    cdf[int32_safe_columns] = cdf[int32_safe_columns].astype('int32')


In [None]:
# Location of tdf.csv on the local machine (the same file an earlier cell
# loads into `tdf`)
file_path = r"C:\Users\abett\Downloads\csci\tdf.csv"

# Load it into a fresh DataFrame named df
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to verify the import
print(df.head(n=5))


import os
import requests
import pandas as pd
import json

# Look up every IPv4 address of df['ipv4'] in AlienVault OTX and print the
# JSON returned for each one.
ipv4_column = df['ipv4']

# AlienVault OTX API configuration.
# The API key is read from the OTX_API_KEY environment variable so that the
# secret is never stored in the notebook. The key that used to be hard-coded
# in this cell should be treated as leaked and rotated.
api_key = os.environ.get('OTX_API_KEY')
if not api_key:
    raise RuntimeError(
        "OTX_API_KEY environment variable is not set; "
        "export your AlienVault OTX API key before running this cell."
    )
otx_api_url = 'https://otx.alienvault.com/api/v1/indicators/IPv4/'
request_timeout = 10  # seconds; requests has no default timeout and could hang forever

# Skip missing values (a NaN cannot be concatenated onto the URL) and query
# each distinct address only once to avoid redundant API calls.
unique_addresses = [
    address
    for address in ipv4_column.dropna().astype(str).str.strip().unique()
    if address
]

for ipv4_address in unique_addresses:
    # Make a GET request to the AlienVault OTX API; a network failure for one
    # address is reported and does not abort the remaining lookups
    try:
        response = requests.get(
            otx_api_url + ipv4_address,
            headers={'X-OTX-API-KEY': api_key},
            timeout=request_timeout,
        )
    except requests.RequestException as exc:
        print(f"Error for IPv4 Address {ipv4_address}. Request failed: {exc}")
        continue

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Beautify and print the JSON response
        formatted_response = json.dumps(response.json(), indent=2)
        print(f"IPv4 Address: {ipv4_address}")
        print("OTX API Response:")
        print(formatted_response)
        print("\n")
    else:
        print(f"Error for IPv4 Address {ipv4_address}. Status Code: {response.status_code}")
