In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data set used by every cell below; the path is relative to the
# kernel's working directory.
df = pd.read_csv(filepath_or_buffer='Train_data.csv')

In [None]:
# Number of missing (NaN/None) entries per column.
missing_data = df.isnull().sum()

# Boolean frame (same columns as `df`) marking +/-inf cells.
# `np.isinf` is only defined for numeric data, so applying it cell by cell to
# the whole frame raises TypeError as soon as a string column is reached.
# Test the numeric columns in one vectorised call and report every
# non-numeric column as all-False, since such cells cannot hold an infinity.
inf_data = (
    np.isinf(df.select_dtypes(include=[np.number]))
    .reindex(columns=df.columns, fill_value=False)
)

In [None]:
# Count the distinct values found in each column (one entry per column).
unique_categories = df.nunique(axis=0)
print(unique_categories)

In [None]:
# Summary statistics of `df`; as the last expression in the cell, the
# resulting table is rendered as the cell output.
df.describe()

In [None]:
def plot_pmf(column):
    """Draw the empirical PMF of ``df[column]`` as a bar chart.

    The relative frequency of every distinct value is computed with
    ``value_counts(normalize=True)`` and one bar is drawn per value.

    Parameters
    ----------
    column : str
        Name of a column in the notebook-level ``df``.
    """
    probabilities = df[column].value_counts(normalize=True)
    categories, heights = probabilities.index, probabilities.values

    plt.figure(figsize=(8, 5))
    plt.bar(categories, heights, color='skyblue')
    plt.xlabel('Categories')
    plt.ylabel('Probability')
    plt.title(f'PMF of {column}')
    plt.xticks(rotation=45)
    plt.show()
    
# One PMF figure per column whose dtype is object, category or int64.
# Iterating a DataFrame yields its column labels.
for column in df.select_dtypes(include=['object', 'category', 'int64']):
    plot_pmf(column)
    

In [None]:
def plot_pdf(column):
    """Draw a density-normalised histogram of ``df[column]``.

    NaN entries are dropped before plotting.

    Parameters
    ----------
    column : str
        Name of a column in the notebook-level ``df``.
    """
    values = df[column].dropna()
    hist_style = dict(bins=30, density=True, color='skyblue', alpha=0.6)

    plt.figure(figsize=(8, 5))
    plt.hist(values, **hist_style)
    plt.xlabel('Values')
    plt.ylabel('Density')
    plt.title(f'PDF of {column}')
    plt.show()

# One histogram per column whose dtype is float64 or int64.
# Iterating a DataFrame yields its column labels.
for column in df.select_dtypes(include=['float64', 'int64']):
    plot_pdf(column)

In [None]:
# CDF for discrete data
def plot_cdf_from_pmf(pmf, column):
    """Plot the cumulative sum of a PMF as a line with point markers.

    Parameters
    ----------
    pmf :
        Object exposing ``cumsum()`` whose result has ``index`` and
        ``values`` (the caller in this notebook passes a pandas Series).
        Probabilities are accumulated in the order given, so the caller is
        responsible for sorting the categories beforehand.
    column : str
        Label used in the figure title only.
    """
    cumulative = pmf.cumsum()
    x_positions, y_values = cumulative.index, cumulative.values

    plt.figure(figsize=(8, 5))
    plt.plot(x_positions, y_values, marker='o', linestyle='-', color='skyblue')
    plt.xlabel('Categories')
    plt.ylabel('Cumulative Probability')
    plt.title(f'CDF of {column}')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()


# NOTE(review): `column` is not assigned in this cell; it holds whatever value
# the most recently executed `for column in ...` loop left behind. Assign the
# intended discrete column explicitly if a specific field is wanted.
pmf_column = df[column].value_counts(normalize=True).sort_index()

# Title the figure with the column that is actually plotted; the previous
# literal 'your_discrete_column' was an unfilled template placeholder.
plot_cdf_from_pmf(pmf_column, column)

In [None]:
# CDF for continuous data

def plot_cdf_from_pdf(column):
    """Plot the empirical CDF of ``df[column]``.

    NaN entries are dropped, the remaining values are sorted ascending, and
    the i-th smallest value (1-based) is plotted against ``i / n`` where
    ``n`` is the number of remaining values.

    Parameters
    ----------
    column : str
        Name of a column in the notebook-level ``df``.
    """
    ordered_values = np.sort(df[column].dropna())
    n_values = len(ordered_values)

    # Cumulative probability reached at each sorted observation: 1/n, 2/n, ..., 1.
    cumulative_prob = np.arange(1, n_values + 1) / n_values

    plt.figure(figsize=(8, 5))
    plt.plot(ordered_values, cumulative_prob, marker='.', linestyle='-', color='skyblue')
    plt.xlabel('Values')
    plt.ylabel('Cumulative Probability')
    plt.title(f'CDF of {column}')
    plt.grid(True)
    plt.show()
    
# NOTE(review): `column` is not assigned in this cell; it holds whatever value
# the most recently executed `for column in ...` loop left behind.
plot_cdf_from_pdf(column)

In [None]:
# Plot original and conditional PMF for discrete data
def plot_pmf_given_class_discrete(column, class_column):
    """Plot the PMF of ``df[column]`` overall and conditioned on each class.

    Three PMFs are computed: over all rows, over rows whose ``class_column``
    equals ``'normal'``, and over rows whose ``class_column`` equals
    ``'anomaly'``. They are aligned on the union of observed categories and
    drawn as one grouped bar chart.

    Parameters
    ----------
    column : str
        Name of the column in the notebook-level ``df`` whose PMF is plotted.
    class_column : str
        Name of the column holding the ``'normal'`` / ``'anomaly'`` labels.
    """
    is_normal = df[class_column] == 'normal'
    is_anomaly = df[class_column] == 'anomaly'

    # Building one frame aligns the three PMFs on a shared category index.
    # Plotting each Series separately positions bars by their rank inside that
    # Series, so a category absent from one class shifts every later bar and
    # the tick labels no longer match the bars. A category that never occurs
    # in a class has probability 0 there.
    pmf_table = pd.DataFrame({
        'Original PMF': df[column].value_counts(normalize=True),
        'PMF (Normal)': df.loc[is_normal, column].value_counts(normalize=True),
        'PMF (Anomaly)': df.loc[is_anomaly, column].value_counts(normalize=True),
    }).fillna(0.0).sort_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    pmf_table.plot(kind='bar', ax=ax, color=['blue', 'green', 'red'], alpha=0.6, width=0.8)

    ax.set_title(f'PMF of {column} Given Class')
    ax.set_xlabel('Categories')
    ax.set_ylabel('Probability')
    ax.legend()
    plt.xticks(rotation=45)
    ax.grid(True)
    plt.show()

# NOTE(review): `column` is not assigned in this cell; it holds whatever value
# the most recently executed `for column in ...` loop left behind.
plot_pmf_given_class_discrete(column, 'class')


In [None]:
#plot original and conditional PDF for continuous data
import seaborn as sns

def plot_pdf_given_class_continuous(column, class_column):
    """Overlay density histograms of ``df[column]`` overall and per class.

    Three samples are drawn on the same axes, each with a KDE curve: all
    rows, rows whose ``class_column`` equals ``'normal'``, and rows whose
    ``class_column`` equals ``'anomaly'``. NaN entries are dropped from each
    sample first.

    Parameters
    ----------
    column : str
        Name of the column in the notebook-level ``df`` to plot.
    class_column : str
        Name of the column holding the ``'normal'`` / ``'anomaly'`` labels.
    """
    overall = df[column].dropna()
    normal_only = df.loc[df[class_column] == 'normal', column].dropna()
    anomaly_only = df.loc[df[class_column] == 'anomaly', column].dropna()

    # (sample, colour, legend label), drawn in this order.
    layers = (
        (overall, 'blue', 'Original PDF'),
        (normal_only, 'green', 'PDF (Normal)'),
        (anomaly_only, 'red', 'PDF (Anomaly)'),
    )

    plt.figure(figsize=(10, 6))
    for sample, colour, legend_label in layers:
        sns.histplot(sample, bins=30, color=colour, kde=True, stat="density",
                     label=legend_label, alpha=0.6)

    plt.xlabel('Values')
    plt.ylabel('Density')
    plt.title(f'PDF of {column} Given Class')
    plt.legend()
    plt.grid(True)
    plt.show()

# NOTE(review): `column` is not assigned in this cell; it holds whatever value
# the most recently executed `for column in ...` loop left behind.
plot_pdf_given_class_continuous(column, 'class')


In [None]:
def scatter_plot(x_column, y_column):
    """Scatter ``df[y_column]`` against ``df[x_column]``.

    Parameters
    ----------
    x_column : str
        Column of the notebook-level ``df`` shown on the horizontal axis.
    y_column : str
        Column of the notebook-level ``df`` shown on the vertical axis.
    """
    xs, ys = df[x_column], df[y_column]

    plt.figure(figsize=(16, 14))
    plt.scatter(xs, ys, color='blue', alpha=0.6)
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title(f'Scatter Plot between {x_column} and {y_column}')
    plt.grid(True)
    plt.show()
    
# Compare the two destination-host error-rate columns.
scatter_plot(
    x_column='dst_host_serror_rate',
    y_column='dst_host_srv_serror_rate',
)

In [None]:
def joint_pmf_discrete(field1, field2):
    """Print and plot the joint PMF of two columns of ``df``.

    A contingency table of co-occurrence counts is built with
    ``pd.crosstab`` and divided by its grand total so that the entries are
    joint relative frequencies. The table is printed and then shown as an
    annotated heatmap (rows: ``field1`` values, columns: ``field2`` values).

    Parameters
    ----------
    field1, field2 : str
        Names of columns in the notebook-level ``df``.
    """
    counts = pd.crosstab(df[field1], df[field2])

    # Divide every cell by the total number of counted pairs.
    grand_total = counts.to_numpy().sum()
    joint_pmf = counts.div(grand_total)

    print(f'Joint PMF between {field1} and {field2}:', joint_pmf, sep='\n')

    plt.figure(figsize=(8, 6))
    sns.heatmap(joint_pmf, annot=True, cmap='Blues', cbar=True)
    plt.xlabel(field2)
    plt.ylabel(field1)
    plt.title(f'Joint PMF of {field1} and {field2}')
    plt.show()

# Joint distribution of the `count` and `srv_count` columns.
joint_pmf_discrete(field1='count', field2='srv_count')

In [None]:
def joint_pdf_conditioned_continuous(field1, field2, class_column, class_value):
    """Show a 2-D KDE of two columns, unconditioned and for one class.

    The left panel uses every row of ``df``; the right panel uses only the
    rows where ``df[class_column] == class_value``.

    Parameters
    ----------
    field1, field2 : str
        Columns of the notebook-level ``df`` placed on the x and y axes.
    class_column : str
        Column used to filter rows for the right-hand panel.
    class_value :
        Value of ``class_column`` that selects the conditioned rows.
    """
    df_conditioned = df[df[class_column] == class_value]

    # (rows to plot, colour map, panel title), left to right.
    panels = (
        (df, "Blues", f'Joint PDF of {field1} and {field2} (Unconditioned)'),
        (df_conditioned, "Reds", f'Joint PDF of {field1} and {field2} conditioned on {class_value}'),
    )

    plt.figure(figsize=(12, 5))
    for panel_no, (rows, colour_map, panel_title) in enumerate(panels, start=1):
        plt.subplot(1, 2, panel_no)
        sns.kdeplot(x=rows[field1], y=rows[field2], cmap=colour_map, fill=True, thresh=0.05)
        plt.title(panel_title)
        plt.xlabel(field1)
        plt.ylabel(field2)

    plt.show()

# Joint density of the two service-rate columns, overall vs. anomaly rows only.
joint_pdf_conditioned_continuous(
    field1='same_srv_rate',
    field2='diff_srv_rate',
    class_column='class',
    class_value='anomaly',
)

In [None]:
# Pairwise correlation between the numeric columns of `df`.
# Correlation is undefined for text columns: on pandas >= 2.0 a bare
# `df.corr()` raises ValueError when it meets a string column, so restrict the
# frame to numeric dtypes first (this also works on older pandas versions).
correlation_matrix = df.select_dtypes(include='number').corr()

print(correlation_matrix)


plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Columns to compare. The previous values 'field1' / 'field2' were unfilled
# template placeholders and raise KeyError unless such columns exist; use the
# same pair that the scatter-plot cell of this notebook examines.
# NOTE(review): swap in any other pair of numeric columns as needed.
field1 = 'dst_host_serror_rate'
field2 = 'dst_host_srv_serror_rate'

# Pearson correlation between the two fields (pandas' default method).
correlation = df[field1].corr(df[field2])
print(f'Pearson Correlation between {field1} and {field2}: {correlation}')

# Plot a scatter plot with a regression line to visualize the relationship
sns.jointplot(x=field1, y=field2, data=df, kind='reg', height=6)
plt.show()

In [None]:
from scipy.stats import chi2_contingency

def chi_squared_test(field, class_column):
    """Return the p-value of a chi-squared test of independence.

    A contingency table of ``df[field]`` against ``df[class_column]`` is
    built with ``pd.crosstab`` and passed to
    ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    field : str
        Column of the notebook-level ``df`` to test.
    class_column : str
        Column of the notebook-level ``df`` holding the class labels.

    Returns
    -------
    float
        The p-value reported by ``chi2_contingency`` (the second element of
        its result); the statistic, degrees of freedom and expected
        frequencies are discarded.
    """
    observed = pd.crosstab(df[field], df[class_column])
    return chi2_contingency(observed)[1]


# NOTE(review): `column` is not assigned in this cell; it holds whatever value
# the most recently executed `for column in ...` loop left behind.
p_value = chi_squared_test(column, 'class')
# Report the field that was actually tested (the message previously printed
# the literal placeholder "field_name").
print(f'P-value for Chi-squared test between {column} and attack type: {p_value}')


# Significance level for the test.
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: There is a dependency between the field and the attack type.")
else:
    # Failing to reject the null does not show independence, only that the
    # data give no significant evidence against it.
    print("Fail to reject the null hypothesis: There is no significant evidence of a dependency between the field and the attack type.")

In [None]:
# Map 'anomaly' to 1 and 'normal' to 0 (any other label becomes NaN).
# This adds a column to `df` in place.
df['attack_type_numeric'] = df['class'].map({'normal': 0, 'anomaly': 1})

# Correlation of every numeric column with the encoded class variable.
# `df` still contains text columns (at least 'class'), and on pandas >= 2.0 a
# bare `df.corr()` raises ValueError on them, so keep numeric dtypes only.
# The target's correlation with itself is dropped.
correlation_with_attack_type = (
    df.select_dtypes(include='number')
    .corr()['attack_type_numeric']
    .drop('attack_type_numeric')
)
print(correlation_with_attack_type)

# Visualize the correlations
plt.figure(figsize=(10, 6))
correlation_with_attack_type.plot(kind='bar', color='skyblue')
plt.title('Correlation with Attack Type')
plt.ylabel('Correlation Coefficient')
plt.xlabel('Fields')
plt.axhline(0, color='gray', linewidth=0.8, linestyle='--')
plt.show()
