In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from typing import List
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/home/laptop-kl-11/personal_project/cyber_security_classification/dataset/train_subset.csv')
df = df[df['IncidentGrade'].notna()]

  df = pd.read_csv('/home/laptop-kl-11/personal_project/cyber_security_classification/dataset/train_subset.csv')


In [3]:
df = df.drop(['IncidentId','ActionGrouped','ActionGranular','MitreTechniques','EmailClusterId','LastVerdict',
                                                'SuspicionLevel','AntispamDirection','Roles','ResourceType','ThreatFamily','RegistryValueName','RegistryKey',
                                                'RegistryValueData','OAuthApplicationId','ResourceIdName'],axis = 1)

In [3]:
#preprocessing price feature
def preprocess_target_column(df: pd.DataFrame): 
        df = df.dropna(subset=['IncidentGrade'])
        df.loc[:,'IncidentGrade'] = df['IncidentGrade'].map({'BenignPositive':0,'TruePositive':1, 'FalsePositive':2})
        df.loc[:,'IncidentGrade'] = df['IncidentGrade'].astype(int)
        return df
df = preprocess_target_column(df)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('IncidentGrade',axis=1), df['IncidentGrade'], test_size=0.17, random_state=42,stratify=df['IncidentGrade'],shuffle= True)
del df

In [4]:
class RemoveUnwantedColsAndRows(BaseEstimator,TransformerMixin):
    def __init__(self, unwanted_columns: List[str]):
        self.unwanted_columns = unwanted_columns

    def fit(self,X):
        self.column = X.columns
        return self

    def transform(self,X):
        x_transformed = X.copy()
        x_transformed.drop(self.unwanted_columns,axis=1,inplace = True,errors = 'raise')
        return x_transformed.reset_index(drop=True)


class StandardizeColumnNames(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X):
        return self
    
    def transform(self, X): 
        transformed_df = X.copy()
        transformed_df.columns = [col.lower().replace(' ','_') for col in transformed_df.columns]
        return transformed_df
    

class one_to_one_target_encoding(BaseEstimator, TransformerMixin): 
    def __init__(self):
        pass

    def fit(self, X):
        self.id_lookup_table = X.groupby('Id')['IncidentGrade'].agg(lambda x: x.mode()[0])
        self.alert_id_lookup_table = X.groupby('AlertId')['IncidentGrade'].agg(lambda x: x.mode()[0])
        return self

    def transform(self, X): 
        transformed_df =  X.copy()
        transformed_df.loc[:,'Id'] = transformed_df['Id'].apply(lambda x : self.id_lookup_table.loc[x] if x in self.id_lookup_table.index else -1)
        transformed_df.loc[:, 'AlertId'] = transformed_df['AlertId'].apply(lambda x : self.alert_id_lookup_table.loc[x] if x in self.alert_id_lookup_table.index else -1)
        return transformed_df

class CreateDataFeatures(BaseEstimator, TransformerMixin): 
    def __init__(self):
        pass

    def fit(self, X): 
        return self
    
    def transform(self,X):
        transformed_df = X.copy()
        transformed_df['month'] = pd.to_datetime(transformed_df['Timestamp']).dt.month
        transformed_df['hour'] = pd.to_datetime(transformed_df['Timestamp']).dt.hour
        transformed_df['minute'] = pd.to_datetime(transformed_df['Timestamp']).dt.minute
        transformed_df = transformed_df.drop('Timestamp', axis = 1)
        return transformed_df
        


class CountEncoder(BaseEstimator, TransformerMixin): 
    def __init__(self, feature_thres_map): 
        self.feature_thres_map = feature_thres_map


    def fit(self, X):
        self.lookup_tables = {} 
        for feature,thres in self.feature_thres_map.items():
            val_counts = X[[feature]].value_counts()
            thres = val_counts.iloc[0] * 0.036
            self.lookup_tables[feature]  = val_counts[val_counts>thres]
        return self

    def transform(self,X): 
        transformed_x = X.copy()
        for feature in self.feature_thres_map.keys():
            feature_lookup = self.lookup_tables[feature]
            transformed_x.loc[:, feature] = transformed_x[feature].apply(lambda x : feature_lookup[x] if x in feature_lookup else -1)
        return transformed_x
    



class CustomOHE(BaseEstimator, TransformerMixin):
    def __init__(self,feature_name_thresh_map):
        self.feature_name_thresh_map = feature_name_thresh_map

    def fit(self, X):
        self.learned_obj = {}
        for feature,thresh in self.feature_name_thresh_map.items(): 
            value_counts = X[feature].value_counts()
            feature_thres_value = value_counts.iloc[0]*thresh
            #important values
            ohe = OneHotEncoder(sparse_output=False,dtype = int)
            categories = list(value_counts[value_counts > feature_thres_value].index) + ['Other']
            ohe.fit(pd.Series(categories).values.reshape(-1,1))
            self.learned_obj[feature] = [categories,ohe]
        return self

    def transform(self, X):
        transformed_df = X.copy()
        for feature in self.feature_name_thresh_map.keys(): 
            learned_categories = self.learned_obj[feature][0]
            learned_ohe = self.learned_obj[feature][1]
            transformed_df[feature] = transformed_df[feature].apply(lambda category : category if category in learned_categories else 'Other')
            ohe_value = learned_ohe.transform(transformed_df[feature].values.reshape(-1,1))
            category_df = pd.DataFrame(ohe_value, columns=learned_ohe.get_feature_names_out())
            transformed_df = pd.concat([transformed_df,category_df], axis= 1)
            transformed_df = transformed_df.drop(feature,axis = 1)

        return transformed_df
    

class CustomLabelEncoder(BaseEstimator, TransformerMixin): 
    def __init__(self):
        pass

    def fit(self,X):
        return self
    
    def transform(self,X):
        evidencerole_map = {'Related': 0 , 'Impacted':1}
        transformed_df = X.copy()
        transformed_df['EvidenceRole'] = transformed_df['EvidenceRole'].apply(lambda x : evidencerole_map.get(x,2))
        return transformed_df
    


class SkewedDataCountEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def assign_counts(self,val,value_count, skewed_data_point): 
        if val == skewed_data_point:
            return -2
        elif val in value_count.index:
            return value_count[val]
        else:
            return -1
        
    def fit(self, X):
        self.learned_parameters = {}
        for feature in self.feature_names:
            feature_value_counts = X[feature].value_counts()
            skewed_data_point = feature_value_counts.iloc[0]
            thresh = feature_value_counts.iloc[1]*0.1
            filtered_value_counts = feature_value_counts.iloc[1:][feature_value_counts.iloc[1:]> thresh]
            self.learned_parameters[feature] = [filtered_value_counts, skewed_data_point]

        return self

    def transform(self, X): 
        transformed_df = X.copy()
        for feature in self.feature_names:
                skewed_data_point = self.learned_parameters[feature][1]
                feature_value_count = self.learned_parameters[feature][0]
                transformed_df[feature] = transformed_df[feature].apply(lambda x: self.assign_counts(x, feature_value_count, skewed_data_point))
        return transformed_df
    

class CustomNormalization(BaseEstimator, TransformerMixin):
    def __init__(self,column_names):
        self.column_names = column_names

    def fit(self,self2, X):
        print(type(self))
        print('dfdfd')
        print(type(self2))
        print('dfdd')
        print(type(X))
        self.learned_ss = {}
        for col in self.column_names:
                self.learned_ss[col]= MinMaxScaler()
                self.learned_ss[col].fit(X[X[col]>0][[col]])
        return self
    
    def transform(self,X):
        transformed_x = X.copy()
        for col in self.column_names:
            transformed_x[col] = transformed_x[col].astype(float)
            transformed_x.loc[transformed_x[col] > 0, col] = self.learned_ss[col].transform(transformed_x[transformed_x[col] > 0][[col]])

        return transformed_x


In [5]:
numerical_pipeline = Pipeline([
            ('remove_unwanted_col_row', RemoveUnwantedColsAndRows(
                            unwanted_columns = ['IncidentId','ActionGrouped','ActionGranular','MitreTechniques','EmailClusterId','LastVerdict',
                                                'SuspicionLevel','AntispamDirection','Roles','ResourceType','ThreatFamily','RegistryValueName','RegistryKey',
                                                'RegistryValueData','OAuthApplicationId','ResourceIdName'])),
            # ('standardize_column_names', StandardizeColumnNames()),
            ('create_date_features', CreateDataFeatures()),
            ('skewed_data_count_encoder',SkewedDataCountEncoder(['DeviceId', 'Sha256', 'Url', 'IpAddress', 'AccountSid', 'AccountUpn','AccountObjectId','AccountName','DeviceName','NetworkMessageId', 
                                    'ApplicationId', 'ApplicationName', 'FileName', 'FolderPath', 'OSVersion', 'CountryCode', 'State', 'City']))
                    ])


categorical_pipeline = Pipeline([
    ('encoding_id_alert_id',one_to_one_target_encoding()),
    ('count_encoding', CountEncoder({'DetectorId': 0.036, 'OrgId': 0.005,'AlertTitle': 0.05})),
    ('custom_ohe', CustomOHE({'Category': 0.05, 'EntityType': 0.01})),
    ('label_encoder', CustomLabelEncoder()),
    
])

In [6]:
final_pipeline = Pipeline([
    ('numerical_pipeline', numerical_pipeline),
    ('categorical_pipeline', categorical_pipeline),
    ('normalization', CustomNormalization(['DeviceId', 'Sha256', 'Url', 'IpAddress', 'AccountSid', 'AccountUpn','AccountObjectId','AccountName','DeviceName','NetworkMessageId', 
                                    'ApplicationId', 'ApplicationName', 'FileName', 'FolderPath', 'OSVersion', 'CountryCode', 'State', 'City','AlertTitle','DetectorId','OrgId']))
])

In [8]:
final_pipeline.fit(df)

TypeError: CustomNormalization.fit() takes 2 positional arguments but 3 were given

In [10]:
cn = CustomNormalization(['DeviceId', 'Sha256', 'Url', 'IpAddress', 'AccountSid', 'AccountUpn','AccountObjectId','AccountName','DeviceName','NetworkMessageId', 
                                    'ApplicationId', 'ApplicationName', 'FileName', 'FolderPath', 'OSVersion', 'CountryCode', 'State', 'City','AlertTitle','DetectorId','OrgId'])
cn.fit(df)