In [1]:
import os

In [2]:
# Move the working directory one level up from wherever the kernel currently is.
# NOTE(review): the path is relative, so re-running this cell moves up again;
# presumably this is meant to land on the project root -- confirm.
os.chdir('../')

## Training Pipeline Config

In [3]:
import numpy as np
import pandas as pd

In [4]:
from networksecurity.constant import traning_pipline


## Config Entity

In [5]:
from datetime import datetime

class TraningPiplineConfig:
    """Top-level pipeline settings read from the ``traning_pipline`` constants.

    Attributes:
        pipline_name: value of ``traning_pipline.PIPELINE_NAME``.
        artifact_name: value of ``traning_pipline.ARTIFACT_DIR``.
        artifact_dir: ``artifact_name`` passed through ``os.path.join``.
    """

    def __init__(self) -> None:
        constants = traning_pipline

        self.pipline_name = constants.PIPELINE_NAME
        self.artifact_name = constants.ARTIFACT_DIR
        # Single-component join: the artifact directory is the artifact name itself.
        self.artifact_dir = os.path.join(self.artifact_name)
      
class DataIngestionConfig:
    """Filesystem locations and split ratio used by the data-ingestion step.

    Every path is built from ``traning_pipline_config.artifact_dir`` joined
    with constants from the ``traning_pipline`` module.

    Attributes:
        data_ingestion_dir: artifact dir + ``DATA_INGESTION_DIR_NAME``.
        feature_store_file_path: artifact dir + ``DATA_INGESTION_FEATURE_STORE_DIR``
            + ``FILE_NAME`` (where the raw data is saved).
        traning_data_store_path: artifact dir + ``DATA_INGESTION__DIR``
            + ``TRAIN_FILE_NAME``.
        test_data_store_path: artifact dir + ``DATA_INGESTION__DIR``
            + ``TEST_FILE_NAME``.
        train_test_split_ratio: ``DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO``.
    """

    def __init__(self, traning_pipline_config: TraningPiplineConfig) -> None:
        artifact_root = traning_pipline_config.artifact_dir
        ingested_dir_name = traning_pipline.DATA_INGESTION__DIR

        # Data-ingestion directory inside the artifact root.
        self.data_ingestion_dir: str = os.path.join(
            artifact_root,
            traning_pipline.DATA_INGESTION_DIR_NAME,
        )

        # Raw data file: artifact root / feature-store dir / file name.
        self.feature_store_file_path: str = os.path.join(
            artifact_root,
            traning_pipline.DATA_INGESTION_FEATURE_STORE_DIR,
            traning_pipline.FILE_NAME,
        )

        # Train split file: artifact root / ingest dir / train file name.
        self.traning_data_store_path: str = os.path.join(
            artifact_root,
            ingested_dir_name,
            traning_pipline.TRAIN_FILE_NAME,
        )

        # Test split file: artifact root / ingest dir / test file name.
        self.test_data_store_path: str = os.path.join(
            artifact_root,
            ingested_dir_name,
            traning_pipline.TEST_FILE_NAME,
        )

        self.train_test_split_ratio: float = (
            traning_pipline.DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO
        )
         

## Components Output Artifacts Entity

In [6]:
from dataclasses import dataclass
@dataclass
class DataIngestionArtifact:
    """Result of the data-ingestion step: where the two split files live."""

    # Path of the file holding the training split.
    trained_file_path: str
    # Path of the file holding the test split.
    test_file_path: str

## Data Ingestion Component

In [7]:
from networksecurity.logging.logger import logging
from networksecurity.exception.exception import CustomException
from networksecurity.utils.utills import Data_read_from_db
import os
import sys
import numpy as np
import pandas as pd
from typing import List
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
load_dotenv()
from networksecurity.logging.logger import logging
from networksecurity.exception.exception import CustomException

In [8]:
class DataIngestion:
    """Reads records through ``Data_read_from_db`` and writes raw, train and test CSVs.

    The connection settings are taken from the environment variables ``url``,
    ``db`` and ``collection``; output locations and the split ratio come from
    the supplied ``DataIngestionConfig``.
    """

    def __init__(self, data_ingestion_config: DataIngestionConfig) -> None:
        """Store the config and read the connection settings from the environment.

        Args:
            data_ingestion_config: paths and split ratio for this step.
        """
        self.config = data_ingestion_config
        # os.getenv returns None for unset variables; no validation is done here.
        self.url = os.getenv('url')
        self.database = os.getenv('db')
        self.collection = os.getenv('collection')

    def export_data_to_feature_store(self, df: pd.DataFrame) -> pd.DataFrame:
        """Write ``df`` as CSV to ``config.feature_store_file_path``.

        The parent directory is created when missing. The index is not written.

        Args:
            df: the frame to persist.

        Returns:
            The same ``df`` that was passed in.

        Raises:
            CustomException: wrapping any error raised while writing.
        """
        try:
            feature_store_file_path: str = self.config.feature_store_file_path
            os.makedirs(os.path.dirname(feature_store_file_path), exist_ok=True)
            df.to_csv(feature_store_file_path, index=False, header=True)
            return df
        except Exception as e:
            logging.info(f'Error in export data {str(e)}')
            raise CustomException(e, sys)

    def split_data_into_train_test(self, df: pd.DataFrame) -> None:
        """Split ``df`` and write the two parts to the configured train/test paths.

        ``config.train_test_split_ratio`` is passed as ``test_size`` and the
        split uses a fixed ``random_state`` of 42.

        Args:
            df: the full data set to split.

        Raises:
            CustomException: wrapping any error raised while splitting or writing.
        """
        try:
            # train_test_split returns its outputs in (train, test) order;
            # unpacking them the other way round swaps the two files on disk.
            train_data, test_data = train_test_split(
                df, test_size=self.config.train_test_split_ratio, random_state=42
            )
            logging.info('Data split into training and testing sets successfully.')
            logging.info(
                f'Train split shape: {train_data.shape}, test split shape: {test_data.shape}'
            )

            train_path = self.config.traning_data_store_path
            test_path = self.config.test_data_store_path

            # index=False matches the feature-store export and keeps the shuffled
            # row labels from being written as an extra unnamed column.
            os.makedirs(os.path.dirname(train_path), exist_ok=True)
            train_data.to_csv(train_path, index=False, header=True)

            os.makedirs(os.path.dirname(test_path), exist_ok=True)
            test_data.to_csv(test_path, index=False, header=True)

            logging.info(f'Training and test data saved to {self.config.traning_data_store_path} and {self.config.test_data_store_path} respectively.')
        except Exception as e:
            logging.info(f'Error in train test split: {str(e)}')
            raise CustomException(e, sys)

    def initiate_data_ingestion(self):
        """Run the ingestion step: read, save raw data, split, and report paths.

        Returns:
            DataIngestionArtifact: holding the configured train and test file paths.

        Raises:
            CustomException: wrapping any error raised by the steps above.
        """
        try:
            df = Data_read_from_db(url=self.url, db=self.database, collection=self.collection)
            self.export_data_to_feature_store(df=df)
            self.split_data_into_train_test(df=df)
            data_ingestion_artifacts = DataIngestionArtifact(
                trained_file_path=self.config.traning_data_store_path,
                test_file_path=self.config.test_data_store_path,
            )
            return data_ingestion_artifacts
        except Exception as e:
            logging.info(f'Error in Data Ingestion: {str(e)}')
            raise CustomException(e, sys)
      

## Execute Pipeline

In [9]:
# from networksecurity.entity.artifact_entity import DataIngestionArtifact
# from networksecurity.entity.config_entity import DataIngestonConfig
# from networksecurity.logging.logger import logging
# from networksecurity.exception.exception import CustomException
# import sys

In [None]:
try:
    # Build the configs in dependency order: pipeline config -> ingestion config.
    traning_pipline_config = TraningPiplineConfig()
    data_ingestion_config = DataIngestionConfig(
        traning_pipline_config=traning_pipline_config
    )

    # Create the component and run it; the returned artifact is not kept here.
    data_ingestion = DataIngestion(data_ingestion_config=data_ingestion_config)
    data_ingestion.initiate_data_ingestion()
except Exception as e:
    # Surface any failure as the project's CustomException.
    raise CustomException(e, sys)

   having_IP_Address  URL_Length  Shortining_Service  having_At_Symbol  \
0                 -1           1                   1                 1   
1                  1           1                   1                 1   
2                  1           0                   1                 1   
3                  1           0                   1                 1   
4                  1           0                  -1                 1   

   double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  SSLfinal_State  \
0                        -1             -1                 -1              -1   
1                         1             -1                  0               1   
2                         1             -1                 -1              -1   
3                         1             -1                 -1              -1   
4                         1             -1                  1               1   

   Domain_registeration_length  Favicon  ...  popUpWidnow  Iframe  \

CustomException: Error occurred python script name [C:\Users\www58\AppData\Local\Temp\ipykernel_7872\1019428492.py] line number [5] error message [cannot unpack non-iterable DataIngestionArtifact object]

In [16]:
pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [24]:
import seaborn as sns
# Load seaborn's "tips" sample data and drop the categorical columns in one step,
# then split what remains into train and test parts.
df = sns.load_dataset('tips').select_dtypes(exclude='category')
train, test = train_test_split(df, test_size=0.26, random_state=15)

In [25]:
# Show the column dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 180 entries, 122 to 200
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  180 non-null    float64
 1   tip         180 non-null    float64
 2   size        180 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 5.6 KB


In [27]:
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test between the train and test splits.
# Both inputs are multi-column frames; the recorded output below shows one
# statistic and one p-value per column.
ks_result = ks_2samp(train, test)
ks_stat = ks_result.statistic
p_value = ks_result.pvalue
print("KS Test p-value:", p_value)
print(ks_stat)


KS Test p-value: [0.43641559 0.10972994 0.21902045]
[0.12256944 0.17118056 0.14895833]
