### Workflow: Do not run the code here; it is only for understanding

    1-us_visa > Constants > __init__.py
    2-us_visa > Entity > config_entity.py > artifact_entity.py
    3-us_visa > Configuration > create the file “mongo_db_connection.py”
    4-Create the folder under us_visa “data_access”> create “__init__.py” > create file “usvisa_data.py”
    5-Update the us_visa > components > data_ingestion.py
    6-Update the us_visa > pipeline > training_pipeline.py
    7-Update the demo.py



### constants -> __init__.py 


In [None]:

### constants -> __init__.py

import os
from datetime import date

# MongoDB settings.
DATABASE_NAME: str = "US_VISA"
COLLECTION_NAME: str = "visa_data"
# Name of the environment variable that holds the MongoDB connection URL;
# the URL itself is read from the environment, never stored here.
MONGODB_URL_KEY: str = "MONGO_DB_URL"

# Pipeline identity and the root directory for generated artifacts.
PIPELINE_NAME: str = "usvisa"
ARTIFACT_DIR: str = "artifact"

# File names of the split datasets.
TRAIN_FILE_NAME: str = "train.csv"
TEST_FILE_NAME: str = "test.csv"

# File name of the raw export.
FILE_NAME: str = "usvisa.csv"

MODEL_FILE_NAME: str = "model.pkl"

# Data-ingestion constants: every name starts with DATA_INGESTION.
# The ingestion collection reuses COLLECTION_NAME so the two cannot drift apart.
DATA_INGESTION_COLLECTION_NAME: str = COLLECTION_NAME
DATA_INGESTION_DIR_NAME: str = "data_ingestion"
DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
DATA_INGESTION_INGESTED_DIR: str = "ingested"
# Fraction of the rows held out as the test set.
DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO: float = 0.2


### 2-	us_visa > Entity > config_entity.py > artifact_entity.py

In [1]:
### us_visa > Entity > config_entity.py
import os
from us_visa.constants import *
from dataclasses import dataclass
from datetime import datetime


# Evaluated once, when this module is imported; everything below that uses
# TIMESTAMP therefore sees the same value for the life of the process.
TIMESTAMP: str = f"{datetime.now():%m_%d_%Y_%H_%M_%S}"


# Training pipeline
@dataclass
class TrainingPipelineConfig:
    """Run-wide settings: pipeline name, artifact directory and timestamp."""

    pipeline_name: str = PIPELINE_NAME
    # Artifacts live under <ARTIFACT_DIR>/<TIMESTAMP>.
    artifact_dir: str = os.path.join(ARTIFACT_DIR, TIMESTAMP)
    timestamp: str = TIMESTAMP


# Module-level instance built from the defaults above.
training_pipeline_config: TrainingPipelineConfig = TrainingPipelineConfig()


# Data ingestion
@dataclass
class DataIngestionConfig:
    """Paths and parameters used by the data-ingestion stage.

    All defaults are computed once, while the class body executes. The three
    file-path defaults are derived from the *default* ``data_ingestion_dir``;
    passing a different ``data_ingestion_dir`` to the constructor does not
    recompute them.
    """

    # <artifact_dir>/<DATA_INGESTION_DIR_NAME>
    data_ingestion_dir: str = os.path.join(
        training_pipeline_config.artifact_dir,
        DATA_INGESTION_DIR_NAME,
    )
    # Raw export of the collection.
    feature_store_file_path: str = os.path.join(
        data_ingestion_dir,
        DATA_INGESTION_FEATURE_STORE_DIR,
        FILE_NAME,
    )
    # Train / test CSVs produced by the split.
    training_file_path: str = os.path.join(
        data_ingestion_dir,
        DATA_INGESTION_INGESTED_DIR,
        TRAIN_FILE_NAME,
    )
    testing_file_path: str = os.path.join(
        data_ingestion_dir,
        DATA_INGESTION_INGESTED_DIR,
        TEST_FILE_NAME,
    )
    train_test_split_ratio: float = DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO
    collection_name: str = DATA_INGESTION_COLLECTION_NAME


In [2]:
### us_visa > Entity > artifact_entity.py

from dataclasses import dataclass

# Output of the data-ingestion stage
@dataclass
class DataIngestionArtifact:
    """Locations of the train and test files written by data ingestion."""

    trained_file_path: str
    test_file_path: str
    

### 3-	us_visa > Configuration > create the file “mongo_db_connection.py”

In [4]:
### us_visa > Configuration > create the file “mongo_db_connection.py”
import sys
import os
from us_visa.exception import USvisaException
from us_visa.logger import logging
from us_visa.constants import DATABASE_NAME, MONGODB_URL_KEY
import pymongo
import certifi

# Path to certifi's CA bundle; handed to pymongo as ``tlsCAFile``.
ca = certifi.where()


class MongoDBClient:
    """Handle to one MongoDB database, backed by a client shared class-wide.

    The ``pymongo.MongoClient`` is created the first time this class is
    instantiated and cached on the class; later instances reuse it.

    Attributes:
        client: The shared ``pymongo.MongoClient``.
        database: ``client[database_name]``.
        database_name: Name of the selected database.
    """

    # Shared client; stays None until the first successful instantiation.
    client = None

    def __init__(self, database_name=DATABASE_NAME) -> None:
        """Connect (if not already connected) and select ``database_name``.

        Args:
            database_name: Database to select on the shared client.

        Raises:
            USvisaException: If the environment variable named by
                ``MONGODB_URL_KEY`` is unset or empty, or if creating the
                client or selecting the database fails.
        """
        try:
            if MongoDBClient.client is None:
                mongo_db_url = os.getenv(MONGODB_URL_KEY)
                # An empty value is as unusable as a missing one, so reject
                # both here with a clear message.
                if not mongo_db_url:
                    raise Exception(f"Environment key : {MONGODB_URL_KEY} is not set")
                MongoDBClient.client = pymongo.MongoClient(mongo_db_url, tlsCAFile=ca)

            self.client = MongoDBClient.client
            self.database = self.client[database_name]
            self.database_name = database_name
            logging.info("MongoDB connection Successfully established")

        except Exception as e:
            # Chain explicitly so the original error is kept as __cause__.
            raise USvisaException(e, sys) from e
        


### 4-	Create the folder under us_visa “data_access”> create “__init__.py” > create file "usvisa_data.py"

In [5]:
from us_visa.configuration.mongo_db_connection import MongoDBClient
# from us_visa.constants import DATABASE_NAME
from us_visa.constants import *
from us_visa.exception import USvisaException
import pandas as pd
import sys
from typing import Optional
import numpy as np

class USVisaData:
    """Export MongoDB collections as pandas DataFrames."""

    def __init__(self):
        """Create the ``MongoDBClient`` for ``DATABASE_NAME``.

        Raises:
            USvisaException: If the client cannot be created.
        """
        try:
            self.mongo_client = MongoDBClient(database_name=DATABASE_NAME)

        except Exception as e:
            raise USvisaException(e, sys) from e

    def export_collection_as_dataframe(self, collection_name: str, database_name: Optional[str] = None) -> pd.DataFrame:
        """Load every document of a collection into a DataFrame.

        Args:
            collection_name: Collection to read.
            database_name: Database to read from. When ``None``, the database
                already selected on ``self.mongo_client`` is used.

        Returns:
            A DataFrame with one row per document. The ``_id`` column is
            dropped when present, and the string ``"na"`` is replaced by NaN.

        Raises:
            USvisaException: Wraps any error raised while reading or converting.
        """
        try:
            if database_name is None:
                collection = self.mongo_client.database[collection_name]
            else:
                # The MongoDBClient wrapper is not subscriptable; index the
                # underlying pymongo client it exposes as ``.client``.
                collection = self.mongo_client.client[database_name][collection_name]

            df = pd.DataFrame(list(collection.find()))
            if "_id" in df.columns:
                df = df.drop(columns=["_id"])
            return df.replace({"na": np.nan})

        except Exception as e:
            raise USvisaException(e, sys) from e
        


### 5-	Update the Us_visa > components > data_ingestion.py

In [6]:
import os
import sys
from pandas import DataFrame
from sklearn.model_selection import train_test_split

from us_visa.entity.config_entity import DataIngestionConfig
from us_visa.entity.artifact_entity import DataIngestionArtifact
from us_visa.exception import USvisaException
from us_visa.logger import logging
from us_visa.data_access.usvisa_data import USVisaData


class DataIngestion:
    """Export the dataset from MongoDB and write train/test CSV files."""

    def __init__(self, data_ingetion_config: "DataIngestionConfig | None" = None):
        """Store the ingestion settings.

        Args:
            data_ingetion_config: Ingestion settings. The parameter keeps its
                original spelling because callers pass it by keyword. When
                omitted, a new ``DataIngestionConfig()`` is created.

        Raises:
            USvisaException: If the default configuration cannot be built.
        """
        try:
            # Build the default per call rather than in the signature, so
            # instances never share (and cannot mutate) one config object
            # created at definition time.
            if data_ingetion_config is None:
                data_ingetion_config = DataIngestionConfig()
            self.data_ingestion_config = data_ingetion_config
        except Exception as e:
            raise USvisaException(e, sys) from e

    def export_data_into_feature_store(self) -> DataFrame:
        """Read the configured collection and save it as the feature-store CSV.

        Returns:
            The exported DataFrame.

        Raises:
            USvisaException: Wraps any error raised while reading or writing.
        """
        try:
            logging.info("Exporting data from MongoDB into feature store")
            usvisa_data = USVisaData()
            dataframe = usvisa_data.export_collection_as_dataframe(
                collection_name=self.data_ingestion_config.collection_name
            )
            logging.info(f"shape of dataframe: {dataframe.shape}")

            feature_store_file_path = self.data_ingestion_config.feature_store_file_path
            # Create the target directory first; to_csv does not create it.
            os.makedirs(os.path.dirname(feature_store_file_path), exist_ok=True)
            logging.info(f"Saving Exported Data into feature store file path: {feature_store_file_path}")
            dataframe.to_csv(feature_store_file_path, index=False, header=True)
            return dataframe

        except Exception as e:
            raise USvisaException(e, sys) from e

    def split_data_as_train_test(self, dataframe: DataFrame) -> None:
        """Split ``dataframe`` and write the train and test CSV files.

        Args:
            dataframe: Data to split; must not be empty.

        Raises:
            USvisaException: If ``dataframe`` is empty (wrapping a
                ``ValueError``) or if splitting or writing fails.
        """
        logging.info("Entered split_data_as_train_test method of Data_Ingestion class")
        try:
            # Check if the dataframe is empty
            if dataframe.empty:
                raise ValueError("The dataframe is empty. Please check the data source.")

            train_set, test_set = train_test_split(
                dataframe, test_size=self.data_ingestion_config.train_test_split_ratio
            )
            logging.info("Performed train test split on the dataframe")

            dir_path = os.path.dirname(self.data_ingestion_config.training_file_path)
            os.makedirs(dir_path, exist_ok=True)

            logging.info("Exporting train and test data file path.")
            train_set.to_csv(self.data_ingestion_config.training_file_path, index=False, header=True)
            test_set.to_csv(self.data_ingestion_config.testing_file_path, index=False, header=True)
            logging.info("Exported train and test data files path")
            # Logged last so the message reflects that all work is done.
            logging.info("Exited split_data_as_train_test method of Data_Ingestion class")
        except Exception as e:
            raise USvisaException(e, sys) from e

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run export and split, then report where the files were written.

        Returns:
            A ``DataIngestionArtifact`` holding the train and test file paths.

        Raises:
            USvisaException: Wraps any error raised by either step.
        """
        logging.info("Entered initiate_data_ingestion method of Data_Ingestion class")

        try:
            dataframe = self.export_data_into_feature_store()
            logging.info("Got the data from from MongoDB")
            self.split_data_as_train_test(dataframe)
            logging.info("Performed the train and test split operation on the dataset.")

            data_ingestion_artifact = DataIngestionArtifact(
                trained_file_path=self.data_ingestion_config.training_file_path,
                test_file_path=self.data_ingestion_config.testing_file_path,
            )
            logging.info(f"Data Ingestion artifact: {data_ingestion_artifact}")
            # Logged last so the message reflects that all work is done.
            logging.info("Exited initiate_data_ingestion method of Data_Ingestion class")
            return data_ingestion_artifact
        except Exception as e:
            raise USvisaException(e, sys) from e

        


### 6-	Update the us_visa > pipeline > training_pipeline.py

In [8]:
###6-	Update the us_visa > pipeline > training_pipeline.py

import sys
from us_visa.exception import USvisaException
from us_visa.logger import logging
from us_visa.components.data_ingestion import DataIngestion

from us_visa.entity.config_entity import (
    DataIngestionConfig,
)

from us_visa.entity.artifact_entity import(
    DataIngestionArtifact,
)

class TrainPipeline:
    """Runs the training pipeline; the only stage here is data ingestion."""

    def __init__(self):
        """Create the configuration for the data-ingestion stage."""
        self.data_ingestion_config = DataIngestionConfig()

    # Data ingestion from MongoDB
    def start_data_ingestion(self) -> DataIngestionArtifact:
        """Run the data-ingestion component.

        Returns:
            The ``DataIngestionArtifact`` returned by
            ``DataIngestion.initiate_data_ingestion``.

        Raises:
            USvisaException: Wraps any error raised during ingestion.
        """
        try:
            logging.info("Entered the start_data_ingestion method of TrainPipeline class")
            logging.info("Getting the data from mongodb")
            # The keyword keeps DataIngestion's parameter spelling.
            data_ingestion = DataIngestion(data_ingetion_config=self.data_ingestion_config)
            data_ingestion_artifact = data_ingestion.initiate_data_ingestion()
            logging.info("Got the train_set and test_set from mongodb")
            logging.info("Exited the start_data_ingestion method of TrainPipeline class")
            return data_ingestion_artifact
        except Exception as e:
            raise USvisaException(e, sys) from e

    def run_pipeline(self) -> None:
        """Run every pipeline stage in order.

        Raises:
            USvisaException: Wraps any error raised by a stage.
        """
        try:
            # No later stage uses the artifact yet, so it is not kept.
            self.start_data_ingestion()

        except Exception as e:
            raise USvisaException(e, sys) from e

### 7-	Update the demo.py

In [10]:
###7-	Update the demo.py

# import os
# mongo_db_url = os.getenv('MONGO_DB_URL')
# print(mongo_db_url)

# NOTE(review): the package is spelled `pipline` here, while the workflow
# notes at the top of this document say `pipeline` - confirm the folder name.
from us_visa.pipline.training_pipeline import TrainPipeline

# Build the training pipeline and run it.
train_pipeline = TrainPipeline()
train_pipeline.run_pipeline()
