<a href="https://colab.research.google.com/github/birappankumar/Age-and-gender-predictor/blob/main/predii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# This cell contains the entire code for the model
# For step by step method you can see the below steps

import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import csv
# This will make sure whether we are using GPU or not as CPU were exausted and crashing again and again

torch.cuda.empty_cache()
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


HF_API_KEY = "hf_key"
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"


login(HF_API_KEY)

# Load and preprocess dataset
def load_and_preprocess_dataset(file_path, makes=['FORD', 'TOYOTA']):
    """
    Loads the dataset and filters for specific makes.
    Combines relevant summary fields.
    """
    df = pd.read_csv(file_path)

    # Filter for selected makes
    df = df[df['Make'].str.lower().isin([make.lower() for make in makes])]

    # Clean and combine summaries
    df[['Description', 'Consequence', 'Remedy']] = df[['Description', 'Consequence', 'Remedy']].fillna('').astype(str)
    df['combined_summary'] = df[['Description', 'Consequence', 'Remedy']].agg(' '.join, axis=1)
    return df

# Building an Embedding-Based Retrieval System
def embed_documents(documents, embedding_model):
    """Generates embeddings for all documents using the specified model."""
    embeddings = embedding_model.encode(documents, convert_to_tensor=True)
    return embeddings

def retrieve_relevant_documents(input_query, df, embedding_model, document_embeddings, top_k=5):
    """Retrieves top-k relevant documents based on the input query."""
    query_embedding = embedding_model.encode([input_query], convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
    top_results = torch.topk(similarities, k=top_k)

    retrieved_docs = []
    for score, idx in zip(top_results[0], top_results[1]):
        idx = int(idx.item())
        retrieved_docs.append((df.iloc[idx]['Campaign_Number'], df.iloc[idx]['combined_summary'], float(score)))

    return retrieved_docs

# Summarize Retrieved Documents Using LLaMA
def summarize_with_llama(documents, model_name=MODEL_NAME, max_length=600):
    """
    Summarizes the retrieved documents using the LLaMA model.

    Args:
        documents (list): List of text documents to summarize.
        model_name (str): Path to the LLaMA model directory or Hugging Face repo.
        max_length (int): Maximum length of the summary.

    Returns:
        str: Generated summary.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

    # Prepare input for summarization
    input_text = " ".join(documents)  # Combine all retrieved documents
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to("cuda")

    # Generate summary
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=max_length,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# Main Function
def summarization_agent(input_json, dataset_path, embedding_model_name="all-MiniLM-L6-v2", llama_model_name=MODEL_NAME):
    """
    Main function to handle input JSON and produce a summarized output.
    """
    input_data = json.loads(input_json)
    input_query = f"{input_data['make']} {input_data['model']} {input_data['year']} {input_data['issue']}"
    df = load_and_preprocess_dataset(dataset_path)

    # Embedding model
    embedding_model = SentenceTransformer(embedding_model_name)
    document_embeddings = embed_documents(df['combined_summary'].tolist(), embedding_model)

    # Retrieve and summarize
    retrieved_docs_with_scores = retrieve_relevant_documents(input_query, df, embedding_model, document_embeddings)
    retrieved_docs = [doc for _, doc, _ in retrieved_docs_with_scores]

    # Summarizing documents
    summary = summarize_with_llama(retrieved_docs, model_name=llama_model_name)

    # Returning result
    return {
        "retrieved_documents": retrieved_docs_with_scores,
        "summary": summary
    }

if __name__ == "__main__":
    input_json = json.dumps({
        "make": "Ford",
        "model": "Escape",
        "year": "2001",
        "issue": "stuck throttle risk"
    })

    dataset_path = "/content/FLAT_RCL.csv"

    result = summarization_agent(input_json, dataset_path, embedding_model_name="all-MiniLM-L6-v2", llama_model_name=MODEL_NAME)
    print("Retrieved Documents:", result["retrieved_documents"])
    print("Summary:", result["summary"])


GPU Available: True
GPU Name: Tesla T4


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Retrieved Documents: [('02V117000', ' ON CERTAIN PASSENGER VEHICLES BUILT WITH MANUAL TRANSAXLES, THERE IS THE POTENTIAL FOR THE SPEED CONTROL CABLE TO HANG UP AT THE THROTTLE BODY BRACKET DURING HIGH, WIDE OPEN THROTTLE OPERATION, PREVENTING THE THROTTLE FROM RETURNING TO THE CLOSED POSITION WHEN THE ACCELERATOR IS RELEASED. IF THIS CONDITION WERE TO OCCUR, IT MAY BE NECESSARY TO DEPRESS THE CLUTCH AND APPLY THE BRAKE TO BRING THE VEHICLE TO A STOP.', 0.45075875520706177), ('02V266000', ' CERTAIN PASSENGER VEHICLES EQUIPPED WITH ADJUSTABLE PADELS ARE BEING RECALLED IN ORDER TO ADJUST THE BRAKE AND ACCELERATOR PEDALS TO A MINIMUM OF 50 MM OF LATERAL SEPARATION.  SIMULTANEOUS APPLICATION OF BOTH THE BRAKE AND ACCELERATOR PEDALS COULD RESULT IN AN INCREASE IN ENGINE RPM. THIS COULD RESULT IN A CUSTOMER EXPERIENCING AN "UNINTENDED VEHICLE SPEED INCREASE" OR A PERCEIVED "UNABLE TO STOP" CONDITION.', 0.3175155222415924), ('02V266000', ' CERTAIN PASSENGER VEHICLES EQUIPPED WITH ADJUSTABLE PA

In [1]:
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import csv

torch.cuda.empty_cache()
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

HF_API_KEY = "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"
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

login(HF_API_KEY)

GPU Available: False
GPU Name: None


In [2]:
file_path = "/content/FLAT_RCL.txt"
with open(file_path, "r", encoding="utf-8") as f:
        for _ in range(10):
            print(f.readline().strip())


1	02V288000	FORD	FOCUS	2000	02S41	ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES	FORD MOTOR COMPANY	19990719	20010531	V	291854	20030210	ODI	Ford Motor Company	20021106	20021106			CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC ENGINES, LOOSE OR BROKEN ATTACHMENTS AND MISROUTED BATTERY CABLES COULD LEAD TO CABLE INSULATION DAMAGE.	THIS, IN TURN, COULD CAUSE THE BATTERY CABLES TO SHORT RESULTING IN HEAT DAMAGE TO THE CABLES.  BESIDES HEAT DAMAGE, THE "CHECK ENGINE" LIGHT MAY ILLUMINATE, THE VEHICLE MAY FAIL TO START, OR SMOKE, MELTING, OR FIRE COULD ALSO OCCUR.	DEALERS WILL INSPECT THE BATTERY CABLES FOR THE CONDITION OF THE CABLE INSULATION AND PROPER TIGHTENING OF THE TERMINAL ENDS.  AS NECESSARY, CABLES WILL BE REROUTED, RETAINING CLIPS INSTALLED, AND DAMAGED BATTERY CABLES REPLACED.   OWNER NOTIFICATION BEGAN FEBRUARY 10, 2003.   OWNERS WHO DO NOT RECEIVE THE FREE REMEDY  WITHIN A REASONABLE TIME SHOULD CONTACT FORD AT 1-866-436-7332.	ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFETY ADM

In [3]:

input_file = '/content/FLAT_RCL.txt'
output_file = '/content/FLAT_RCL1.csv'

with open(input_file, 'r') as infile:
    lines = infile.readlines()

with open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)

    for line in lines:
        fields = line.strip().split('\t')
        writer.writerow(fields)

In [4]:

output_file_path = 'FLAT_RCL1.csv'
df_output = pd.read_csv(output_file_path)
df_output.head()

Unnamed: 0,1,02V288000,FORD,FOCUS,2000,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719,20010531,...,Ford Motor Company,20021106,20021106.1,Unnamed: 17,Unnamed: 18,"CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC ENGINES, LOOSE OR BROKEN ATTACHMENTS AND MISROUTED BATTERY CABLES COULD LEAD TO CABLE INSULATION DAMAGE.","THIS, IN TURN, COULD CAUSE THE BATTERY CABLES TO SHORT RESULTING IN HEAT DAMAGE TO THE CABLES. BESIDES HEAT DAMAGE, THE ""CHECK ENGINE"" LIGHT MAY ILLUMINATE, THE VEHICLE MAY FAIL TO START, OR SMOKE, MELTING, OR FIRE COULD ALSO OCCUR.","DEALERS WILL INSPECT THE BATTERY CABLES FOR THE CONDITION OF THE CABLE INSULATION AND PROPER TIGHTENING OF THE TERMINAL ENDS. AS NECESSARY, CABLES WILL BE REROUTED, RETAINING CLIPS INSTALLED, AND DAMAGED BATTERY CABLES REPLACED. OWNER NOTIFICATION BEGAN FEBRUARY 10, 2003. OWNERS WHO DO NOT RECEIVE THE FREE REMEDY WITHIN A REASONABLE TIME SHOULD CONTACT FORD AT 1-866-436-7332.",ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFETY ADMINISTRATION'S AUTO SAFETY HOTLINE AT 1-888-DASH-2-DOT (1-888-327-4236).,000015339000215021000000202
0,2,02V288000,FORD,FOCUS,2001,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719.0,20010531.0,...,Ford Motor Company,20021106.0,20021106.0,,,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFE...,000015339000215022000000202
1,3,02V236000,JAYCO,FT EAGLE 10 SG,2003,,EQUIPMENT:OTHER:LABELS,"JAYCO, INC.",20020730.0,20020813.0,...,"Jayco, Inc.",20020904.0,20020912.0,,,"ON CERTAIN FOLDING TENT CAMPERS, THE FEDERAL C...","IF THE TIRES WERE INFLATED TO 80 PSI, THEY COU...",OWNERS WILL BE MAILED CORRECT LABELS FOR INSTA...,"ALSO, CUSTOMERS CAN CONTACT THE NATIONAL HIGHW...",000015210000106403000000349
2,4,02V237000,HOLIDAY RAMBLER,ENDEAVOR,2000,,STRUCTURE,MONACO COACH CORP.,,,...,MONACO COACH CORPORATION,20020909.0,20020912.0,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000083965000000272
3,5,02V237000,HOLIDAY RAMBLER,ENDEAVOR,1999,,STRUCTURE,MONACO COACH CORP.,,,...,MONACO COACH CORPORATION,20020909.0,20020912.0,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000080938000000272
4,6,02V237000,HOLIDAY RAMBLER,VACATIONER,2000,,STRUCTURE,MONACO COACH CORP.,,,...,MONACO COACH CORPORATION,20020909.0,20020912.0,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000087576000000272


In [5]:
# Giving name to the column
columns = [
    "Record_ID", "Campaign_Number", "Make", "Model", "Year", "Recall_Number",
    "Component", "Manufacturer", "Start_Date", "End_Date", "Type",
    "Units_Affected", "Notification_Date", "Initiator", "Report_Manufacturer",
    "Defect_Report_Received", "Recall_Initiated", "Recall_Submitted",
    "Description", "Consequence", "Remedy", "Notes", "Contact_Info", "Other_Info"
]

csv_file = 'FLAT_RCL1.csv'
df = pd.read_csv(csv_file, header=None, names=columns)
df.to_csv('named_output.csv', index=False)
df.head()


Unnamed: 0,Record_ID,Campaign_Number,Make,Model,Year,Recall_Number,Component,Manufacturer,Start_Date,End_Date,...,Report_Manufacturer,Defect_Report_Received,Recall_Initiated,Recall_Submitted,Description,Consequence,Remedy,Notes,Contact_Info,Other_Info
0,1,02V288000,FORD,FOCUS,2000,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719.0,20010531.0,...,Ford Motor Company,20021106.0,20021106.0,,,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFE...,000015339000215021000000202
1,2,02V288000,FORD,FOCUS,2001,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719.0,20010531.0,...,Ford Motor Company,20021106.0,20021106.0,,,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFE...,000015339000215022000000202
2,3,02V236000,JAYCO,FT EAGLE 10 SG,2003,,EQUIPMENT:OTHER:LABELS,"JAYCO, INC.",20020730.0,20020813.0,...,"Jayco, Inc.",20020904.0,20020912.0,,,"ON CERTAIN FOLDING TENT CAMPERS, THE FEDERAL C...","IF THE TIRES WERE INFLATED TO 80 PSI, THEY COU...",OWNERS WILL BE MAILED CORRECT LABELS FOR INSTA...,"ALSO, CUSTOMERS CAN CONTACT THE NATIONAL HIGHW...",000015210000106403000000349
3,4,02V237000,HOLIDAY RAMBLER,ENDEAVOR,2000,,STRUCTURE,MONACO COACH CORP.,,,...,MONACO COACH CORPORATION,20020909.0,20020912.0,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000083965000000272
4,5,02V237000,HOLIDAY RAMBLER,ENDEAVOR,1999,,STRUCTURE,MONACO COACH CORP.,,,...,MONACO COACH CORPORATION,20020909.0,20020912.0,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000080938000000272


In [None]:
def load_and_preprocess_dataset(file_path, makes=['FORD', 'TOYOTA']):
    """
    Loads the dataset and filters for specific makes.
    Combines relevant summary fields.
    """
    df = pd.read_csv(file_path)

    df = df[df['Make'].str.lower().isin([make.lower() for make in makes])]

    df[['Description', 'Consequence', 'Remedy']] = df[['Description', 'Consequence', 'Remedy']].fillna('').astype(str)
    df['combined_summary'] = df[['Description', 'Consequence', 'Remedy']].agg(' '.join, axis=1)
    return df

In [None]:
# Build an Embedding-Based Retrieval System
def embed_documents(documents, embedding_model):
    """Generates embeddings for all documents using the specified model."""
    embeddings = embedding_model.encode(documents, convert_to_tensor=True)
    return embeddings

In [None]:
def retrieve_relevant_documents(input_query, df, embedding_model, document_embeddings, top_k=5):
    """Retrieves top-k relevant documents based on the input query."""

    query_embedding = embedding_model.encode([input_query], convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
    top_results = torch.topk(similarities, k=top_k)

    retrieved_docs = []
    for score, idx in zip(top_results[0], top_results[1]):
        idx = int(idx.item())
        retrieved_docs.append((df.iloc[idx]['Campaign_Number'], df.iloc[idx]['combined_summary'], float(score)))

    return retrieved_docs

In [None]:
# Summarize Retrieved Documents Using LLaMA
def summarize_with_llama(documents, model_name=MODEL_NAME, max_length=600):
    """
    Summarizes the retrieved documents using the LLaMA model.

    Args:
        documents (list): List of text documents to summarize.
        model_name (str): Path to the LLaMA model directory or Hugging Face repo.
        max_length (int): Maximum length of the summary.

    Returns:
        str: Generated summary.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

    # Prepare input for summarization
    input_text = " ".join(documents)  # Combine all retrieved documents
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to("cuda")

    # Generate summary
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=max_length,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [None]:
# Main Agent Function
def summarization_agent(input_json, dataset_path, embedding_model_name="all-MiniLM-L6-v2", llama_model_name=MODEL_NAME):
    """
    Main function to handle input JSON and produce a summarized output.
    """
    input_data = json.loads(input_json)
    input_query = f"{input_data['make']} {input_data['model']} {input_data['year']} {input_data['issue']}"
    df = load_and_preprocess_dataset(dataset_path)

    # Embedding model
    embedding_model = SentenceTransformer(embedding_model_name)
    document_embeddings = embed_documents(df['combined_summary'].tolist(), embedding_model)

    # Retrieve and summarize
    retrieved_docs_with_scores = retrieve_relevant_documents(input_query, df, embedding_model, document_embeddings)
    retrieved_docs = [doc for _, doc, _ in retrieved_docs_with_scores]

    # Summarize documents
    summary = summarize_with_llama(retrieved_docs, model_name=llama_model_name)

    # Returning result
    return {
        "retrieved_documents": retrieved_docs_with_scores,
        "summary": summary
    }


In [None]:
if __name__ == "__main__":
    input_json = json.dumps({
        "make": "Ford",
        "model": "Escape",
        "year": "2001",
        "issue": "stuck throttle risk"
    })

    dataset_path = "/content/FLAT_RCL.csv"

    try:
        result = summarization_agent(input_json, dataset_path, embedding_model_name="all-MiniLM-L6-v2", llama_model_name=MODEL_NAME)
        print("Retrieved Documents:", result["retrieved_documents"])
        print("Summary:", result["summary"])
    except Exception as e:
        print("An error occurred:", str(e))
