In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit every directory under /kaggle/input and print the full path of each file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get a full path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2012-2014/accidents_2012_to_2014.csv
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/model.safetensors.index.json
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/config.json
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/model-00001-of-00002.safetensors
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/model-00002-of-00002.safetensors
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/pytorch_model-00002-of-00002.bin
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/README.md
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/USE_POLICY.md
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/tokenizer.json
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/tokenizer_config.json
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/pytorch_model.bin.index.json
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/LICENSE.txt
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/pytorch_model-00001-of-00002.bin
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/special_tokens_map.json
/kaggle/input/llama-2/pytorch/7b-chat-hf/1/.gitattributes
/kaggle/input/llama-2/pytorch/7b-

In [2]:
# Install necessary packages
!pip install transformers torch scikit-learn




In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Local directory that holds the Llama-2 7B chat checkpoint files
model_path = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"

# Keyword arguments passed unchanged to both the model loader and the pipeline factory
load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

# Tokenizer and causal-LM weights are read from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)

# Wrap the loaded model and tokenizer in a text-generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, **load_kwargs)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [4]:
import pandas as pd
from datasets import Dataset

# Load dataset
# low_memory=False lets pandas infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids mixed-type columns and the warning pandas emits for them.
df = pd.read_csv('/kaggle/input/2012-2014/accidents_2012_to_2014.csv', low_memory=False)




  df = pd.read_csv('/kaggle/input/2012-2014/accidents_2012_to_2014.csv')


In [5]:
# Columns that are turned into text further down; everything else is dropped
desired_columns = [
    'Accident_Severity',
    'Number_of_Vehicles',
    'Number_of_Casualties',
    'Day_of_Week',
    'Road_Type',
    'Light_Conditions',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Urban_or_Rural_Area',
    'Year',
]

# Keep every row but only the columns listed above
df = df.loc[:, desired_columns]

# Turn each accident record into one line of text and wrap the result in a Dataset
def prepare_dataset(df):
    """Render every row of ``df`` as a single comma-separated description.

    Each row becomes ``"<Label>: <value>, <Label>: <value>, ..."`` in a fixed
    field order, stored in a one-column frame named ``text``.

    Args:
        df: DataFrame that contains the ten accident columns referenced below.

    Returns:
        The result of ``Dataset.from_pandas`` applied to the one-column
        ``text`` frame.
    """
    # (label shown in the text, column read from the row), in output order
    labelled_columns = (
        ("Accident Severity", "Accident_Severity"),
        ("Number of Vehicles", "Number_of_Vehicles"),
        ("Number of Casualties", "Number_of_Casualties"),
        ("Day of Week", "Day_of_Week"),
        ("Road Type", "Road_Type"),
        ("Light Conditions", "Light_Conditions"),
        ("Weather Conditions", "Weather_Conditions"),
        ("Road Surface Conditions", "Road_Surface_Conditions"),
        ("Urban or Rural Area", "Urban_or_Rural_Area"),
        ("Year", "Year"),
    )

    def describe_row(row):
        # One "Label: value" fragment per field, separated by ", "
        return ", ".join(f"{label}: {row[column]}" for label, column in labelled_columns)

    texts = df.apply(describe_row, axis=1)
    return Dataset.from_pandas(pd.DataFrame({'text': texts}))

# Convert the column-filtered frame into the one-column text dataset
dataset = prepare_dataset(df)

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

def train_model(dataset, model_name="meta-llama/Llama-2-7b-chat-hf", eval_dataset=None):
    """Fine-tune a causal language model on the ``text`` column of ``dataset``.

    Args:
        dataset: Training data; expected to be a ``datasets.Dataset`` exposing
            a ``text`` column (it is tokenized with ``dataset.map``).
        model_name: Model id or local checkpoint directory handed to
            ``from_pretrained``. Defaults to the id that used to be hard-coded,
            so existing ``train_model(dataset)`` calls behave as before.
        eval_dataset: Optional evaluation data with the same ``text`` column.
            Step-based evaluation is switched on only when this is provided,
            because the Trainer cannot evaluate without an evaluation set.

    Returns:
        tuple: ``(model, tokenizer)`` once ``trainer.train()`` has returned.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Set padding token if it's not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        tokens = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=512)
        # A causal LM only returns a loss when `labels` are supplied. Use the input
        # ids as targets and mark padded positions with -100 so the loss skips them.
        tokens["labels"] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
        return tokens

    # Assuming `dataset` is a Dataset object from the `datasets` library
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_eval = None
    if eval_dataset is not None:
        tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

    training_kwargs = dict(
        output_dir="./output",
        per_device_train_batch_size=2,
        num_train_epochs=3,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=10_000,
    )
    if tokenized_eval is not None:
        # Only request periodic evaluation when there is something to evaluate on.
        # NOTE(review): some transformers releases name this argument `eval_strategy`;
        # confirm against the installed version.
        training_kwargs.update(evaluation_strategy="steps", eval_steps=10_000)
    training_args = TrainingArguments(**training_kwargs)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
        eval_dataset=tokenized_eval,
    )

    trainer.train()
    return model, tokenizer

# Make sure dataset is defined and loaded correctly
# model, tokenizer = train_model(dataset)


In [7]:
def _generate_from_summary(df, tokenizer, instruction, generator=None, max_new_tokens=512):
    """Build a prompt from ``df.describe(include='all')`` plus ``instruction`` and generate text."""
    if generator is None:
        # Fall back to the notebook-level pipeline created when the model was loaded
        generator = text_generator

    # to_string() renders every column; the plain DataFrame repr may elide
    # columns of a wide table depending on pandas display options.
    prompt = f"""
    Based on the dataset containing the following information:
    {df.describe(include='all').to_string()}

    {instruction}
    """

    response = generator(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # Budget only the newly generated tokens; `max_length` also counts the
        # prompt, so a long summary table would leave little or no room to answer.
        max_new_tokens=max_new_tokens,
    )

    return response[0]['generated_text']


def generate_general_insights(df, model, tokenizer, generator=None, max_new_tokens=512):
    """Ask the language model for general insights about the accident data.

    Args:
        df: DataFrame whose ``describe(include='all')`` table is embedded in the prompt.
        model: Not used by this function; kept so existing calls keep working.
        tokenizer: Object providing ``eos_token_id`` for the generation call.
        generator: Optional callable with the text-generation pipeline calling
            convention. When omitted, the global ``text_generator`` is used.
        max_new_tokens: Upper bound on the number of tokens generated after the prompt.

    Returns:
        The ``generated_text`` field of the first returned sequence.
    """
    return _generate_from_summary(
        df,
        tokenizer,
        "Please provide general insights and recommendations related to traffic accidents.",
        generator=generator,
        max_new_tokens=max_new_tokens,
    )


def generate_cluster_advisory(df, model, tokenizer, generator=None, max_new_tokens=512):
    """Ask the language model for advisories aimed at reducing traffic accidents.

    Args:
        df: DataFrame whose ``describe(include='all')`` table is embedded in the prompt.
        model: Not used by this function; kept so existing calls keep working.
        tokenizer: Object providing ``eos_token_id`` for the generation call.
        generator: Optional callable with the text-generation pipeline calling
            convention. When omitted, the global ``text_generator`` is used.
        max_new_tokens: Upper bound on the number of tokens generated after the prompt.

    Returns:
        The ``generated_text`` field of the first returned sequence.
    """
    return _generate_from_summary(
        df,
        tokenizer,
        "Provide advisories in order to minimize the number of traffic accidents.",
        generator=generator,
        max_new_tokens=max_new_tokens,
    )

# Generate general insights from the summary statistics of df and show the generated text
insights = generate_general_insights(df, model, tokenizer)
print(insights)

# Generate accident-reduction advisories from the same frame and show the generated text
advisory = generate_cluster_advisory(df, model, tokenizer)
print(advisory)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



    Based on the dataset containing the following information:
            Accident_Severity  Number_of_Vehicles  Number_of_Casualties    Day_of_Week           Road_Type                Light_Conditions       Weather_Conditions Road_Surface_Conditions  Urban_or_Rural_Area           Year
count       464697.000000       464697.000000         464697.000000  464697.000000              464697                          464697                   464697                  463942        464697.000000  464697.000000
unique                NaN                 NaN                   NaN            NaN                   6                               5                        9                       5                  NaN            NaN
top                   NaN                 NaN                   NaN            NaN  Single carriageway  Daylight: Street light present  Fine without high winds                     Dry                  NaN            NaN
freq                  NaN                 NaN       