# Task 3 : Prompt Engineering for Large Language Models (LLMs)

In [1]:
import os
import re

import numpy as np
import pandas as pd
import tsfel
from langchain_groq.chat_models import ChatGroq
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

Zero-Shot and Few-Shot Prompting:
Zero-shot prompting involves providing a language model with a prompt or a set of instructions that allows it to generate text or perform a task without any explicit training data or labeled examples. The model is expected to generate high-quality text or perform the task accurately based solely on the prompt and its internal knowledge.

Few-shot prompting is similar to zero-shot prompting, but it involves providing the model with a limited number of labeled examples or prompts that are relevant to the specific task or dataset. The model is then expected to generate high-quality text or perform the task accurately based on the few labeled examples and its internal knowledge.

Ques 1: Demonstrate how to use Zero-Shot Learning and Few-Shot Learning to classify human activities based on the featurized accelerometer data. Qualitatively demonstrate the performance of Few-Shot Learning with Zero-Shot Learning. Which method performs better? Why?

In [2]:
# Groq API and Models
# The key is read from the GROQ_API_KEY environment variable so that it is never
# stored in (or committed with) the notebook. Falls back to "" when unset.
Groq_Token = os.environ.get("GROQ_API_KEY", "")  # Do not share this key with anyone

# Short alias -> model identifier passed to ChatGroq(model=...).
groq_models = {
    "llama3-70b": "llama3-70b-8192",
    "mixtral": "mixtral-8x7b-32768",
    "gemma-7b": "gemma-7b-it",
    "llama3.1-70b": "llama-3.1-70b-versatile",
    "llama3-8b": "llama3-8b-8192",
    "llama3.1-8b": "llama-3.1-8b-instant",
    "gemma-9b": "gemma2-9b-it",
}

In [3]:
# importing the dataset
# data_X.csv holds the raw samples, data_y.csv one label per window.
X = pd.read_csv('data_X.csv', delimiter=",", header=None)
y = pd.read_csv('data_y.csv', delimiter=",", header=None)

X = np.array(X)
y = np.array(y)

print("X.shape: ", X.shape)
print("y.shape: ", y.shape)

# TSFEL settings, named so the windowing is visible and tunable in one place.
TSFEL_FS = 50            # value handed to TSFEL as `fs`
TSFEL_WINDOW_SIZE = 500  # value handed to TSFEL as `window_size`

# Extract every TSFEL feature domain; one feature row is produced per window.
ts = tsfel.get_features_by_domain()
X_tsfel = tsfel.time_series_features_extractor(ts, X, fs=TSFEL_FS, window_size=TSFEL_WINDOW_SIZE)

# Every label must pair with exactly one feature row; fail early with a clear
# message instead of letting the split raise a less specific error.
assert len(X_tsfel) == len(y), (
    f"feature rows ({len(X_tsfel)}) and labels ({len(y)}) are misaligned"
)

# train test splitting
X_train, X_test, y_train, y_test = train_test_split(X_tsfel, y, test_size=0.3, random_state=0)

print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)

X.shape:  (90000, 3)
y.shape:  (180, 1)
*** Feature extraction started ***



*** Feature extraction finished ***
X_train.shape:  (126, 1152)
X_test.shape:  (54, 1152)


In [20]:
"""Zero Shot Learning"""

# System Prompts
# NOTE(review): this prompt interpolates only the number of rows, not the feature
# values themselves, so the reply cannot depend on the data being classified.
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the featurized accerlerometer data having 1152 features you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]
* You have predict the human activity for every row in the X_train dataset.

*PS: Just give the prediction array for the given dataset without any explanation or anything above or below it.
*PS: Ensure that the predicted array looks like [1,2,3,..,5,6] and is of the length = {X_train[:50].shape[0]}. 
"""

# To use Groq LLMs
model_name = "llama3.1-70b"  # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
zero_shot_answer = llm.invoke(query).content
print("zero shot ans: ", zero_shot_answer)

# Convert the zero shot answer to a numpy array.
# Take the integers inside the first [...] group (or the whole reply when the
# model omitted the brackets). Unlike a manual strip/split, this tolerates
# surrounding whitespace, stray text and empty tokens, and it does not rebind
# the builtin `str`. A malformed reply yields a wrong-length array, which the
# length check below reports instead of raising.
bracketed = re.search(r"\[(.*?)\]", zero_shot_answer, flags=re.DOTALL)
label_text = bracketed.group(1) if bracketed else zero_shot_answer
y_pred = np.array([int(tok) for tok in re.findall(r"\d+", label_text)], dtype=int)

print(y_pred)

# Score only when the model returned exactly one label per requested row.
if y_train[:50].shape[0] == y_pred.shape[0]:
    zero_shot_accuracy_score = accuracy_score(y_train[:50].flatten(), y_pred)
    print(f"Zero Shot Learning Accuracy: {zero_shot_accuracy_score}")
else:
    print("The accuracy of the Zero Shot Learning model could not be calculated since the model did not provide the prediction array in the correct format.")

zero shot ans:  [4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1, 2, 4, 5]
[4 5 6 1 2 3 4 5 6 1 1 1 2 2 3 3 4 4 5 5 6 6 1 2 3 4 5 6 1 2 3 4 5 6 1 1 2
 2 3 3 4 4 5 5 6 6 1 2 4 5]
Zero Shot Learning Accuracy: 0.16


In [18]:
"""Few Shot Learning"""

# System Prompts
# NOTE(review): X_train[:20] and X_test[:5] are interpolated via their default
# string form; for data this wide that form may elide most columns, so confirm
# the model actually sees the features you intend it to use.
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the tsfel-featurized accerlerometer data having 1152 columns  you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]
* You have predict the human activity for every row input in the X_test dataset and output the corresponding activity_label.

* You have been trained on the following dataset:
* Training Dataset: {X_train[:20]}
* Training Labels: {y_train[:20]}

* The test dataset is in the following format: {X_test[:5]}

*PS: Only give out the prediction array for the given dataset without any explanation without anything above or below it.
*PS: Ensure that the prediction array is of the same length as the test dataset.
*PS: Ensure that the prediction looks like this: [1,2,..,5,6] of length {X_test[:5].shape[0]}
"""

# To use Groq LLMs
model_name = "llama3-70b"  # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
few_shot_answer = llm.invoke(query).content
print("few shot answer: ", few_shot_answer)

# Convert the few_shot_answer to a numpy array.
# Take the integers inside the first [...] group (or the whole reply when the
# model omitted the brackets). This tolerates whitespace, stray text and empty
# tokens, and avoids rebinding the builtin `str`. A malformed reply produces a
# wrong-length array that the check below reports instead of raising.
bracketed = re.search(r"\[(.*?)\]", few_shot_answer, flags=re.DOTALL)
label_text = bracketed.group(1) if bracketed else few_shot_answer
y_pred = np.array([int(tok) for tok in re.findall(r"\d+", label_text)], dtype=int)

# Score only when the model returned exactly one label per test row sent.
if y_test[:5].shape[0] == y_pred.shape[0]:
    few_shot_accuracy_score = accuracy_score(y_test[:5].flatten(), y_pred)
    print(f"Few Shot Learning Accuracy: {few_shot_accuracy_score}")
else:
    print("The accuracy of the Few Shot Learning model could not be calculated since the model did not provide the prediction array in the correct format.")

few shot answer:  [4, 6, 5, 4, 3]
Few Shot Learning Accuracy: 0.2


Q. Qualitatively demonstrate the performance of Few-Shot Learning with Zero-Shot Learning. 
Which method performs better? Why?
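In general, Few-Shot Learning should perform better than Zero-Shot Learning because it has
 seen some examples of the target classes, allowing it to adapt better to the specific task. 
 However, the performance difference can vary depending on the complexity of the task and the 
 quality of the few-shot examples provided.
 In the runs shown above, Zero-Shot Learning scored 0.16 on 50 training rows and Few-Shot Learning 
 scored 0.2 on 5 test rows. Both values are close to the roughly 1/6 ≈ 0.17 expected from guessing 
 among six classes, and they were measured on different rows, so these runs alone do not show a 
 clear advantage for either method. Note also that the zero-shot prompt contains no feature values, 
 so its predictions cannot depend on the data.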

Ques 2: Quantitatively compare the accuracy of Few-Shot Learning with Decision Trees (You may use a subset of the test set if you encounter rate-limiting issues). Which method performs better? Why?

Few Shot Learning Accuracy: 0.67
Decision Tree model Accuracy: 0.71
Decision Tree Classifier works better than Few Shot LLM model for this dataset 

But both the models have comparable accuracy.
      Some advantages of the Few Shot LLM model are:
        1. It leverages a large amount of data from the internet.
        2. It has a better physical understanding of the data and its underlying meaning.
        3. It can be used for a wide range of general tasks.
      
      Some advantages of the Decision Tree Classifier are:
        1. Hyperparameters can be tuned to improve the accuracy.
        2. It works best for complex tasks/datasets with a lot of non-standard features.

Ques 3: What are the limitations of Zero-Shot Learning and Few-Shot Learning in the context of classifying human activities based on featurized accelerometer data?
Zero-Shot Learning (ZSL) and Few-Shot Learning (FSL) are powerful techniques, 
    but they come with their own set of limitations, especially in the context of classifying human activities
    based on featurized accelerometer data.

 Zero-Shot Learning:
    1. Lack of Training Data: ZSL models rely on general knowledge and may not have specific information
    about the target classes, leading to lower accuracy.
    2. Contextual Understanding: ZSL models may struggle to understand the context of activities, 
    especially if the activities are complex or involve subtle differences. 
    3. Feature data: ZSL models require well-defined, standard features for which data is widely available. 
    ZSL models don't perform well with new, domain-specific, non-standard features.

 Few-Shot Learning:
    1. Overfitting: FSL models may overfit the limited training data, especially if the data is noisy or unrepresentative.
    2. Class Imbalance: FSL models may struggle with class imbalance, as the few-shot classes may not have enough representative samples.

Ques 4: What does the model classify when given input from an entirely new activity that it hasn't seen before?

In [38]:
# Let us assume that the new activity is "Jogging";
# it is represented here by a single all-ones row of 1152 values.
new_activity_data = np.ones(shape=(1, 1152))

# --- Few Shot Learning ---
# System Prompts
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the tsfel-featurized accerlerometer data you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]

* You have been trained on the following dataset:
* Training Dataset: {X_train[:20]}
* Training Labels: {y_train[:20]}

* The test dataset is in the following format: {new_activity_data}

* You have to predict the human activity for the given test dataset.
* Kindly output only the activity name and nothing below or above it.
"""

# To use Groq LLMs (any key of groq_models may be used as model_name)
model_name = "llama3-70b"
llm = ChatGroq(
    model=groq_models[model_name],
    api_key=Groq_Token,
    temperature=0,
)

# Send the prompt and keep only the text of the reply.
few_shot_answer = llm.invoke(query).content

# Show the input row and the label the model assigned to it.
print(f"Accelerometer data for the new activity: {new_activity_data}")
print(f"The model classified the new activity as: {few_shot_answer}")

Accelerometer data for the new activity: [[1. 1. 1. ... 1. 1. 1.]]
The model classified the new activity as: WALKING_UPSTAIRS


Ques 5: Test the model with random data (ensuring the data has the same dimensions and range as the previous input) and report the results.

In [39]:
"""Few Shot Learning and testing with random data"""

# Random inputs with the same number of columns as the real features, each column
# drawn uniformly between the minimum and maximum observed for it in X_train, so
# the data matches the previous input in both dimensions and range.
# Separate names are used so the real X_test / y_test are not overwritten, and a
# seeded generator makes the cell reproducible.
rng = np.random.default_rng(0)
train_features = np.asarray(X_train, dtype=float)
feature_low = np.nanmin(train_features, axis=0)
feature_high = np.nanmax(train_features, axis=0)
X_random = rng.uniform(feature_low, feature_high, size=(100, train_features.shape[1]))
y_random = rng.integers(1, 7, size=100)  # random labels in 1..6

# System Prompts
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the tsfel-featurized accerlerometer data having 1152 columns  you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]
* You have predict the human activity for every row input in the X_test dataset and output the corresponding activity_label.

* You have been trained on the following dataset:
* Training Dataset: {X_train[:20]}
* Training Labels: {y_train[:20]}

* The test dataset is in the following format: {X_random[:5]}

*PS: Only give out the prediction array for the given dataset without any explanation without anything above or below it.
*PS: Ensure that the prediction array is of the same length as the test dataset.
*PS: Ensure that the prediction looks like this: [1,2,..,5,6] of length {X_random[:5].shape[0]}
"""

# To use Groq LLMs
model_name = "llama3-70b"  # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
few_shot_answer = llm.invoke(query).content
print("few shot answer: ", few_shot_answer)

# Convert the few_shot_answer to a numpy array.
# Take the integers inside the first [...] group (or the whole reply when the
# model omitted the brackets). This tolerates whitespace, stray text and empty
# tokens, and avoids rebinding the builtin `str`. A malformed reply produces a
# wrong-length array that the check below reports instead of raising.
bracketed = re.search(r"\[(.*?)\]", few_shot_answer, flags=re.DOTALL)
label_text = bracketed.group(1) if bracketed else few_shot_answer
y_pred = np.array([int(tok) for tok in re.findall(r"\d+", label_text)], dtype=int)

# Score only when the model returned exactly one label per random row sent.
if y_random[:5].shape[0] == y_pred.shape[0]:
    few_shot_accuracy_score = accuracy_score(y_random[:5], y_pred)
    print(f"Few Shot Learning Accuracy: {few_shot_accuracy_score}")
else:
    print("The accuracy of the Few Shot Learning model could not be calculated since the model did not provide the prediction array in the correct format.")

few shot answer:  [4, 3, 2, 5, 6]
Few Shot Learning Accuracy: 0.0
