In [1]:
# Check if the dataset is already downloaded
import os 
import pandas as pd
if not os.path.exists('Data.csv'):
    # Download the dataset
    !gdown --id 1-AQAL98COaKytVkkAtNMhjskHRWfeskw
    df = pd.read_csv('Data.csv')
# Create a dataframe by removing the PM10 column from the original dataframe
    df1 = df.drop('PM10', axis=1)
    df1.to_csv('Data.csv', index=False)

In [2]:
def generate_template(prompt):
    """Build the python-fenced code scaffold that the LLM is asked to complete.

    Reads ``Data.csv`` from the working directory, converts its ``Timestamp``
    column to datetimes and embeds the resulting column dtypes as ``#`` comment
    lines. The stripped ``prompt`` follows as a comment, then placeholders for
    the generated code and for the ``answer`` assignment.

    Returns:
        The scaffold as a single string, wrapped in a python code fence.
    """
    preview = pd.read_csv("Data.csv")
    preview["Timestamp"] = pd.to_datetime(preview["Timestamp"])
    preview = preview.head(5)

    # Turn every line of the dtypes repr into a "# ..." comment line.
    dtype_comments = "\n".join(
        "# " + line for line in str(preview.dtypes).split("\n")
    )
    question = prompt.strip()

    template = f"""```python
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("Data.csv")
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# df.dtypes
{dtype_comments}

# {question}
# <your code here>

#answer = 
```
"""
    return template



def generate_query(template):
    """Wrap a code scaffold in the natural-language instructions for the LLM.

    The instructions describe the dataframe columns, the data frequency, the
    PM2.5 guideline values and the requirement to store the result in a global
    ``answer`` variable; ``template`` is appended after them.

    Returns:
        The full prompt text as a single string.
    """
    instruction_lines = (
        "I have a pandas dataframe data of PM2.5.",
        "* The columns are 'Timestamp', 'station', 'PM2.5', 'address', 'city', 'latitude', 'longitude', and 'state'.",
        "* Frequency of Data is Daily.",
        "* `Pollution` generally means `PM2.5`.",
        "* PM2.5 guidelines: India: 60, WHO: 15.",
        "* Store the final answer in a global variable `answer`.",
        "* Always report the unit of the data. Example: `The average PM2.5 is 45.67 µg/m³`",
        "",
        "Complete the following code.",
        "",
    )
    return "\n".join(instruction_lines) + f"\n{template}\n"



def _extract_python_block(text):
    """Return the dedented body of the first ```python fenced block in ``text``.

    Raises:
        ValueError: if ``text`` contains no ```python fence.
    """
    import textwrap  # local import so this cell does not depend on another cell

    _, fence, remainder = text.partition("```python")
    if not fence:
        raise ValueError("no ```python code block found")
    # Dedent and strip so indentation around the fence cannot trigger an
    # IndentationError when the snippet is executed.
    return textwrap.dedent(remainder.split("```")[0]).strip()


def process_query(query, llm):
    """Ask ``llm`` to complete the analysis scaffold for ``query`` and run it.

    The question is turned into a code template and a full prompt, the model
    is invoked, and the python blocks of the template and of the reply are
    concatenated and executed in the module namespace. The executed code is
    expected to assign the global ``answer``; whatever ``answer`` holds at the
    end is printed.

    Args:
        query: The natural-language question.
        llm: Any object with an ``invoke(prompt)`` method returning either a
            string or an object with a ``content`` attribute.

    Side effects:
        Sets the globals ``answer`` and ``code``. On any failure ``answer`` is
        set to ``"Error: <message>"`` instead of raising.
    """
    global answer, code
    template = generate_template(query)
    query = generate_query(template)
    try:
        answer = llm.invoke(query)
        # Chat models reply with a message object exposing `.content`, while
        # plain LLM wrappers reply with a bare string; accept both.
        response_text = answer if isinstance(answer, str) else answer.content
        code = "\n".join(
            (_extract_python_block(template), _extract_python_block(response_text))
        )
        # SECURITY: this executes model-generated code with full access to the
        # notebook namespace; only run it against models and prompts you trust.
        # update variable `answer` when code is executed
        exec(code, globals())
    except Exception as e:
        answer = f"Error: {e}"
    print(answer)



# Example question, its code scaffold, and the full prompt built from that scaffold.
prompt = "What are the top three most polluted cities based on PM2.5 levels?"
template = generate_template(prompt=prompt)
query = generate_query(template=template)

In [3]:
from langchain_huggingface import HuggingFaceEndpoint

# Client for the hosted CodeLlama-7b endpoint. The API token is read from the
# HF_TOKEN environment variable instead of being hard-coded in the notebook.
llm = HuggingFaceEndpoint(
    huggingfacehub_api_token=os.getenv("HF_TOKEN"),
    max_new_tokens=512,
    endpoint_url="https://api-inference.huggingface.co/models/codellama/CodeLlama-7b-hf",
)
# Smoke test (disabled): print(llm.invoke("What is Deep Learning?"))



  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_huggingface import ChatHuggingFace

# A small system + user conversation used to try out the chat wrapper.
messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(content="What happens when an unstoppable force meets an immovable object?"),
]

# Chat interface layered on top of the `llm` endpoint defined earlier.
# NOTE(review): the recorded run fails with "422 ... Template error: template
# not found" on the chat-completions URL for this model id; presumably the
# model has no chat template. Confirm whether an instruct variant is intended.
chat_model = ChatHuggingFace(model_id="codellama/CodeLlama-7b-hf", llm=llm)

In [9]:
# Show the single prompt string the chat wrapper builds from `messages`.
# `_to_chat_prompt` is underscore-prefixed, i.e. not part of the public API.
chat_model._to_chat_prompt(messages)

"<s>[INST] <<SYS>>\nYou're a helpful assistant\n<</SYS>>\n\nWhat happens when an unstoppable force meets an immovable object? [/INST]"

In [12]:
# Send the analysis prompt (`query`), not the demo `messages`: the lines below
# expect the reply to contain a ```python block that completes `template`.
answer = chat_model.invoke(query)
# Stitch the scaffold code and the model's completion into one script.
code = f"""
{template.split("```python")[1].split("```")[0]}
{answer.content.split("```python")[1].split("```")[0]}
"""
# update variable `answer` when code is executed
# (left disabled: running it would execute model-generated code in this kernel)
# exec(code,globals())

HfHubHTTPError: 422 Client Error: Unprocessable Entity for url: https://api-inference.huggingface.co/models/codellama/CodeLlama-7b-hf/v1/chat/completions (Request ID: tTgy_KxoBLYGsTP5tK1sk)

Template error: template not found

In [None]:
# Show whatever the global `answer` currently holds.
print(answer)

<div class="row mt-3">
    <div class="col-sm mt-3 mt-md-0">
        <img class="img-fluid rounded z-depth-1" src="https://github.com/ayivima/AI-SURFS/blob/master/Code_base/Python/Pandas/solutions/L5_Q1.png?raw=true" />
    </div>
</div>

<div class="caption">
    The top three most polluted cities based on PM2.5 levels.
</div>

<br>

### Solution


```python
# answer = 
# ['Mumbai, Maharashtra, India', 'Delhi, Delhi, India', 'Chandigarh, Punjab, India']

# First three entries of the per-`station` row counts.
# NOTE(review): value_counts ranks stations by how many rows they have, not by
# their PM2.5 level; confirm this is the intended meaning of "most polluted".
df['station'].value_counts()[:3]
```




    Mumbai, Maharashtra, India     142
    Delhi, Delhi, India            141
    Chandigarh, Punjab, India      137
    Name: station, dtype: int64




```python
# answer = 
# ['Mumbai, Maharashtra, India', 'Delhi, Delhi, India', 'Chandigarh, Punjab, India']

# Keep only rows whose `station` is among the first three value_counts entries,
# then order those rows from highest to lowest PM2.5.
# NOTE(review): the three stations are chosen by row count, not by PM2.5 level;
# confirm that matches the question being answered.
df[df['station'].isin(df['station'].value_counts()[:3].index)].sort_values(by = 'PM2.5', ascending = False)
```




<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody