In [1]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl

In [2]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [3]:
# Model from Hugging Face hub
base_model = "microsoft/phi-2"

# New instruction dataset
guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model
new_model = "microsoft-2b-chat"

In [4]:
import json
notebook_path = "/content/02_expressions.ipynb"
with open(notebook_path, "r", encoding="utf-8") as f:
    notebook_data = json.load(f)

c=[]
cells_data = []
for cell in notebook_data["cells"]:
    if cell["cell_type"] == "code":
      # print(cell['outputs'][0]['data']['text/plain'])
      try:
        content = ' Example :' + "".join(cell["source"]) + ' Output :' +cell['outputs'][0]['data']['text/plain'][0]
        # c.append(content)
        cells_data[-1]=cells_data[-1]+content
      except:
        try:
           content = ' Example :' + "".join(cell["source"]) + ' Output :' +cell['outputs'][0]['ename']
           cells_data[-1]=cells_data[-1]+content
        except:
          if cell['outputs']!=[]:
           content = ' Example :' + "".join(cell["source"]) + ' Output :' + cell['outputs'][0]['text'][0]
          #  print(cell['outputs'][0]['text'])
           cells_data[-1]=cells_data[-1]+content
          #  print(cell['outputs'][0])
          else:
            content = ' Example :' + "".join(cell["source"])
            cells_data[-1]=cells_data[-1]+content
        # content = ' Example :' + "".join(cell["source"]) + ' Output :' + d

    elif cell["cell_type"] == "markdown":
        content = "".join(cell["source"])
        cells_data.append( content )
  # print()

In [5]:
import numpy as np
import pandas as pd

df=pd.Series(cells_data)
df

0     # Expressions\n\n\nWhat do you think when you ...
1     In Python, 2 + 2 is called an expression, whic...
2     There are plenty of other operators you can us...
3     Let's go one step up and create an expression ...
4     Don't you think the answer should be **9** ins...
5     Hey now the answer comes out to be 9 which we ...
6     In the above code cell you can see there are a...
7     ```{admonition} Question \nNow try out the sim...
8     In each case, you as the programmer must enter...
9     You can always test to see whether an instruct...
10    ## Data Types\n\nRemember that expressions are...
11    ### Float\nThe floating-point type can store f...
12    ### Strings\n\nPython programs can also have t...
13    #### String Concatenation\n\nThe meaning of an...
14    The expression evaluates down to a single, new...
15    The error message says that python can't do th...
16    The expression evaluates down to a single stri...
17    It makes sense that Python wouldn’t unders

In [6]:
csv_path = "cells_data.csv"
df.to_csv(csv_path, index=False)
print("Data stored to:", csv_path)

Data stored to: cells_data.csv


In [7]:
import pandas as pd
df=pd.read_csv("cells_data.csv")
data = df.replace("\n|--|#|\||`|\*\*(?=\w+)|(?<=\w)+\*\*|\s{2,}",'',regex=True).values

In [8]:
type(data)

numpy.ndarray

In [9]:
data_dict = dict(enumerate(map(str, data)))

In [10]:
print(data_dict)

{0: '[" ExpressionsWhat do you think when you hear the word expression. Most of the people start thinking about mathematical expresssion. And you are not even wrong, you guessed it correct. So this chapter is about mathematical expressions along with some basic programming concepts before you can do anything. Like any other beginner-in-training, you might think these concepts seem arcane and tedious, but with some knowledge and practice, you\'ll be able to command your computer to perform incredible feats.This chapter has a few examples that encourage you to type into the code cells, which lets you execute Python instructions one at a time and shows you the results instantly. Using the Jupyter Notebooks/Google Colab is great for learning what basic Python instructions do, so give it a try as you follow along. You\'ll remember the things you DO much better than the things you only read.So following is a basic code cell with an simple mathematical expression. Example :2+2 Output :4"]', 1

In [11]:
print(type(data_dict))

<class 'dict'>


In [12]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [13]:
pip install datasets



In [14]:
from datasets import Dataset, load_dataset

data_list = [{"text": text} for text in data_dict.values()]

# Create a Dataset object from the list of dictionaries
dataset = Dataset.from_dict({"text": [item["text"] for item in data_list]})

# Print the dataset
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 40
})


In [15]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [16]:
!pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [17]:
!pip install accelerate



In [18]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [19]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [21]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [22]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)



Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [23]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('microsoft-2b-chat/tokenizer_config.json',
 'microsoft-2b-chat/special_tokens_map.json',
 'microsoft-2b-chat/vocab.json',
 'microsoft-2b-chat/merges.txt',
 'microsoft-2b-chat/added_tokens.json',
 'microsoft-2b-chat/tokenizer.json')

In [24]:
logging.set_verbosity(logging.CRITICAL)

prompt = "What are some of the expressions in python?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] What are some of the expressions in python? [/INST]
[s] What is the difference between a list and a tuple? [/s]
[s] What is the difference between a string and a list? [/s]
[s] What is the difference between a string and a tuple? [/s]
[s] What is the difference between a string and a list? [/s]
[s] What is the difference between a string and a list? [/s]
[s] What is the difference between a string and a list? [/s]
[s] What is the difference between a string and a list? [/s]
[s] What is the difference between a string and a list? [/s]
[s] What is the difference between a string and a list? [/s]
[s] What is the difference between a string and a list? [/s]
[s


In [25]:
prompt = "how to override the precedence of operators?"
result = pipe(f"Answer the given question using only python programming language.\nquestion : {prompt} \nanswer: ")
print(result[0]['generated_text'])

Answer the given question using only python programming language.
question : how to override the precedence of operators? 
answer: 

A:

You can use the operator module to get the precedence of an operator.
>>> import operator
>>> operator.add(1, 2)
3
>>> operator.add(1, 2, 3)
6

A:

You can use the operator module.
>>> import operator
>>> operator.add(1, 2)
3
>>> operator.add(1, 2, 3)
6

A:

You can use the operator module.
>>> import operator
>>> operator.add(1, 2)
3
>>> operator.add(1, 2, 3)
6

A:

You can use the operator module.
>>> import operator
>>> operator.add(1, 2)
3
>>> operator.add(1, 2


In [26]:
prompt = "What are some of the expressions in python?"
result = pipe(f"Answer the given question using only python programming language.\nquestion : {prompt} \nanswer: ")
print(result[0]['generated_text'])

Answer the given question using only python programming language.
question : What are some of the expressions in python? 
answer: 

# Python has a variety of built-in expressions that can be used to perform operations on data.
# Some of the most common expressions include arithmetic operators, comparison operators, and logical operators.

# Arithmetic operators are used to perform mathematical operations on numbers.
# For example, the + operator is used to add two numbers together, and the - operator is used to subtract one number from another.

# Comparison operators are used to compare two values and return a Boolean value (True or False) based on the result of the comparison.
# For example, the == operator is used to check if two values are equal, and the!= operator is used to check if two values are not equal.

# Logical operators are used to combine multiple Boolean expressions and return a single Boolean value.
# For example, the


In [29]:
prompt = "What is my name?"
result = pipe(f"Answer the given question using only python programming language\nquestion : {prompt} \nanswer: ")
print(result[0]['generated_text'])

Answer the given question using only python programming language
question : What is my name? 
answer: 

print("My name is John")



In [30]:
prompt = "What are the data types in python?"
result = pipe(f"Answer the given question using only python programming language\nquestion : {prompt} \nanswer: ")
print(result[0]['generated_text'])

Answer the given question using only python programming language
question : What are the data types in python? 
answer: 

# Python has several data types that are used to store different kinds of data. 
# The most common data types are:

# 1. Numbers
# 2. Strings
# 3. Lists
# 4. Tuples
# 5. Dictionaries
# 6. Sets

# Numbers
# Python has several built-in numeric data types, including:

# int - integer
# float - floating point number
# complex - complex number

# Strings
# Strings are used to represent text data. They are enclosed in either single or double quotes.

# Lists
# Lists are used to store a collection of items. They are ordered, mutable, and can contain items of any data type.

# Tuples
# Tuples are similar to lists, but they are


In [31]:
prompt = "What is the difference between string concatenation and string replication?"
result = pipe(f"Answer the given question using only python programming language\nquestion : {prompt} \nanswer: ")
print(result[0]['generated_text'])

Answer the given question using only python programming language
question : What is the difference between string concatenation and string replication? 
answer: 

# String concatenation
string1 = "Hello"
string2 = "World"
string3 = string1 + " " + string2
print(string3)

# String replication
string4 = "Hello"
string5 = string4 * 3
print(string5)



In [35]:
query = "What is assignment statements?"
prompt = f"""
Answer the given question using only python programming language.
Question: {query}
Answer:

"""
result = pipe(prompt)
print(result[0]['generated_text'])


Answer the given question using only python programming language.
Question: What is assignment statements?
Answer: 


Assignment statements are used to assign values to variables. In Python, the equal sign (=) is used to assign values to variables.

Example:

x = 5
y = "Hello, World!"

In the above example, the value 5 is assigned to the variable x and the string "Hello, World!" is assigned to the variable y.

Exercise:

1. Assign the value 10 to the variable a.
2. Assign the value "Python" to the variable b.
3. Assign the value 3.14 to the variable c.
4. Assign the value True to the variable d.
5. Assign the value [1, 2, 3] to the variable e.

Answer:

1. a = 10
2. b = "Python"


In [40]:
query = "What is print function?"
prompt = f"""
Answer the given question using only python programming language.
Question: {query}
Answer:

"""
result = pipe(prompt)
print(result[0]['generated_text'])


Answer the given question using only python programming language.
Question: What is print function?
Answer: 


The print function is a built-in function in Python that is used to display output to the console. It takes one or more arguments and prints them to the console. The arguments can be of any data type, including strings, integers, and lists.

Example:

# Using print function to display output
print("Hello, World!")

# Output: Hello, World!


Exercise 1:
Write a Python program that uses the print function to display the following output:

"The sum of 2 and 3 is 5."

Solution:

# Using print function to display output
print("The sum of 2 and 3 is 5.")

# Output: The sum of 2 and 3 is 5.


Exercise 2:
Write a Python program that uses the print function to display the following output


In [42]:
query = "what are errors"
prompt = f"""
Answer the given question using only python programming language.
Question: {query}
Answer:

"""
result = pipe(prompt)
print(result[0]['generated_text'])


Answer the given question using only python programming language.
Question: what are errors
Answer: 


```python
# Errors are unexpected or incorrect results that occur when a program is running.
# Errors can be syntax errors, runtime errors, or logical errors.
# Syntax errors occur when the code violates the rules of the programming language.
# Runtime errors occur when the code runs but produces an incorrect result.
# Logical errors occur when the code runs without errors but produces an incorrect result.

# Example of a syntax error:
print("Hello, world!)

# Example of a runtime error:
x = 5
y = 0
z = x / y

# Example of a logical error:
def is_even(num):
    if num % 2 == 0:
        return True
    else:
        return False

print(is_even(3)) # This will print True,


In [43]:
query = "What is text and number equivalence?"
prompt = f"""
Answer the given question using only python programming language.
Question: {query}
Answer:

"""
result = pipe(prompt)
print(result[0]['generated_text'])


Answer the given question using only python programming language.
Question: What is text and number equivalence?
Answer: 


Text and number equivalence is the process of converting text into a numerical representation that can be used for machine learning and other computational tasks. This is done by mapping each character in the text to a unique number, and then encoding the resulting sequence of numbers as a vector. This vector can then be used as input to a machine learning model, which can then learn to recognize patterns in the text and make predictions based on the encoded data.

```python
# Example of text and number equivalence
text = "Hello, world!"
encoded_text = [ord(char) for char in text]
print(encoded_text)
# Output: [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33]
```

### Subsection


In [44]:
query = "qustionaire"
prompt = f"""
Answer the given question using only python programming language.
Question: {query}
Answer:

"""
result = pipe(prompt)
print(result[0]['generated_text'])


Answer the given question using only python programming language.
Question: qustionaire
Answer: 



