# DeepSeek Local Integration Testing Notebook

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Pick the compute device: "cuda" when PyTorch reports CUDA support, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [10]:
# Load the tokenizer and model with reduced precision to save memory.
# The checkpoint id is defined once so the tokenizer and the model always match.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

try:
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,  # 16-bit weights
        low_cpu_mem_usage=True,  # Reduce memory usage
        device_map="auto",  # Auto place on best available device
        offload_folder="offload_folder",  # Specify a folder for offloading
    )

    print("Model loaded successfully!")

except Exception as e:
    print(f"Error loading model: {e}")
    print("Consider trying a smaller model or using quantization methods.")
    # Re-raise after printing the hint: swallowing the error would leave
    # `tokenizer`/`model` undefined (or stale from an earlier run), so later
    # cells would fail with a confusing, unrelated error.
    raise

Loading tokenizer...
Loading model...


Some parameters are on the meta device because they were offloaded to the cpu and disk.


Model loaded successfully!


In [None]:
# # Disabled alternative: load deepseek-ai/deepseek-coder-1.3b-base (instead of TinyLlama) with reduced precision to save memory
# try:
#     print("Loading tokenizer...")
#     tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base")

#     print("Loading model...")

#     model = AutoModelForCausalLM.from_pretrained(
#         "deepseek-ai/deepseek-coder-1.3b-base",
#         torch_dtype=torch.float16,
#         low_cpu_mem_usage=True,  # Reduce memory usage
#         # load_in_4bit=True, 
#         device_map="auto",  # Auto place on best available device
#         offload_folder="offload_folder"  # Specify a folder for offloading
#     )


#     print("Model loaded successfully!")

# except Exception as e:
#     print(f"Error loading model: {e}")
#     print("Consider trying a smaller model or using quantization methods.")

Loading tokenizer...
Error loading model: name 'AutoTokenizer' is not defined
Consider trying a smaller model or using quantization methods.


In [11]:
# Compile the model once. The guard keeps this cell idempotent: re-running it
# would otherwise wrap an already compiled module in another compiled wrapper.
# NOTE(review): confirm that `model.generate` actually runs the compiled
# forward; if it resolves to the wrapped module, this step has no effect.
if not hasattr(model, "_orig_mod"):  # `_orig_mod` is present on torch.compile's module wrapper
    model = torch.compile(model)

In [13]:
# Test the model with a simple prompt
def generate_text(prompt, max_length=300):
    """Generate a sampled completion for ``prompt`` with the loaded model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to tokenize and feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (in Hugging Face Transformers this bounds prompt plus new tokens).

    Returns:
        The first generated sequence decoded with special tokens skipped, or a
        ``"Generation error: ..."`` string if any step raised.
    """
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Pass the padding token explicitly. Tokenizers without a pad token
        # fall back to EOS, which avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" notice.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        # Generate with basic sampling parameters; no gradients are needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=pad_token_id,
            )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    except Exception as e:
        # Failures are reported as text so callers that just print the result keep working.
        return f"Generation error: {e}"

In [15]:
# Test basic capabilities
basic_prompt = "How many helicopters can a human eat in one sitting?"
# The label names the prompt being run; the previous "salutation" wording did not match it.
print("\nTesting with basic prompt:")
print(f"Prompt: {basic_prompt}")
print("Generating response...")
response = generate_text(basic_prompt, max_length=300)
print(f"Response:\n{response}")


Testing with salutation prompt:
Prompt: How many helicopters can a human eat in one sitting?
Generating response...
Response:
How many helicopters can a human eat in one sitting?
> 
> Answer: Yes, a human can eat up to 800 grams of food in one sitting. This includes the amount of food eaten, but not the amount of water consumed.


In [None]:
# Exercise code generation: the prompt is a code comment for the model to continue.
coding_prompt = "# Write a function to calculate fibonacci numbers"
print(
    "\nTesting with coding prompt:",
    f"Prompt: {coding_prompt}",
    "Generating response...",
    sep="\n",
)
response = generate_text(coding_prompt, max_length=150)
print("Response:", response, sep="\n")


Testing with coding prompt:
Prompt: # Write a function to calculate fibonacci numbers
Generating response...


Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Response:
# Write a function to calculate fibonacci numbers
# The input of this function is an integer.
# The output is also an integer.
# Example:
# Input: fib(6)
# Output: 8
# In this case, 6th fibonacci number is 8.
# We can make it more efficient.

# 0 1 1 2 3 5 8 13 21 34 55
# 0 1 2 3 4 5 6  7  8  9  10

fibonacci_numbers = [0, 1]

def fibonacci(n):
    if n < len(fibonacci_numbers):
        return fibonacci_numbers[n]
    else:
        fib = fibonacci(n-1) + fibonacci(n-2)
        fibonacci_numbers.append(fib)
        return fib

fibonacci(6)
print(fibonacci_numbers)

# or

def fibonacci(n):
    if n < 2:
        return n
    return fibonacci(n-1) + fibonacci(n-


In [7]:
# Test the most basic capabilities
# Uses its own variable and label: this is not a coding prompt, and reusing
# `coding_prompt` would silently overwrite the prompt from the coding test.
counting_prompt = "learning how to count from 1 to 5"
print("\nTesting with counting prompt:")
print(f"Prompt: {counting_prompt}")
print("Generating response...")
response = generate_text(counting_prompt, max_length=300)
print(f"Response:\n{response}")


Testing with coding prompt:
Prompt: learning how to count from 1 to 5
Generating response...


Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Response:
learning how to count from 1 to 5
# 1. using **range()** function
print(range(5))
print(type(range(5)))
print(range(1, 6))
print(type(range(1, 6)))

# 2. using **list()** function
print(list(range(5)))
print(type(list(range(5))))
print(list(range(1, 6)))
print(type(list(range(1, 6))))

# 3. using **enumerate()** function
print(list(enumerate(range(5))))
print(type(list(enumerate(range(5)))))
print(list(enumerate(range(1, 6))))
print(type(list(enumerate(range(1, 6)))))

# 4. using **zip()** function
print(list(zip(range(5), range(1, 6))))
print(type(list(zip(range(5), range(1, 6)))))


In [None]:
# Test memory usage
def get_model_memory_usage():
    """Describe current CUDA memory use as text.

    Reads the notebook-level ``device``. When it is not ``"cuda"`` a fixed
    "unavailable" message is returned; otherwise the allocated and reserved
    byte counts reported by ``torch.cuda`` are converted to MB.
    """
    if device != "cuda":
        return "Running on CPU, memory usage stats unavailable"

    bytes_per_mb = 1024**2
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
    return f"Allocated: {allocated_mb:.2f} MB, Reserved: {reserved_mb:.2f} MB"

# Show the memory summary (print's default single-space separator follows the label).
print("\nMemory usage:", get_model_memory_usage())

In [None]:
# Test inference speed
import time

def measure_inference_speed(prompt, runs=3):
    """Time ``generate_text`` on ``prompt`` and report the mean wall-clock duration.

    Args:
        prompt: Prompt handed to ``generate_text`` on every run.
        runs: Number of timed generations; must be at least 1.

    Returns:
        A sentence of the form
        ``"Average inference time: X.XX seconds (over N runs)"``.

    Raises:
        ValueError: If ``runs`` is less than 1 (there would be nothing to average).
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    durations = []
    for _ in range(runs):
        # perf_counter is monotonic and high resolution; time.time() can jump
        # if the system clock is adjusted during the measurement.
        start = time.perf_counter()
        # NOTE: generate_text reports failures as a returned string, so a
        # failing run is still timed like a successful one.
        generate_text(prompt, max_length=100)
        durations.append(time.perf_counter() - start)

    avg_time = sum(durations) / len(durations)
    return f"Average inference time: {avg_time:.2f} seconds (over {runs} runs)"

# Benchmark a short prompt with the default number of runs, then show the summary line.
print("\nMeasuring inference speed...")
speed_test = measure_inference_speed(prompt="Write a simple Python function")
print(speed_test)

In [None]:
# Cleanup to free memory
print("\nCleaning up resources...")
try:
    del model
    del tokenizer
    if device == "cuda":
        torch.cuda.empty_cache()
    print("Resources freed")
except NameError as exc:
    # The only expected failure is a name that was never created or was already
    # deleted. A bare `except:` would also hide real errors and swallow
    # KeyboardInterrupt, so just this case is handled and its cause is shown.
    print(f"Cleanup failed or not needed: {exc}")