diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md
index 466732f..8b388da 100644
--- a/docs/ROADMAP.md
+++ b/docs/ROADMAP.md
@@ -119,7 +119,15 @@
 ## 🏁 Final Goals (v1.0 release)
 
 - [ ] Bootable ISO with GRUB2 EFI support
-- [ ] Fully self-hosted userland shell
-- [ ] User process execution (ELF64)
-- [ ] Init system + FS + IPC working
+- [ ] Usable VMM & PMM
+- [ ] Userspace C runtime (libc-lite)
+- [ ] XFS support
+- [ ] Install and set up to disk
+- [ ] Framebuffer graphics support
+- [ ] Virtual files
+- [ ] HPET/APIC support
+- [ ] ACPI support
+- [ ] Modern drivers
+- [ ] Networking
+- [ ] Better fault handling
 - [ ] At least one demo program
diff --git a/drivers/LPT/LPT.c b/drivers/LPT/LPT.c
index fa12062..a2feb76 100644
--- a/drivers/LPT/LPT.c
+++ b/drivers/LPT/LPT.c
@@ -57,7 +57,7 @@ void LPT_WriteChar(char c) {
     // Assert strobe (bit 1 -> pin LOW due to inversion)
     outb(g_lpt_io_base + LPT_CONTROL_PORT, control_val | LPT_CONTROL_STROBE);
 
-    // Strobe pulse width (minimum 0.5μs, using ~10μs for safety)
+    // Strobe pulse width (minimum 0.5 μs, using ~10 μs for safety)
     for(volatile int i = 0; i < 50; i++);
 
     // Release strobe (bit 0 -> pin HIGH)
diff --git a/kernel/core/Kernel.c b/kernel/core/Kernel.c
index fad77c3..b2e8f20 100644
--- a/kernel/core/Kernel.c
+++ b/kernel/core/Kernel.c
@@ -649,7 +649,6 @@ static InitResultT PXS2(void) {
     IRQUnmaskCoreSystems();
 
-    // Setup memory protection LAST - after all systems are ready
     StackGuardInit();
     SetupMemoryProtection();
diff --git a/kernel/memory/MemOps.c b/kernel/memory/MemOps.c
index 04f60ef..f916579 100644
--- a/kernel/memory/MemOps.c
+++ b/kernel/memory/MemOps.c
@@ -3,21 +3,65 @@
 #include "Panic.h"
 
 void strcpy(char* dest, const char* src) {
-    while ((*dest++ = *src++));
+    // Optimize for 64-bit aligned copies when possible
+    if (((uintptr_t)dest & 7) == 0 && ((uintptr_t)src & 7) == 0) {
+        uint64_t* d64 = (uint64_t*)dest;
+        const uint64_t* s64 = (const uint64_t*)src;
+
+        uint64_t val;
+        while ((val = *s64++) != 0) {
+            // Check if any byte in the 64-bit value is zero
+            if ((val & 0xFF00000000000000ULL) == 0 ||
+                (val & 0x00FF000000000000ULL) == 0 ||
+                (val & 0x0000FF0000000000ULL) == 0 ||
+                (val & 0x000000FF00000000ULL) == 0 ||
+                (val & 0x00000000FF000000ULL) == 0 ||
+                (val & 0x0000000000FF0000ULL) == 0 ||
+                (val & 0x000000000000FF00ULL) == 0 ||
+                (val & 0x00000000000000FFULL) == 0) {
+                // Found the null terminator in this word, fall back to byte copy
+                char* d = (char*)d64;
+                const char* s = (const char*)(s64 - 1);
+                while ((*d++ = *s++));
+                return;
+            }
+            *d64++ = val;
+        }
+        *(char*)d64 = '\0';
+    } else {
+        // Original byte-by-byte copy for unaligned data
+        while ((*dest++ = *src++));
+    }
 }
 
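Note: the eight per-byte mask tests in the new strcpy above can be collapsed into the standard SWAR zero-byte test. A minimal sketch for reference only, not part of this patch (HasZeroByte is an illustrative name):

    #include <stdint.h>

    // Classic SWAR test: nonzero exactly when some byte of v is 0x00.
    // (v - 0x01...01) borrows into a byte's top bit when that byte was zero,
    // and the & ~v filters out bytes whose top bit was already set.
    static inline uint64_t HasZeroByte(uint64_t v) {
        return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
    }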
 void strcat(char* dest, const char* src) {
+    // Find the end of dest, then append by reusing strcpy
     while (*dest) dest++;
-    while ((*dest++ = *src++));
+    strcpy(dest, src); // Reuse optimized strcpy
 }
 
 void htoa(uint64_t n, char* buffer) {
-    const char* hex_chars = "0123456789ABCDEF";
+    static const char hex_chars[16] = "0123456789ABCDEF";
     buffer[0] = '0';
     buffer[1] = 'x';
-    for (int i = 0; i < 16; i++) {
-        buffer[2 + i] = hex_chars[(n >> (60 - i * 4)) & 0xF];
-    }
+
+    // Unroll the loop for better performance
+    buffer[2] = hex_chars[(n >> 60) & 0xF];
+    buffer[3] = hex_chars[(n >> 56) & 0xF];
+    buffer[4] = hex_chars[(n >> 52) & 0xF];
+    buffer[5] = hex_chars[(n >> 48) & 0xF];
+    buffer[6] = hex_chars[(n >> 44) & 0xF];
+    buffer[7] = hex_chars[(n >> 40) & 0xF];
+    buffer[8] = hex_chars[(n >> 36) & 0xF];
+    buffer[9] = hex_chars[(n >> 32) & 0xF];
+    buffer[10] = hex_chars[(n >> 28) & 0xF];
+    buffer[11] = hex_chars[(n >> 24) & 0xF];
+    buffer[12] = hex_chars[(n >> 20) & 0xF];
+    buffer[13] = hex_chars[(n >> 16) & 0xF];
+    buffer[14] = hex_chars[(n >> 12) & 0xF];
+    buffer[15] = hex_chars[(n >> 8) & 0xF];
+    buffer[16] = hex_chars[(n >> 4) & 0xF];
+    buffer[17] = hex_chars[n & 0xF];
     buffer[18] = '\0';
 }
 
@@ -27,37 +71,64 @@ void itoa(uint64_t n, char* buffer) {
         buffer[1] = '\0';
         return;
     }
-    char int_buffer[21];
-    char* p = &int_buffer[20];
+
+    char temp_buffer[21];
+    char* p = &temp_buffer[20];
     *p = '\0';
-    uint64_t temp = n;
-    do {
-        *--p = '0' + (temp % 10);
-        temp /= 10;
-    } while(temp > 0);
+
+    // Use faster division by avoiding modulo when possible
+    while (n >= 10) {
+        uint64_t q = n / 10;
+        *--p = '0' + (n - q * 10); // Faster than n % 10
+        n = q;
+    }
+    *--p = '0' + n;
+
     strcpy(buffer, p);
 }
 
-void* memset(void* dest, int value, unsigned long size) { // FCK GCC
+void* memset(void* dest, int value, unsigned long size) {
     return FastMemset(dest, value, size);
 }
 
 void* FastMemset(void* dest, int value, uint64_t size) {
     ASSERT(dest != NULL);
+
+    if (size == 0) return dest;
+
     CpuFeatures* features = GetCpuFeatures();
     uint8_t* d = (uint8_t*)dest;
+    uint8_t val = (uint8_t)value;
 
-    if (features->sse2 && size >= 16) {
-        // Create a 128-bit value where all bytes are 'value'
-        uint64_t val64 = ((uint64_t)value << 56) | ((uint64_t)value << 48) |
-                         ((uint64_t)value << 40) | ((uint64_t)value << 32) |
-                         ((uint64_t)value << 24) | ((uint64_t)value << 16) |
-                         ((uint64_t)value << 8) | value;
+    // Use AVX2 if available for even better performance
+    if (features->avx2 && size >= 32) {
+        // Replicate the fill byte across a 64-bit word, then broadcast it to YMM0
+        uint64_t val64 = 0x0101010101010101ULL * val;
+
+        asm volatile(
+            "vmovq %0, %%xmm0\n"
+            "vpbroadcastq %%xmm0, %%ymm0\n"
+            :
+            : "r"(val64)
+            : "xmm0", "ymm0"
+        );
+
+        while (size >= 32) {
+            asm volatile("vmovdqu %%ymm0, (%0)" : : "r"(d) : "memory");
+            d += 32;
+            size -= 32;
+        }
+
+        // Clean up YMM registers
+        asm volatile("vzeroupper" ::: "memory");
+    }
+    else if (features->sse2 && size >= 16) {
+        // Original SSE2 path with simpler value construction
+        uint64_t val64 = 0x0101010101010101ULL * val;
 
         asm volatile(
             "movq %0, %%xmm0\n"
             "punpcklqdq %%xmm0, %%xmm0\n"
-            "punpcklqdq %%xmm0, %%xmm0\n"
             :
             : "r"(val64)
             : "xmm0"
@@ -69,38 +140,96 @@ void* FastMemset(void* dest, int value, uint64_t size) {
             size -= 16;
         }
     }
+    else if (size >= 8) {
+        // 64-bit aligned stores for smaller sizes
+        uint64_t val64 = 0x0101010101010101ULL * val;
+
+        while (size >= 8 && ((uintptr_t)d & 7) == 0) {
+            *(uint64_t*)d = val64;
+            d += 8;
+            size -= 8;
+        }
+    }
 
     // Handle remaining bytes
-    while (size--) *d++ = value;
+    while (size--) *d++ = val;
 
     return dest;
 }
 
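Side note on the value construction used in both new FastMemset paths: multiplying the fill byte by 0x0101010101010101 replicates it into all eight byte lanes (for example 0xAB becomes 0xABABABABABABABAB). A standalone sketch, for illustration only; BroadcastByte is not part of this patch:

    #include <stdint.h>

    // Replicate one byte across all eight byte lanes of a 64-bit word,
    // e.g. BroadcastByte(0xAB) == 0xABABABABABABABABULL.
    static inline uint64_t BroadcastByte(uint8_t b) {
        return 0x0101010101010101ULL * (uint64_t)b;
    }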
 void* FastMemcpy(void* dest, const void* src, uint64_t size) {
     ASSERT(dest != NULL && src != NULL);
+
+    if (size == 0) return dest;
+
     uint8_t* d = (uint8_t*)dest;
     const uint8_t* s = (const uint8_t*)src;
 
-    if (size >= 8) {
-        // Handle alignment
+    CpuFeatures* features = GetCpuFeatures();
+
+    // Use AVX2 for large copies if available
+    if (features->avx2 && size >= 32) {
+        // Handle initial misalignment
+        while (((uintptr_t)d & 31) != 0 && size > 0) {
+            *d++ = *s++;
+            size--;
+        }
+
+        // AVX2 copy for aligned destination
+        while (size >= 32) {
+            asm volatile(
+                "vmovdqu (%1), %%ymm0\n"
+                "vmovdqa %%ymm0, (%0)\n"
+                :
+                : "r"(d), "r"(s)
+                : "memory", "ymm0"
+            );
+            d += 32;
+            s += 32;
+            size -= 32;
+        }
+
+        asm volatile("vzeroupper" ::: "memory");
+    }
+    else if (features->sse2 && size >= 16) {
+        // Handle alignment for SSE2
+        while (((uintptr_t)d & 15) != 0 && size > 0) {
+            *d++ = *s++;
+            size--;
+        }
+
+        // SSE2 copy
+        while (size >= 16) {
+            asm volatile(
+                "movdqu (%1), %%xmm0\n"
+                "movdqa %%xmm0, (%0)\n"
+                :
+                : "r"(d), "r"(s)
+                : "memory", "xmm0"
+            );
+            d += 16;
+            s += 16;
+            size -= 16;
+        }
+    }
+    else if (size >= 8) {
+        // Handle 8-byte alignment
         while (((uintptr_t)d & 7) != 0 && size > 0) {
             *d++ = *s++;
             size--;
         }
 
         if (((uintptr_t)s & 7) == 0) {
-            // Both aligned - use 64-bit copies with loop unrolling
+            // Both aligned - use 64-bit copies with aggressive unrolling
             uint64_t* d64 = (uint64_t*)d;
             const uint64_t* s64 = (const uint64_t*)s;
 
-            // Unrolled loop for better performance
-            while (size >= 32) {
-                d64[0] = s64[0]; // Copy 32 bytes
-                d64[1] = s64[1]; // in 4 operations
-                d64[2] = s64[2];
-                d64[3] = s64[3];
-                d64 += 4;
-                s64 += 4;
-                size -= 32;
+            // Unroll even more for better performance
+            while (size >= 64) {
+                d64[0] = s64[0]; d64[1] = s64[1]; d64[2] = s64[2]; d64[3] = s64[3];
+                d64[4] = s64[4]; d64[5] = s64[5]; d64[6] = s64[6]; d64[7] = s64[7];
+                d64 += 8;
+                s64 += 8;
+                size -= 64;
             }
 
             while (size >= 8) {
@@ -110,18 +239,19 @@ void* FastMemcpy(void* dest, const void* src, uint64_t size) {
             d = (uint8_t*)d64;
             s = (const uint8_t*)s64;
-        } else {
-            // Source not aligned - use 32-bit copies
-            while (size >= 4 && ((uintptr_t)s & 3) == 0) {
-                *(uint32_t*)d = *(const uint32_t*)s;
-                d += 4;
-                s += 4;
-                size -= 4;
-            }
         }
     }
 
-    // Handle remainder bytes
+    // Handle remainder bytes with potential 4-byte optimization
+    if (size >= 4 && ((uintptr_t)s & 3) == 0 && ((uintptr_t)d & 3) == 0) {
+        while (size >= 4) {
+            *(uint32_t*)d = *(const uint32_t*)s;
+            d += 4;
+            s += 4;
+            size -= 4;
+        }
+    }
+
     while (size > 0) {
         *d++ = *s++;
         size--;
@@ -134,14 +264,32 @@ void FastZeroPage(void* page) {
     ASSERT(page != NULL);
     CpuFeatures* features = GetCpuFeatures();
 
-    if (features->sse2) {
+    if (features->avx2) {
+        // Use AVX2 for faster page zeroing
+        asm volatile("vpxor %%ymm0, %%ymm0, %%ymm0" ::: "ymm0");
+
+        uint8_t* p = (uint8_t*)page;
+        for (int i = 0; i < 4096; i += 32) {
+            asm volatile("vmovdqa %%ymm0, (%0)" : : "r"(p + i) : "memory");
+        }
+
+        asm volatile("vzeroupper" ::: "memory");
+    } else if (features->sse2) {
         asm volatile("pxor %%xmm0, %%xmm0" ::: "xmm0");
 
         uint8_t* p = (uint8_t*)page;
-        for (int i = 0; i < 4096; i += 16) {
-            asm volatile("movdqu %%xmm0, (%0)" : : "r"(p + i) : "memory");
+        // Unroll for better performance
+        for (int i = 0; i < 4096; i += 64) {
+            asm volatile(
+                "movdqa %%xmm0, 0(%0)\n"
+                "movdqa %%xmm0, 16(%0)\n"
+                "movdqa %%xmm0, 32(%0)\n"
+                "movdqa %%xmm0, 48(%0)\n"
+                : : "r"(p + i) : "memory"
+            );
         }
     } else {
+        // Fallback to optimized memset
         FastMemset(page, 0, 4096);
     }
 }
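One caveat on the FastZeroPage change: the switch from movdqu to movdqa, and the new vmovdqa path, both use aligned stores, so they assume page is at least 16/32-byte aligned. That holds for 4 KiB page frames but would fault on an arbitrary pointer. A possible guard, sketched here only and not part of this patch (IsPageAligned is an illustrative helper used with the file's existing ASSERT):

    #include <stdint.h>

    // A 4 KiB-aligned frame satisfies the 16- and 32-byte alignment that
    // movdqa/vmovdqa stores require.
    static inline int IsPageAligned(const void* p) {
        return ((uintptr_t)p & 0xFFF) == 0;
    }

    // Usage sketch at the top of FastZeroPage:
    //     ASSERT(IsPageAligned(page));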
@@ -150,9 +298,36 @@ int FastMemcmp(const void* ptr1, const void* ptr2, uint64_t size) {
     const uint8_t* p1 = (const uint8_t*)ptr1;
     const uint8_t* p2 = (const uint8_t*)ptr2;
 
-    for (uint64_t i = 0; i < size; i++) {
-        if (p1[i] < p2[i]) return -1;
-        if (p1[i] > p2[i]) return 1;
+    // 64-bit comparison for aligned data
+    if (size >= 8 && ((uintptr_t)p1 & 7) == 0 && ((uintptr_t)p2 & 7) == 0) {
+        const uint64_t* q1 = (const uint64_t*)p1;
+        const uint64_t* q2 = (const uint64_t*)p2;
+
+        while (size >= 8) {
+            if (*q1 != *q2) {
+                // Found difference, need to find which byte
+                p1 = (const uint8_t*)q1;
+                p2 = (const uint8_t*)q2;
+                for (int i = 0; i < 8; i++) {
+                    if (p1[i] < p2[i]) return -1;
+                    if (p1[i] > p2[i]) return 1;
+                }
+            }
+            q1++;
+            q2++;
+            size -= 8;
+        }
+        p1 = (const uint8_t*)q1;
+        p2 = (const uint8_t*)q2;
+    }
+
+    // Compare remaining bytes
+    while (size > 0) {
+        if (*p1 < *p2) return -1;
+        if (*p1 > *p2) return 1;
+        p1++;
+        p2++;
+        size--;
     }
     return 0;
 }
\ No newline at end of file
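A possible follow-up for the FastMemcmp change: once two 64-bit words are known to differ, the first differing byte can also be located with XOR plus count-trailing-zeros instead of the inner byte loop. A sketch that assumes little-endian byte order and the GCC/Clang __builtin_ctzll builtin; CompareWordsLE is illustrative and not part of this patch:

    #include <stdint.h>

    // Given two unequal 64-bit words loaded from little-endian memory,
    // return memcmp-style ordering based on the lowest-addressed
    // differing byte.
    static inline int CompareWordsLE(uint64_t a, uint64_t b) {
        uint64_t diff = a ^ b;                         // nonzero, since a != b
        unsigned shift = __builtin_ctzll(diff) & ~7u;  // bit offset of that byte
        uint8_t ba = (uint8_t)(a >> shift);
        uint8_t bb = (uint8_t)(b >> shift);
        return (ba < bb) ? -1 : 1;
    }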