# Encoding Methods Comparative Analysis

## Objective
Analyze and compare different encoding techniques for categorical variables.

In [10]:
pip install memory-profiler




In [None]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from memory_profiler import profile
from sklearn.preprocessing import LabelEncoder

## 1. Base Encoder Class
Implement your encoding methods in this class

In [None]:
from sklearn.preprocessing import LabelEncoder
import hashlib

class EncodingAnalyzer:
    def __init__(self):
        """Initialize your encoding tracker here"""
        self.label_encoder = LabelEncoder()


    def basic_ascii_encode(self, text: str) -> int:
        """Convert text to ASCII values and return the sum"""
        return sum(ord(char) for char in text)


    def weighted_ascii_encode(self, text: str) -> int:
        """Weighted ASCII encoding: Multiply ASCII values by their positions"""
        return sum((i + 1) * ord(char) for i, char in enumerate(text))



    def hash_based_encode(self, text: str) -> int:
        """Custom hash encoding using MD5"""
        return int(hashlib.md5(text.encode()).hexdigest(), 16) % (10**8)


    def label_encode(self, text: str) -> int:
        """Use sklearn's LabelEncoder to encode text"""
        self.label_encoder.fit([text])
        return self.label_encoder.transform([text])[0]



## 2. Performance Analysis Tools
Implement your analysis methods here

In [None]:

class PerformanceAnalyzer:

    def measure_time(self, func, data):
        """Measure encoding time"""
        start_time = time.time()
        result = func(data)
        end_time = time.time()
        elapsed_time = end_time - start_time
        return elapsed_time, result

    @memory_profiler.profile
    def measure_memory(self, func, data):
        """Measure memory usage"""
        result = func(data)
        return result

    def check_collisions(self, encoded_data, original_data):
        """Find encoding collisions"""
        unique_values = len(set(encoded_data))
        total_values = len(original_data)
        collisions = total_values - unique_values

        collision_rate = (collisions / total_values) * 100
        return collisions, collision_rate


## 3. Test Scenarios
Test your implementations with these cases

In [None]:

test_scenarios = {
    'basic': ['cat', 'act', 'tac'],
    'length': ['short', 'very_long_string_test'],
    'special': ['hello!', 'hello?', 'hello.'],
    'case': ['Test', 'test', 'TEST'],
    'numeric': ['user1', '1user', 'user_1']
}


def sample_encoding(text):
    return sum(ord(char) for char in text)
# Run tests
for scenario, test_cases in test_scenarios.items():
    print(f"\nTesting Scenario: {scenario}")

    encoded_results = [sample_encoding(text) for text in test_cases]

    for text, encoded in zip(test_cases, encoded_results):
        print(f"Original: {text} -> Encoded: {encoded}")



Testing Scenario: basic
Original: cat -> Encoded: 312
Original: act -> Encoded: 312
Original: tac -> Encoded: 312

Testing Scenario: length
Original: short -> Encoded: 560
Original: very_long_string_test -> Encoded: 2282

Testing Scenario: special
Original: hello! -> Encoded: 565
Original: hello? -> Encoded: 595
Original: hello. -> Encoded: 578

Testing Scenario: case
Original: Test -> Encoded: 416
Original: test -> Encoded: 448
Original: TEST -> Encoded: 320

Testing Scenario: numeric
Original: user1 -> Encoded: 496
Original: 1user -> Encoded: 496
Original: user_1 -> Encoded: 591


## 4. Results Visualization
Create your performance comparison visualizations

In [None]:
def plot_results(results):
    """Create performance comparison plots"""
    # TODO: Implement visualization
    pass

In [12]:
analyzer = EncodingAnalyzer()


# Define encoding functions
encoding_methods = {
    "Basic ASCII": analyzer.basic_ascii_encode,
    "Weighted ASCII": analyzer.weighted_ascii_encode,
    "Hash Encoding": analyzer.hash_based_encode,
    "Label Encoding": analyzer.label_encode
}

# Run tests
results = run_tests(analyzer, encoding_methods)

# Plot results
plot_results(results)


NameError: name 'run_tests' is not defined

## 5. Analysis Template

Complete this analysis for each encoding method:

1. Implementation Complexity:
   - Lines of code:
   - Time to implement:
   - Key challenges:

2. Performance Metrics:
   - Average encoding time:
   - Memory footprint:
   - Collision rate:

3. Advantages:
   - [Your findings]

4. Disadvantages:
   - [Your findings]

5. Edge Cases:
   - [Your findings]

6. Real-world Applicability:
   - [Your analysis]