<a href="https://colab.research.google.com/github/avinashsagar981/Python/blob/main/Imp_code_snippet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*** How to use enumerate function ***

In [None]:
text = "cat in the hat dog on the mat bird in the tree"

a = text.split()

for i, word in enumerate(a): # or for i, word in enumerate(a,1) so it start with index 1
  print(i, word)

# or
b = {i:word for i, word in enumerate(a)}
b

## **One Hot Encoding**

In [None]:
def one_hot_encode(text):
    words = text.split()
    vocabulary = set(words)
    word_to_index = {word: i for i, word in enumerate(vocabulary)}
    one_hot_encoded = []
    for word in words:
        one_hot_vector = [0] * len(vocabulary)
        one_hot_vector[word_to_index[word]] = 1
        one_hot_encoded.append(one_hot_vector)
    return one_hot_encoded, word_to_index, vocabulary

example_text = "cat in the hat dog on the mat bird in the tree"

one_hot_encoded, word_to_index, vocabulary = one_hot_encode(example_text)

print("Vocabulary:", vocabulary)
print("Word to Index Mapping:", word_to_index)
print("One-Hot Encoded Matrix:")
for word, encoding in zip(example_text.split(), one_hot_encoded):
    print(f"{word}: {encoding}")

Vocabulary: {'dog', 'cat', 'hat', 'tree', 'on', 'the', 'bird', 'in', 'mat'}
Word to Index Mapping: {'dog': 0, 'cat': 1, 'hat': 2, 'tree': 3, 'on': 4, 'the': 5, 'bird': 6, 'in': 7, 'mat': 8}
One-Hot Encoded Matrix:
cat: [0, 1, 0, 0, 0, 0, 0, 0, 0]
in: [0, 0, 0, 0, 0, 0, 0, 1, 0]
the: [0, 0, 0, 0, 0, 1, 0, 0, 0]
hat: [0, 0, 1, 0, 0, 0, 0, 0, 0]
dog: [1, 0, 0, 0, 0, 0, 0, 0, 0]
on: [0, 0, 0, 0, 1, 0, 0, 0, 0]
the: [0, 0, 0, 0, 0, 1, 0, 0, 0]
mat: [0, 0, 0, 0, 0, 0, 0, 0, 1]
bird: [0, 0, 0, 0, 0, 0, 1, 0, 0]
in: [0, 0, 0, 0, 0, 0, 0, 1, 0]
the: [0, 0, 0, 0, 0, 1, 0, 0, 0]
tree: [0, 0, 0, 1, 0, 0, 0, 0, 0]


In [None]:
# One-Hot Encoding Implementation with Detailed Explanation

def one_hot_encode(text):
    # LINE 1: Split the input text into individual words using spaces as delimiters
    # Example: "cat in the hat" becomes ["cat", "in", "the", "hat"]
    words = text.split()

    # LINE 2: Create a set from the words list to get unique words only
    # Sets automatically remove duplicates, so "the" appears only once
    # Example: {"cat", "in", "the", "hat", "dog", "on", "mat", "bird", "tree"}
    vocabulary = set(words)

    # LINE 3: Create a dictionary mapping each unique word to an index number
    # enumerate() gives us (index, word) pairs starting from 0
    # Dictionary comprehension creates {word: index} mapping
    # Example: {"cat": 0, "in": 1, "the": 2, "hat": 3, ...}
    word_to_index = {word: i for i, word in enumerate(vocabulary)}

    # LINE 4: Initialize empty list to store one-hot vectors for each word
    one_hot_encoded = []

    # LINE 5: Loop through each word in the original text (including duplicates)
    for word in words:
        # LINE 6: Create a vector of zeros with length equal to vocabulary size
        # This creates a list like [0, 0, 0, 0, 0, 0, 0, 0, 0] for 9 unique words
        one_hot_vector = [0] * len(vocabulary)

        # LINE 7: Set the position corresponding to current word's index to 1
        # If "cat" has index 0, then one_hot_vector[0] = 1
        # Result: [1, 0, 0, 0, 0, 0, 0, 0, 0] for "cat"
        one_hot_vector[word_to_index[word]] = 1

        # LINE 8: Add this one-hot vector to our result list
        one_hot_encoded.append(one_hot_vector)

    # LINE 9: Return the encoded vectors, word-to-index mapping, and vocabulary
    return one_hot_encoded, word_to_index, vocabulary

# EXECUTION SECTION:

# LINE 10: Define example text with repeated words
example_text = "cat in the hat dog on the mat bird in the tree"

# LINE 11: Call the function and unpack the three returned values
one_hot_encoded, word_to_index, vocabulary = one_hot_encode(example_text)

# LINE 12-13: Print the unique vocabulary (unordered set)
print("Vocabulary:", vocabulary)

# LINE 14-15: Print the word-to-index mapping dictionary
print("Word to Index Mapping:", word_to_index)

# LINE 16-18: Print each word with its corresponding one-hot vector
print("One-Hot Encoded Matrix:")
for word, encoding in zip(example_text.split(), one_hot_encoded):
    print(f"{word}: {encoding}")

# WHAT THIS CODE DOES:
# 1. Takes text input and converts each word to a one-hot encoded vector
# 2. One-hot encoding represents each word as a binary vector where:
#    - Vector length = size of vocabulary (unique words)
#    - Only one position is 1 (hot), rest are 0 (cold)
#    - Each word gets a unique position that's always 1
# 3. This is commonly used in NLP for converting text to numerical format
#    that machine learning models can process

## One Hot Encoding

It can be identified as one of the simplest forms of representing words numerically. Each word is represented as a binary vector with the length of the entire vocabulary. The vector has a “1” in the position corresponding to the word’s index and “0” elsewhere.
###### **Pro’s:** Simple, interpretable.
##### **Con’s:** High dimensionality, doesn’t capture semantic relationships.

In [9]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample data
corpus = ['dog', 'cat', 'dog', 'fish']

# Reshape data to fit the model
corpus = np.array(corpus).reshape(-1, 1)

# One-hot encode the data
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(corpus)

print(onehot_encoded)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (4, 3)>
  Coords	Values
  (0, 1)	1.0
  (1, 0)	1.0
  (2, 1)	1.0
  (3, 2)	1.0


# How to update this file on github

### GITHUB

In [None]:
# Install PyGithub
!pip install PyGithub

In [27]:
from github import Github
import json

# Authenticate
g = Github(userdata.get('Github_Token'))

# Get repository
repo = g.get_repo("avinashsagar981/Python")

# Read notebook content
with open('Imp_code_snippet.ipynb', 'r') as f:
    content = f.read()

try:
    # Try to get existing file
    file = repo.get_contents("Imp_code_snippet.ipynb")
    # Update existing file
    repo.update_file("Imp_code_snippet.ipynb", "Update from Colab", content, file.sha)
    print("✅ File updated successfully!")
except:
    # Create new file if it doesn't exist
    repo.create_file("Imp_code_snippet.ipynb", "Create from Colab", content)
    print("✅ File created successfully!")

✅ File updated successfully!


# Updatted!!!

In [28]:
## 12345