**One Hot Encoding**

In [52]:
pip install scikit-learn



In [53]:
from sklearn.preprocessing import OneHotEncoder

# Samples to encode: one row per observation, each holding a single city name.
cities = [["İstanbul"], ["Ankara"], ["İzmir"], ["Bursa"]]

# Explicit category list for that single feature, handed to the encoder below.
categories_list = [["Ankara", "Bursa", "İstanbul", "İzmir"]]

# One-hot encoding step (sparse_output=False, so the printed result is a plain array).
encoder = OneHotEncoder(
    categories=categories_list,
    sparse_output=False,
)
encoder.fit(cities)
one_hot = encoder.transform(cities)

print(one_hot)



[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]]


**Binary Encoding**

In [54]:
pip install pandas



In [55]:
pip install category-encoders



In [56]:
import pandas as pd
import category_encoders as ce

# Single-column frame of city names; some cities appear more than once.
df = pd.DataFrame(
    ["İstanbul", "Ankara", "İzmir", "Bursa", "Ankara", "İstanbul"],
    columns=["Şehir"],
)

# Binary encoding step, applied to the "Şehir" column only.
encoder = ce.BinaryEncoder(cols=["Şehir"])
df_encoded = encoder.fit_transform(df)

print(df_encoded)


   Şehir_0  Şehir_1  Şehir_2
0        0        0        1
1        0        1        0
2        0        1        1
3        1        0        0
4        0        1        0
5        0        0        1


**Bag of Words Modeli**



1. Boolean (İkili) Skorlama (0 ve 1 kullanımı)




In [57]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short English sentences.
documents = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]

# Build the bag-of-words model with binary=True (boolean scoring).
vectorizer = CountVectorizer(binary=True)
vectorizer.fit(documents)
X = vectorizer.transform(documents)

# Show the learned vocabulary, then the document-term matrix as a dense array.
print("Kelime Sözlüğü:", vectorizer.get_feature_names_out())
print("BoW Matrisi:\n", X.toarray())


Kelime Sözlüğü: ['age' 'best' 'foolishness' 'it' 'of' 'the' 'times' 'was' 'wisdom' 'worst']
BoW Matrisi:
 [[0 1 0 1 1 1 1 1 0 0]
 [0 0 0 1 1 1 1 1 0 1]
 [1 0 0 1 1 1 0 1 1 0]
 [1 0 1 1 1 1 0 1 0 0]]




2. Count (Kelime Frekansı) Skorlama



In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short Turkish sentences; "kedi" occurs twice in the first one.
documents = [
    "Kedi hızlıdır ve kedi koşar.",
    "Kedi çok sevimlidir.",
    "Köpek havlar, kedi miyavlar.",
]

# Bag-of-words model with word frequencies (default CountVectorizer settings).
vectorizer = CountVectorizer()
X = vectorizer.fit(documents).transform(documents)

# Show the learned vocabulary, then the document-term matrix as a dense array.
print("Kelime Sözlüğü:", vectorizer.get_feature_names_out())
print("BoW Matrisi:\n", X.toarray())

Kelime Sözlüğü: ['havlar' 'hızlıdır' 'kedi' 'koşar' 'köpek' 'miyavlar' 'sevimlidir' 've'
 'çok']
BoW Matrisi:
 [[0 1 2 1 0 0 0 1 0]
 [0 0 1 0 0 0 1 0 1]
 [1 0 1 0 1 1 0 0 0]]




3. TF-IDF (Term Frequency – Inverse Document Frequency) Skorlama



In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short Turkish sentences.
documents = [
    "Kedi hızlıdır ve kedi koşar.",
    "Kedi çok sevimlidir.",
    "Köpek havlar, kedi miyavlar.",
]

# TF-IDF model (default TfidfVectorizer settings).
vectorizer = TfidfVectorizer()
vectorizer.fit(documents)
X = vectorizer.transform(documents)

# Show the learned vocabulary, then the TF-IDF matrix as a dense array.
print("Kelime Sözlüğü:", vectorizer.get_feature_names_out())
print("TF-IDF Matrisi:\n", X.toarray())

Kelime Sözlüğü: ['havlar' 'hızlıdır' 'kedi' 'koşar' 'köpek' 'miyavlar' 'sevimlidir' 've'
 'çok']
TF-IDF Matrisi:
 [[0.         0.4769856  0.56343076 0.4769856  0.         0.
  0.         0.4769856  0.        ]
 [0.         0.         0.38537163 0.         0.         0.
  0.65249088 0.         0.65249088]
 [0.54645401 0.         0.32274454 0.         0.54645401 0.54645401
  0.         0.         0.        ]]


**n-gram**

In [59]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
from collections import Counter

def generate_ngrams(text: str, n: int) -> list:
    """Tokenize ``text`` and return its n-grams.

    The text is split with NLTK's ``TreebankWordTokenizer`` and the token
    sequence is passed to ``nltk.util.ngrams``.

    Args:
        text: The string to tokenize.
        n: Number of tokens per n-gram; must be at least 1.

    Returns:
        A list of tuples, each holding ``n`` consecutive tokens, in the order
        in which they occur in ``text``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Reject sizes that cannot describe an n-gram up front instead of handing
    # them to nltk, where the result would not be a meaningful n-gram list.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    tokenizer = TreebankWordTokenizer()
    words = tokenizer.tokenize(text)
    return list(ngrams(words, n))


# Example sentence for the n-gram demonstration.
text = "Today the weather is very nice and sunny."

# Build the 1-, 2- and 3-token n-gram lists from the same sentence.
unigrams, bigrams, trigrams = (generate_ngrams(text, size) for size in (1, 2, 3))

print("Unigrams:", unigrams)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

# Count how often each bigram occurs and print one "bigram: count" line per entry.
bigram_counts = Counter(bigrams)
print("\nBigram Frequency:")
for bigram, count in bigram_counts.items():
    print(f"{bigram}: {count}")


Unigrams: [('Today',), ('the',), ('weather',), ('is',), ('very',), ('nice',), ('and',), ('sunny',), ('.',)]
Bigrams: [('Today', 'the'), ('the', 'weather'), ('weather', 'is'), ('is', 'very'), ('very', 'nice'), ('nice', 'and'), ('and', 'sunny'), ('sunny', '.')]
Trigrams: [('Today', 'the', 'weather'), ('the', 'weather', 'is'), ('weather', 'is', 'very'), ('is', 'very', 'nice'), ('very', 'nice', 'and'), ('nice', 'and', 'sunny'), ('and', 'sunny', '.')]

Bigram Frequency:
('Today', 'the'): 1
('the', 'weather'): 1
('weather', 'is'): 1
('is', 'very'): 1
('very', 'nice'): 1
('nice', 'and'): 1
('and', 'sunny'): 1
('sunny', '.'): 1
