# **Kendi Yazdığım Fonksiyonlarla Oluşturduğum Program**

In [25]:
# train data

x = [['free', 'free', 'free', 'buy', 'discount', 'combo', 'pleasure'],
     ['free', 'free', 'discount', 'pleasure', 'smile', 'smile', 'smile'],
     ['cat', 'mouse'],
     ['cat', 'cat', 'dog', 'dog', 'dog', 'dog'],
     ['mouse']]
     
y = ['S', 'S', 'N', 'N', 'N']

In [26]:
import math

def calculate_mi_probabilities(x, y):
    word_counts = {}
    class_counts = {'S': 0, 'N': 0}

    for doc, label in zip(x, y):
        class_counts[label] += 1
        for word in doc:
            if word not in word_counts:
                word_counts[word] = {'S': 0, 'N': 0}

            word_counts[word][label] += 1

    word_probabilities = {}

    for word in word_counts:
        P_W0_Cs = word_counts[word]['S'] / class_counts['S']
        P_W0_Cn = word_counts[word]['N'] / class_counts['N']
        
        P_W1_Cs = 1 - P_W0_Cs
        P_W1_Cn = 1 - P_W0_Cn

        epsilon = 1e-10  # epsilon değeri
        if P_W1_Cn <= 0:
            P_W1_Cn = epsilon
        if P_W1_Cs <= 0:
            P_W1_Cs = epsilon
        if P_W0_Cn <= 0:
            P_W0_Cn = epsilon
        if P_W0_Cs <= 0:
            P_W0_Cs = epsilon

        MI = P_W0_Cs * math.log2(P_W0_Cs / (P_W0_Cs + P_W0_Cn)) \
             + P_W1_Cs * math.log2(P_W1_Cs / (P_W1_Cs + P_W1_Cn)) \
             + P_W0_Cn * math.log2(P_W0_Cn / (P_W0_Cs + P_W0_Cn)) \
             + P_W1_Cn * math.log2(P_W1_Cn / (P_W1_Cs + P_W1_Cn))
        
        word_probabilities[word] = {
            'P_W0_Cs': P_W0_Cs,
            'P_W1_Cs': P_W1_Cs,
            'P_W0_Cn': P_W0_Cn,
            'P_W1_Cn': P_W1_Cn,
            'MI': MI
        }

    return word_probabilities

In [52]:
# Sonuçlar
word_probabilities = calculate_mi_probabilities(x, y)

# MI değerlerine göre en büyük iki değeri seçme
top2_words = sorted(word_probabilities.items(), key=lambda x: x[1]['MI'], reverse=True)[:2]

top2_words

[('discount',
  {'P_W0_Cs': 1.0,
   'P_W1_Cs': 1e-10,
   'P_W0_Cn': 1e-10,
   'P_W1_Cn': 1.0,
   'MI': -6.932395221869623e-09}),
 ('pleasure',
  {'P_W0_Cs': 1.0,
   'P_W1_Cs': 1e-10,
   'P_W0_Cn': 1e-10,
   'P_W1_Cn': 1.0,
   'MI': -6.932395221869623e-09})]

In [50]:
for word in word_probabilities:
    print("Word:", word)
    print("P(W=0, C=S):", word_probabilities[word]['P_W0_Cs'])
    print("P(W=1, C=S):", word_probabilities[word]['P_W1_Cs'])
    print("P(W=0, C=N):", word_probabilities[word]['P_W0_Cn'])
    print("P(W=1, C=N):", word_probabilities[word]['P_W1_Cn'])
    print("MI:", word_probabilities[word]['MI'])
    print()


Word: free
P(W=0, C=S): 2.5
P(W=1, C=S): 1e-10
P(W=0, C=N): 1e-10
P(W=1, C=N): 1.0
MI: -7.064588031345375e-09

Word: buy
P(W=0, C=S): 0.5
P(W=1, C=S): 0.5
P(W=0, C=N): 1e-10
P(W=1, C=N): 1.0
MI: -1.377443754447932

Word: discount
P(W=0, C=S): 1.0
P(W=1, C=S): 1e-10
P(W=0, C=N): 1e-10
P(W=1, C=N): 1.0
MI: -6.932395221869623e-09

Word: combo
P(W=0, C=S): 0.5
P(W=1, C=S): 0.5
P(W=0, C=N): 1e-10
P(W=1, C=N): 1.0
MI: -1.377443754447932

Word: pleasure
P(W=0, C=S): 1.0
P(W=1, C=S): 1e-10
P(W=0, C=N): 1e-10
P(W=1, C=N): 1.0
MI: -6.932395221869623e-09

Word: smile
P(W=0, C=S): 1.5
P(W=1, C=S): 1e-10
P(W=0, C=N): 1e-10
P(W=1, C=N): 1.0
MI: -6.990891471934525e-09

Word: cat
P(W=0, C=S): 1e-10
P(W=1, C=S): 1.0
P(W=0, C=N): 1.0
P(W=1, C=N): 1e-10
MI: -6.932395221869623e-09

Word: mouse
P(W=0, C=S): 1e-10
P(W=1, C=S): 1.0
P(W=0, C=N): 0.6666666666666666
P(W=1, C=N): 0.33333333333333337
MI: -1.0817041693532121

Word: dog
P(W=0, C=S): 1e-10
P(W=1, C=S): 1.0
P(W=0, C=N): 1.3333333333333333
P(W=1, C=N)

In [43]:

def calculate_tfidf_score(word, document):
    term_frequency = document.count(word)
    normalized_term_frequency = term_frequency / len(document)
    inverse_document_frequency = math.log(1 / (term_frequency + 1))
    tfidf_score = normalized_term_frequency * inverse_document_frequency
    return tfidf_score


In [44]:
# Her bir train dokümanını seçilen iki kelimeyle temsil etme
representations = []
for document in x:
    representation = []
    for word, _ in top2_words:
        tfidf_score = calculate_tfidf_score(word, document)
        representation.append(tfidf_score)
    representations.append(representation)

print("Training Data Representations:")
for doc, representation in zip(x, representations):
    print("Document:", doc)
    print("Representation:", representation)
    print()

Training Data Representations:
Document: ['free', 'free', 'free', 'buy', 'discount', 'combo', 'pleasure']
Representation: [-0.09902102579427789, -0.09902102579427789]

Document: ['free', 'free', 'discount', 'pleasure', 'smile', 'smile', 'smile']
Representation: [0.0, 0.0]

Document: ['cat', 'mouse']
Representation: [0.0, 0.0]

Document: ['cat', 'cat', 'dog', 'dog', 'dog', 'dog']
Representation: [0.0, 0.0]

Document: ['mouse']
Representation: [0.0, 0.0]



In [45]:
# Test data
test_data = [['dog', 'cat', 'mouse', 'cat'],
             ['Free', 'free', 'smile']]

In [46]:

test_representations = []
for document in test_data:
    representation = []
    for word, _ in top2_words:
        tfidf_score = calculate_tfidf_score(word, document)
        representation.append(tfidf_score)
    test_representations.append(representation)

print("Test Data Representations:")
for doc, representation in zip(test_data, test_representations):
    print("Document:", doc)
    print("Representation:", representation)
    print()

Test Data Representations:
Document: ['dog', 'cat', 'mouse', 'cat']
Representation: [0.0, 0.0]

Document: ['Free', 'free', 'smile']
Representation: [0.0, 0.0]



In [47]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(representations, y)
predictions = knn.predict(test_representations)


print("Test Data Predictions:")
for doc, prediction in zip(test_data, predictions):
    print("Document:", doc)
    print("Prediction:", prediction)
    print()


Test Data Predictions:
Document: ['dog', 'cat', 'mouse', 'cat']
Prediction: N

Document: ['Free', 'free', 'smile']
Prediction: N



# **Hazır Kütüphane Fonksiyonlarıyla Oluşturduğum Program**

In [53]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier

In [54]:
documents = [
    'free free free buy discount combo pleasure',             # d1
    'free free free discount pleasure smile smile smile',     # d2
    'cat mouse',                                              # d3
    'cat cat dog dog dog dog',                                # d4
    'mouse',                                                  # d5
]

classes = ['S', 'S', 'N', 'N', 'N']

all_documents = [' '.join(doc.split()) for doc in documents]

In [55]:
vectorizer = TfidfVectorizer()

tfidf_scores = vectorizer.fit_transform(all_documents).toarray()

In [56]:
mi_scores = mutual_info_classif(tfidf_scores, classes)

word_mi_scores = list(zip(vectorizer.get_feature_names_out(), mi_scores))
word_mi_scores.sort(key=lambda x: x[1], reverse=True)

selected_words = [word for word, _ in word_mi_scores[:2]]

print("Selected words:", selected_words)

Selected words: ['discount', 'free']


In [57]:
d6 = ['dog', 'cat', 'mouse', 'cat']
d6_vector = vectorizer.transform([' '.join(d6)]).toarray()

d7 = ['Free', 'free', 'smile']
d7_vector = vectorizer.transform([' '.join(d7)]).toarray()

In [58]:
knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(tfidf_scores, classes)

In [59]:
d6_predicted = knn.predict(d6_vector)

print("Predicted class label for d6:", d6_predicted)

d7_predicted = knn.predict(d7_vector)

print("Predicted class label for d7:", d7_predicted)

Predicted class label for d6: ['N']
Predicted class label for d7: ['S']
