In [2]:
!pip install datasets scikit-learn nltk gensim
!pip install transformers




In [3]:
from datasets import load_dataset

# IMDB veri setini yükleyelim
dataset = load_dataset('imdb')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
import nltk
import re
import string
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Temizleme fonksiyonu
def preprocess(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Noktalama kaldır
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Küçük bir subset alalım (daha hızlı çalışmak için)
train_texts = dataset['train']['text'][:5000]
train_labels = dataset['train']['label'][:5000]
test_texts = dataset['test']['text'][:1000]
test_labels = dataset['test']['label'][:1000]

# Temizleme
train_texts = [preprocess(text) for text in train_texts]
test_texts = [preprocess(text) for text in test_texts]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# 2500 pozitif ve 2500 negatif örnek alalım
train_data = dataset['train']
test_data = dataset['test']

# Sınıflara göre filtrele
train_pos = [example['text'] for example in train_data if example['label'] == 1][:2500]
train_neg = [example['text'] for example in train_data if example['label'] == 0][:2500]

train_texts = train_pos + train_neg
train_labels = [1]*2500 + [0]*2500

# Test verisi için de benzer şekilde
test_pos = [example['text'] for example in test_data if example['label'] == 1][:500]
test_neg = [example['text'] for example in test_data if example['label'] == 0][:500]

test_texts = test_pos + test_neg
test_labels = [1]*500 + [0]*500



In [7]:
import nltk
import re
import string
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

train_texts = [preprocess(text) for text in train_texts]
test_texts = [preprocess(text) for text in test_texts]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# TF-IDF vektörleştirme
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_test_tfidf = vectorizer.transform(test_texts)

# Lojistik Regresyon Modeli
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train_tfidf, train_labels)

# Tahmin ve sonuç
pred_tfidf = clf_tfidf.predict(X_test_tfidf)
print("TF-IDF Sonuçları:\n")
print(classification_report(test_labels, pred_tfidf))


TF-IDF Sonuçları:

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       500
           1       0.88      0.86      0.87       500

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.87      1000
weighted avg       0.88      0.88      0.87      1000



In [9]:
from gensim.models import Word2Vec
import numpy as np

# Tokenizasyon: Her metni kelimelere ayır
train_tokens = [text.split() for text in train_texts]
test_tokens = [text.split() for text in test_texts]

# Word2Vec modeli eğit
w2v_model = Word2Vec(sentences=train_tokens, vector_size=100, window=5, min_count=2, workers=4)

# Ortalama vektör hesaplama fonksiyonu
def get_avg_vector(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Eğitim verisini vektörlere dönüştür
X_train_w2v = np.array([get_avg_vector(tokens, w2v_model, 100) for tokens in train_tokens])

# Test verisini vektörlere dönüştür
X_test_w2v = np.array([get_avg_vector(tokens, w2v_model, 100) for tokens in test_tokens])


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Lojistik Regresyon Modeli
clf_w2v = LogisticRegression(max_iter=1000)
clf_w2v.fit(X_train_w2v, train_labels)

# Tahmin ve değerlendirme
pred_w2v = clf_w2v.predict(X_test_w2v)

# Sonuçları yazdır
print("Word2Vec Sonuçları:\n")
print(classification_report(test_labels, pred_w2v))


Word2Vec Sonuçları:

              precision    recall  f1-score   support

           0       0.72      0.66      0.69       500
           1       0.68      0.75      0.72       500

    accuracy                           0.70      1000
   macro avg       0.70      0.70      0.70      1000
weighted avg       0.70      0.70      0.70      1000



In [11]:
from sklearn.metrics import accuracy_score

# Doğruluk hesapla
acc_w2v = accuracy_score(test_labels, pred_w2v)

print(f"Word2Vec Accuracy: {acc_w2v:.4f}")


Word2Vec Accuracy: 0.7020


In [12]:
import numpy as np


In [13]:
# En sık geçen 500 kelimeyi alalım
top_k = 500
words = list(w2v_model.wv.index_to_key)[:top_k]

# vectors.tsv ve metadata.tsv dosyalarını oluştur
with open("vectors.tsv", "w", encoding="utf-8") as vec_file, open("metadata.tsv", "w", encoding="utf-8") as meta_file:
    for word in words:
        vec = w2v_model.wv[word]
        vec_str = "\t".join([str(x) for x in vec])
        vec_file.write(f"{vec_str}\n")
        meta_file.write(f"{word}\n")


In [14]:
from google.colab import files
files.download("vectors.tsv")
files.download("metadata.tsv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
import numpy as np

# En çok geçen 500 kelimeyi alalım
top_k = 500
words = list(w2v_model.wv.index_to_key)[:top_k]

# vectors.tsv ve metadata.tsv dosyalarını oluştur
with open("vectors.tsv", "w", encoding="utf-8") as vec_file, open("metadata.tsv", "w", encoding="utf-8") as meta_file:
    for word in words:
        try:
            vec = w2v_model.wv[word]
            vec_str = "\t".join(map(str, vec))
            vec_file.write(f"{vec_str}\n")
            meta_file.write(f"{word}\n")
        except KeyError:
            continue


In [16]:
from google.colab import files
files.download("vectors.tsv")
files.download("metadata.tsv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>