In [1]:
import pandas as pd
import nltk
import os
import time
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources this script relies on: the Punkt tokenizer
# models (for nltk.word_tokenize), the English stopword list, and WordNet
# (for WordNetLemmatizer).
# 'punkt_tab' is requested alongside the legacy 'punkt' package because
# NLTK 3.9+ loads the tokenizer from 'punkt_tab'; with only 'punkt' installed,
# word_tokenize raises LookupError on those releases. Older releases simply
# keep using 'punkt'.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# 1. Load the data (first 5000 sentences)
# Rows containing a missing value in ANY column are discarded here, not just
# rows with a missing review.
df = pd.read_csv("reviews_cleaned_ultimate.csv").dropna()

# The column-level dropna() is a no-op after the frame-wide one above; it is
# kept so this step stays safe if the frame-wide filter is ever relaxed.
review_series = df['cleanedReview'].dropna()
sentences = review_series.head(5000).tolist()

# 2. Tokenization and stopword removal
stop_words = set(stopwords.words('english'))


def _tokens_without_stopwords(text):
    """Tokenize *text* and drop tokens whose lowercase form is a stopword.

    Surviving tokens keep their original casing; lowercasing is only used
    for the stopword membership test.
    """
    return [tok for tok in nltk.word_tokenize(text) if tok.lower() not in stop_words]


# One token list per input sentence, in the same order as `sentences`.
tokenized = [_tokens_without_stopwords(text) for text in sentences]

# 3. Lemmatization and stemming
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def _normalize_and_join(tokens, transform):
    """Lowercase each token, apply *transform* to it, and join with spaces."""
    return ' '.join(transform(tok.lower()) for tok in tokens)


# Both variants are built from the same token lists so they stay row-aligned.
lemmatized_sentences = [_normalize_and_join(toks, lemmatizer.lemmatize) for toks in tokenized]
stemmed_sentences = [_normalize_and_join(toks, stemmer.stem) for toks in tokenized]

# 4. Build the TF-IDF vectors (vocabulary size capped with max_features)
def _tfidf_table(texts):
    """Fit a TfidfVectorizer limited to 5000 features on *texts*.

    Returns a tuple of (fitted vectorizer, TF-IDF matrix, DataFrame), where
    the DataFrame is the matrix converted to a dense array with one column
    per feature name reported by the vectorizer.
    """
    vec = TfidfVectorizer(max_features=5000)
    matrix = vec.fit_transform(texts)
    table = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
    return vec, matrix, table


# Each text variant gets its own independently fitted vectorizer.
vectorizer_lemma, tfidf_matrix_lemma, df_tfidf_lemma = _tfidf_table(lemmatized_sentences)
vectorizer_stem, tfidf_matrix_stem, df_tfidf_stem = _tfidf_table(stemmed_sentences)

# 5. Output folder (created if missing; an existing folder is not an error)
os.makedirs("vectors", exist_ok=True)

# 6. Save as CSV, without writing the row index
for tfidf_frame, csv_path in (
    (df_tfidf_lemma, "vectors/tfidf_lemmatized_5000.csv"),
    (df_tfidf_stem, "vectors/tfidf_stemmed_5000.csv"),
):
    tfidf_frame.to_csv(csv_path, index=False)

print("TF-IDF çıktıları başarıyla kaydedildi: tfidf_lemmatized_5000.csv ve tfidf_stemmed_5000.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Beyza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Beyza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Beyza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF çıktıları başarıyla kaydedildi: tfidf_lemmatized_5000.csv ve tfidf_stemmed_5000.csv
