In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
# Fetch the NLTK corpora the helpers below rely on: the stop-word lists and
# the WordNet data (plus its multilingual add-on) used by the lemmatizer.
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Shared preprocessing steps
def to_lowercase(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: The string to convert.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed outright rather than replaced by a space, so
    ``"a,b"`` becomes ``"ab"``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without any ``string.punctuation`` characters.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)
def remove_numbers(text, replace_with_tag=False):
    """Remove each run of digits from ``text`` or replace it with a tag.

    Args:
        text: The string to clean.
        replace_with_tag: When true, every run of digits is replaced by the
            literal ``<number>``; otherwise the digits are deleted.

    Returns:
        The string with all digit runs substituted.
    """
    replacement = '<number>' if replace_with_tag else ''
    return re.sub(r'\d+', replacement, text)
# Stop-word sets keyed by language name. Loading a list from the NLTK corpus
# is comparatively slow, and this function is called once per document, so
# each language is loaded a single time and reused.
_STOPWORD_CACHE = {}


def remove_stopwords(text, lang='english'):
    """Remove stop words from ``text``.

    The text is split on whitespace, tokens found in the NLTK stop-word list
    for ``lang`` are dropped, and the remaining tokens are re-joined with
    single spaces. Matching is done on the lowercased token so that
    capitalised stop words ("The", "And") are removed even when the caller
    has not lowercased the text first; surviving tokens keep their original
    casing.

    Args:
        text: The string to filter.
        lang: Language name passed to ``nltk.corpus.stopwords.words``.

    Returns:
        The filtered text as a single space-separated string.
    """
    stop_words = _STOPWORD_CACHE.get(lang)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(lang))
        _STOPWORD_CACHE[lang] = stop_words
    return " ".join(
        word for word in text.split() if word.lower() not in stop_words
    )
def apply_stemming(text):
    """Stem every whitespace-separated token of ``text``.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens, produced by NLTK's ``PorterStemmer`` and joined
        with single spaces.
    """
    porter = PorterStemmer()
    stemmed_tokens = map(porter.stem, text.split())
    return " ".join(stemmed_tokens)
def apply_lemmatization(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed on its own to ``WordNetLemmatizer.lemmatize`` with
    that method's default arguments.

    Args:
        text: The string to process.

    Returns:
        The lemmatized tokens joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(wordnet.lemmatize(token))
    return " ".join(lemmas)
def preprocess_text(
    text,
    lowercase=True,
    remove_punct=True,
    remove_nums=True,
    replace_nums_with_tag=True,
    remove_sw=True,
    apply_stem=False,
    apply_lemma=True,
    lang='english'
):
    """Clean ``text`` by running the enabled preprocessing steps in order.

    The steps always run in this fixed order, each one only if its flag is
    truthy: lowercasing, punctuation removal, number handling, stop-word
    removal, stemming, lemmatization.

    Args:
        text: The string to preprocess.
        lowercase: Convert the text to lowercase.
        remove_punct: Remove punctuation characters.
        remove_nums: Process digit runs (see ``replace_nums_with_tag``).
        replace_nums_with_tag: Only used when ``remove_nums`` is set; replace
            digit runs with ``<number>`` instead of deleting them.
        remove_sw: Remove stop words for ``lang``.
        apply_stem: Apply stemming.
        apply_lemma: Apply lemmatization.
        lang: Language name forwarded to the stop-word removal step.

    Returns:
        The preprocessed text.
    """
    # (enabled?, transformation) pairs, listed in execution order.
    pipeline = (
        (lowercase, to_lowercase),
        (remove_punct, remove_punctuation),
        (remove_nums,
         lambda value: remove_numbers(value, replace_with_tag=replace_nums_with_tag)),
        (remove_sw, lambda value: remove_stopwords(value, lang=lang)),
        (apply_stem, apply_stemming),
        (apply_lemma, apply_lemmatization),
    )
    for enabled, step in pipeline:
        if enabled:
            text = step(text)
    return text

df = pd.read_csv("data.csv")
# Work on at most 10,000 rows. min() keeps this runnable on smaller files,
# where sample(n=10000) without replacement would raise a ValueError.
df = df.sample(n=min(10000, len(df)), random_state=42)
# preprocess_text calls str methods, so rows with a missing review (NaN)
# would crash it; rows without a label cannot be used for training either.
df = df.dropna(subset=['review', 'sentiment'])

# Preprocess the reviews once and reuse the result as the model input.
df['processed_review'] = df['review'].apply(preprocess_text)
texts = df['processed_review']  # cleaned review texts

# Convert the labels to numeric values (0: negative, 1: positive).
# Labels are only normalised for case and surrounding whitespace; they are
# deliberately NOT sent through the text pipeline (stop words, lemmatizer).
labels = (
    df['sentiment']
    .astype(str)
    .str.strip()
    .str.lower()
    .map({'positive': 1, 'negative': 0})
)
if labels.isna().any():
    # Fail early with a clear message instead of passing NaN labels to SVC.
    unexpected = sorted(df.loc[labels.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels: {unexpected}")

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Bag-of-Words (BoW) representation; max_features caps the vocabulary size
bow_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))  # unigrams and bigrams
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # unigrams and bigrams
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


def _train_and_score_svm(train_features, test_features, train_labels, test_labels):
    """Fit an RBF-kernel SVC and evaluate it on the held-out features.

    Returns:
        A tuple ``(model, predictions, accuracy, report)`` where ``report``
        is the text produced by ``classification_report``.
    """
    # `degree` is intentionally not set: it only affects the 'poly' kernel.
    model = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    report = classification_report(test_labels, predictions)
    return model, predictions, accuracy, report


# SVM model (Bag-of-Words features)
svm_bow, y_pred_bow, accuracy_bow, bow_report = _train_and_score_svm(
    X_train_bow, X_test_bow, y_train, y_test
)

# SVM model (TF-IDF features)
svm_tfidf, y_pred_tfidf, accuracy_tfidf, tfidf_report = _train_and_score_svm(
    X_train_tfidf, X_test_tfidf, y_train, y_test
)

# Print the accuracies and classification reports
print(f"Bag-of-Words SVM Accuracy: {accuracy_bow:.2f}")
print("Bag-of-Words Classification Report:")
print(bow_report)

print(f"TF-IDF SVM Accuracy: {accuracy_tfidf:.2f}")
print("TF-IDF Classification Report:")
print(tfidf_report)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Bag-of-Words SVM Accuracy: 0.86
Bag-of-Words Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.82      0.85       999
           1       0.83      0.90      0.86      1001

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000

TF-IDF SVM Accuracy: 0.88
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.85      0.87       999
           1       0.86      0.91      0.88      1001

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000



In [4]:
import os

import joblib

# joblib.dump does not create missing parent folders, so make sure the
# target directory exists before writing into it.
os.makedirs('models/svm', exist_ok=True)

# Save the TF-IDF model together with the vectorizer it was trained with
joblib.dump(svm_tfidf, 'models/svm/svm_model_tfidf.joblib')
joblib.dump(tfidf_vectorizer, 'models/svm/svm_tfidf_vectorizer.joblib')

# Save the Bag-of-Words model as well. Its fitted vectorizer is stored next
# to it because the model cannot score new text without the same vocabulary.
joblib.dump(bow_vectorizer, 'models/svm/svm_bow_vectorizer.joblib')
joblib.dump(svm_bow, 'models/svm/svm_model_bow.joblib')


['models/svm/svm_model_bow.joblib']

In [19]:
import joblib

# Restore the persisted TF-IDF vectorizer and the SVM that was trained on its
# features. NOTE: joblib files are pickle-based; only load files you trust.
loaded_tfidf_vectorizer = joblib.load('models/svm/svm_tfidf_vectorizer.joblib')
loaded_svm_model = joblib.load('models/svm/svm_model_tfidf.joblib')

# An example of a newly arrived review
new_review = "dont watch this movie because it is waste of time"

# Apply the same preprocessing that was used for the training data, then
# turn the cleaned text into a one-row feature matrix.
processed_new_review = preprocess_text(new_review)
X_new = loaded_tfidf_vectorizer.transform([processed_new_review])

# Predict the sentiment label (0: negative, 1: positive)
prediction = loaded_svm_model.predict(X_new)
print("Tahmin Sonucu:", prediction)


Tahmin Sonucu: [0]
