<a href="https://colab.research.google.com/github/aradeyal/machine_learning/blob/main/%D7%97%D7%96%D7%95%D7%99_%D7%A6%D7%99%D7%95%D7%9F_%D7%91%D7%99%D7%A7%D7%95%D7%A8%D7%AA_%D7%9C%D7%A4%D7%99_%D7%98%D7%A7%D7%A1%D7%98%E2%80%8E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from pathlib import Path

# --- Load the reviews ---
# Absolute path at the filesystem root; the file must provide a 'Review'
# text column (read below).
csv_path = Path(r"/tripadvisor_hotel_reviews.csv")
dataset = pd.read_csv(csv_path)

# --- Download the NLTK stopword list ---
nltk.download('stopwords')

# --- Text preprocessing ---
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep 'not' in the text: removing it would drop negation from the reviews.
if 'not' in all_stopwords:
    all_stopwords.remove('not')

# Build the lookup set and compile the pattern once, instead of once per word
# / once per review inside the loop.
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

# Iterate over the column values directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index.
corpus = []
for text in dataset['Review']:
    # Replace every non-ASCII-letter character with a space, lowercase, tokenize.
    words = non_letters.sub(' ', text).lower().split()
    # Drop stopwords first, then reduce the remaining words to their stems.
    stems = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(stems))

# --- Bag of Words + train/test split ---
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

y = dataset.iloc[:, 1].values   # second column: Rating

# Split the raw documents BEFORE vectorizing so that the vocabulary is learned
# from the training reviews only; fitting on the whole corpus would let the
# test reviews influence which words are kept (data leakage).
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=0
)

# Keep at most the 1500 most frequent training terms as features.
cv = CountVectorizer(max_features=1500)
x_train = cv.fit_transform(corpus_train).toarray()
x_test = cv.transform(corpus_test).toarray()   # reuse the training vocabulary

# Shift the labels down by one so they are zero-based class indices.
y_train = y_train - 1
y_test = y_test - 1

# --- ANN model ---
import tensorflow as tf

# Read the input width from the training matrix instead of hard-coding it:
# CountVectorizer's max_features is only an upper bound, so a smaller
# vocabulary would otherwise produce a shape mismatch at fit time.
n_features = x_train.shape[1]

# Two hidden ReLU layers followed by a 5-way softmax (one unit per class).
model = tf.keras.models.Sequential([
    tf.keras.Input((n_features,)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

# Sparse categorical cross-entropy takes integer class labels directly.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# --- Data pipeline ---
train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(1024).batch(32)
# Evaluate in batches of 32 as well: a batch size of 1 runs one forward pass
# per test sample, which only makes evaluation slower.
test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

# --- Training + evaluation ---
print(y_train[:20])  # sanity check: show the first training labels
model.fit(train, epochs=10)
# Left as the final bare expression so the notebook displays the returned
# [loss, accuracy] pair.
model.evaluate(test)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[3 4 3 2 3 3 4 3 4 3 2 4 4 2 1 2 4 3 0 4]
Epoch 1/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5325 - loss: 1.0947
Epoch 2/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6706 - loss: 0.7638
Epoch 3/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7409 - loss: 0.6151
Epoch 4/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8224 - loss: 0.4363
Epoch 5/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.9018 - loss: 0.2697
Epoch 6/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9514 - loss: 0.1496
Epoch 7/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9715 - loss: 0.1009
Epoch 8/10
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9837 - loss: 0.064

[2.682398796081543, 0.5606245398521423]