In [42]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
!pip install pyspark





In [44]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("FilmYorumAnalizi").getOrCreate()
print("Apache Spark başarıyla başlatıldı!")

Apache Spark başarıyla başlatıldı!


In [45]:
df = spark.read.json("/content/drive/MyDrive/etiketlenmis_yorumlar_line.json")
df.show(5)  # İlk 5 satırı göster


+-----------------+--------------------+-------+---------+
|           author|             content|film_id|sentiment|
+-----------------+--------------------+-------+---------+
|         llgolden|      best movi ever|     13|      İyi|
|     bluedevil667|      best movi ever|     13|      İyi|
|       CinemaSerf|forrest tom hank ...|     13|      İyi|
|        Ian Beale|ponder meander ep...|     28|      İyi|
|Rocketeer Raccoon|think lot peopl t...|     28|     Kötü|
+-----------------+--------------------+-------+---------+
only showing top 5 rows



In [46]:
df.printSchema()  # Veri çerçevesinin yapısını gösterir
df.show(5)  # İlk 5 satırı ekrana yazdır


root
 |-- author: string (nullable = true)
 |-- content: string (nullable = true)
 |-- film_id: long (nullable = true)
 |-- sentiment: string (nullable = true)

+-----------------+--------------------+-------+---------+
|           author|             content|film_id|sentiment|
+-----------------+--------------------+-------+---------+
|         llgolden|      best movi ever|     13|      İyi|
|     bluedevil667|      best movi ever|     13|      İyi|
|       CinemaSerf|forrest tom hank ...|     13|      İyi|
|        Ian Beale|ponder meander ep...|     28|      İyi|
|Rocketeer Raccoon|think lot peopl t...|     28|     Kötü|
+-----------------+--------------------+-------+---------+
only showing top 5 rows



In [47]:
print("Toplam yorum sayısı:", df.count())

Toplam yorum sayısı: 928


In [48]:
import json

data = []
with open("/content/drive/MyDrive/etiketlenmis_yorumlar_line.json", "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line.strip()))

print("JSON dosyasındaki toplam yorum sayısı:", len(data))

JSON dosyasındaki toplam yorum sayısı: 928


In [49]:
df.groupBy("sentiment").count().show()


+---------+-----+
|sentiment|count|
+---------+-----+
|     Kötü|  132|
|      İyi|  796|
+---------+-----+



In [50]:
df.groupBy("film_id", "sentiment").count().orderBy("count", ascending=False).show(10)


+-------+---------+-----+
|film_id|sentiment|count|
+-------+---------+-----+
| 533535|      İyi|   18|
| 299534|      İyi|   17|
| 157336|      İyi|   17|
| 324857|      İyi|   17|
| 872585|      İyi|   16|
| 496243|      İyi|   14|
| 299536|      İyi|   14|
| 575264|      İyi|   13|
|    278|      İyi|   13|
| 263115|      İyi|   12|
+-------+---------+-----+
only showing top 10 rows



In [51]:
df.createOrReplaceTempView("yorumlar")


In [52]:
spark.sql("""
    SELECT film_id, COUNT(*) AS iyi_yorum_sayisi
    FROM yorumlar
    WHERE sentiment = 'İyi'
    GROUP BY film_id
    ORDER BY iyi_yorum_sayisi DESC
    LIMIT 10
""").show()


+-------+----------------+
|film_id|iyi_yorum_sayisi|
+-------+----------------+
| 533535|              18|
| 157336|              17|
| 324857|              17|
| 299534|              17|
| 872585|              16|
| 496243|              14|
| 299536|              14|
|    278|              13|
| 575264|              13|
| 426063|              12|
+-------+----------------+



In [53]:
spark.sql("""
    SELECT film_id, COUNT(*) AS kotu_yorum_sayisi
    FROM yorumlar
    WHERE sentiment = 'Kötü'
    GROUP BY film_id
    ORDER BY kotu_yorum_sayisi DESC
    LIMIT 10
""").show()


+-------+-----------------+
|film_id|kotu_yorum_sayisi|
+-------+-----------------+
| 299536|                6|
|    278|                5|
| 872585|                4|
| 933260|                3|
|    155|                3|
|1197306|                3|
|    550|                3|
| 324857|                3|
|    141|                3|
| 558449|                3|
+-------+-----------------+



In [54]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Yorumları kelimelere ayırma (Tokenization)
tokenizer = Tokenizer(inputCol="content", outputCol="words")
df = tokenizer.transform(df)

# Stop words temizleme
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = stopwords_remover.transform(df)

# Hashing TF (Kelime vektörleri oluşturma)
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=2000)
df = hashing_tf.transform(df)

# IDF (Kelime önemini hesaplama)
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(df)
df = idf_model.transform(df)

# Şimdi veri çerçevesinin şemasını tekrar kontrol edelim:
df.printSchema()
df.show(5)



root
 |-- author: string (nullable = true)
 |-- content: string (nullable = true)
 |-- film_id: long (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)

+-----------------+--------------------+-------+---------+--------------------+--------------------+--------------------+--------------------+
|           author|             content|film_id|sentiment|               words|      filtered_words|        raw_features|            features|
+-----------------+--------------------+-------+---------+--------------------+--------------------+--------------------+--------------------+
|         llgolden|      best movi ever|     13|      İyi|  [best, movi, ever]|  [best, movi, ever]|(2000,[763,1083,1...|(2000,[763,1083,1...|
|     

In [55]:

from pyspark.ml.feature import StringIndexer
if "sentiment_index" not in df.columns:
        indexer = StringIndexer(inputCol="sentiment", outputCol="sentiment_index")
        df = indexer.fit(df).transform(df)
        print("Sentiment sütunu sayısal değere dönüştürüldü!")
else:
    print("Sentiment_index sütunu zaten mevcut, tekrar oluşturulmadı.")




Sentiment sütunu sayısal değere dönüştürüldü!


In [56]:
# Veriyi %80 eğitim, %20 test
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

print(f"Eğitim veri seti boyutu: {train_data.count()}, Test veri seti boyutu: {test_data.count()}")


Eğitim veri seti boyutu: 777, Test veri seti boyutu: 151


In [57]:
from pyspark.ml.classification import LogisticRegression


lr = LogisticRegression(featuresCol="features", labelCol="sentiment_index")
model = lr.fit(train_data)

print("Model başarıyla eğitildi!")


Model başarıyla eğitildi!


In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


predictions = model.transform(test_data)


evaluator = MulticlassClassificationEvaluator(labelCol="sentiment_index", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Model doğruluk oranı: {accuracy:.2f}")


Model doğruluk oranı: 0.72


In [59]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.01, 0.001])  # Düzenleme parametresi
             .addGrid(lr.maxIter, [10, 50, 100])  # Maksimum iterasyon sayısı
             .build())

print("Hiperparametreler başarıyla tanımlandı!")


Hiperparametreler başarıyla tanımlandı!


In [60]:

crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="sentiment_index", metricName="accuracy"),
                          numFolds=3)  # 3 katlı çapraz doğrulama


cv_model = crossval.fit(train_data)

print("Çapraz doğrulama tamamlandı, en iyi model seçildi!")


Çapraz doğrulama tamamlandı, en iyi model seçildi!


In [None]:

best_predictions = cv_model.transform(test_data)

# Model doğruluk oranını yeniden hesapla
best_evaluator = MulticlassClassificationEvaluator(labelCol="sentiment_index", metricName="accuracy")
best_accuracy = best_evaluator.evaluate(best_predictions)

print(f"En iyi modelin doğruluk oranı: {best_accuracy:.2f}")


En iyi modelin doğruluk oranı: 0.85


In [None]:
cv_model.bestModel.write().overwrite().save("en_iyi_logistic_regression_model")
print("En iyi Logistic Regression modeli başarıyla kaydedildi!")



En iyi Logistic Regression modeli başarıyla kaydedildi!


In [None]:
from pyspark.ml.classification import LogisticRegressionModel

# Kaydedilen en iyi modeli yükle
loaded_model = LogisticRegressionModel.load("en_iyi_logistic_regression_model")

# Modeli test veri seti üzerinde çalıştır
loaded_predictions = loaded_model.transform(test_data)

# Model doğruluk oranını yeniden hesapla
loaded_evaluator = MulticlassClassificationEvaluator(labelCol="sentiment_index", metricName="accuracy")
loaded_accuracy = loaded_evaluator.evaluate(loaded_predictions)

print(f"Yeniden yüklenen modelin doğruluk oranı: {loaded_accuracy:.2f}")



Yeniden yüklenen modelin doğruluk oranı: 0.85


In [None]:
from pyspark.sql import Row

# Yeni İngilizce yorumlar için test veri seti oluştur
new_data = spark.createDataFrame([
    Row(content="I loved this movie, it was amazing!", film_id="A001"),
    Row(content="The script was terrible, I didn't like it at all.", film_id="B002"),
    Row(content="The acting was brilliant, I really enjoyed it!", film_id="C003"),
    Row(content="So boring, total waste of time.", film_id="D004"),
    Row(content="The effects were stunning, a visual masterpiece!", film_id="E005")
])

# Yeni veri setini modelin istediği forma dönüştürelim
new_data = tokenizer.transform(new_data)
new_data = stopwords_remover.transform(new_data)
new_data = hashing_tf.transform(new_data)
new_data = idf_model.transform(new_data)

# Modeli kullanarak tahmin yap
new_predictions = loaded_model.transform(new_data)

# Sonuçları göster
new_predictions.select("content", "prediction").show()


+--------------------+----------+
|             content|prediction|
+--------------------+----------+
|I loved this movi...|       0.0|
|The script was te...|       0.0|
|The acting was br...|       0.0|
|So boring, total ...|       0.0|
|The effects were ...|       0.0|
+--------------------+----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

def tahmin(yorum_metni):
    # 1. Girdiyi DataFrame'e çevir
    spark = SparkSession.builder.getOrCreate()
    data = spark.createDataFrame([Row(content=yorum_metni)])

    # 2. Metin işleme adımları
    data = tokenizer.transform(data)
    data = stopwords_remover.transform(data)
    data = hashing_tf.transform(data)
    data = idf_model.transform(data)

    # 3. Model ile tahmin
    new_predictions = loaded_model.transform(data)
    new_predictions.show()
    if new_predictions.select("prediction").first()[0] == 0.0:
        print("Tahmin: Kötü")
    else:
        print("Tahmin: İyi")

In [None]:
yorum=input("Yorumunuz nedir?")
tahmin(yorum)

Yorumunuz nedir?Not sure how many times I've seen this (first time was on VHS ~1986 or 87 I think) but still an incredibly fun sci-fi comedy with Fox and Lloyd working perfectly off one another. Holds up so well each and every time I re-visit.
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             content|               words|      filtered_words|        raw_features|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Not sure how many...|[not, sure, how, ...|[sure, many, time...|(2000,[107,157,19...|(2000,[107,157,19...|[1.25416380079309...|[0.77801980272400...|       0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------

In [None]:
yorum=input("Yorumunuz nedir?")
tahmin(yorum)

Yorumunuz nedir?think lot peopl think film classic derang memor scene film drop explod napalm guy hat say love smell napalm morn rest film complet bore bombshel like film high 
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             content|               words|      filtered_words|        raw_features|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|think lot peopl t...|[think, lot, peop...|[think, lot, peop...|(2000,[94,116,189...|(2000,[94,116,189...|[0.31686230271588...|[0.57855938153104...|       0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+

Tahmin: Kötü
