# Import Libraries

In [31]:
import numpy as np
import pandas as pd
import re
import string
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from app import Flask, request, jsonify

# Data Preprocessing

In [32]:
# Load dataset
# Read the IMDB reviews, then encode the sentiment column as an integer
# label: 'positive' becomes 1 and every other value becomes 0.
df = pd.read_csv('IMDB_Dataset.csv')
df['sentiment'] = (df['sentiment'] == 'positive').astype('int64')

In [33]:
# Text preprocessing
# The regexes and the punctuation table are built once here instead of on
# every call; the NLTK-backed objects are created lazily on first use so that
# defining the function does not itself touch the NLTK corpora.
_HTML_TAG_RE = re.compile(r'<[^<>]*>')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_NLP_RESOURCES = {}  # filled by _get_nlp_resources() on the first call


def _get_nlp_resources():
    """Return (stop_words, tokenizer, lemmatizer), creating them on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['tokenizer'] = WhitespaceTokenizer()
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _NLP_RESOURCES['stop_words'],
        _NLP_RESOURCES['tokenizer'],
        _NLP_RESOURCES['lemmatizer'],
    )


def preprocess_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps, in order: strip HTML-like tags, strip URLs, lowercase, delete the
    characters in ``string.punctuation``, split on whitespace, drop English
    stop words, and lemmatize what remains.

    Args:
        text: Raw review text (str).

    Returns:
        str: The surviving lemmatized tokens joined by single spaces.
    """
    stop_words, tokenizer, lemmatizer = _get_nlp_resources()

    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = text.lower()
    # NOTE(review): punctuation is deleted before stop-word filtering, so
    # contractions collapse (e.g. "didnt"); stop-word entries that contain an
    # apostrophe would then never match -- confirm this is intended.
    text = text.translate(_PUNCT_TABLE)

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(text)
        if word not in stop_words
    )

# Clean every review; Series.map calls preprocess_text once per element.
df['review'] = df['review'].map(preprocess_text)

In [34]:
# Split dataset
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
train, test = train_test_split(df, test_size=0.25, random_state=42)
X_train, X_test = train['review'], test['review']
y_train, y_test = train['sentiment'], test['sentiment']

# Classifier

### Vocabulary, word counts and class priors

In [35]:
# Build Vocabulary
# vocab            -- every distinct token seen in the training reviews
# word_counts      -- token -> [neg_count, pos_count]
# class_doc_counts -- [neg_class_count, pos_class_count] (documents per class)
vocab = set()
word_counts = defaultdict(lambda: [0, 0])
class_doc_counts = [0, 0]

for text, label in zip(X_train, y_train):
    class_doc_counts[label] += 1  # one more document for this class
    words = text.split()
    vocab.update(words)
    for word in words:
        word_counts[word][label] += 1

vocab_size = len(vocab)

In [36]:
# Compute Prior Probabilities
# Each prior is the share of training documents that belong to the class.
total_docs = sum(class_doc_counts)
prior_A = class_doc_counts[0] / total_docs  # P(A): class 0 (negative)
prior_B = class_doc_counts[1] / total_docs  # P(B): class 1 (positive)
prior_A, prior_B

(0.50248, 0.49752)

### Calculating word likelihoods (Laplace smoothing)

In [37]:
# Precompute the total number of word occurrences in each class
total_words_per_class = {
    label: sum(word_counts[w][label] for w in vocab)
    for label in (0, 1)
}

def compute_prob(word, label):
    """Return the Laplace-smoothed likelihood of ``word`` given class ``label``.

    Computed as ``(count(word, label) + 1) / (total_words(label) + len(vocab))``.

    Args:
        word: Token to score.
        label: Class index (0 or 1) used to index the per-word count pair and
            ``total_words_per_class``.

    Returns:
        float: The smoothed probability.
    """
    # Read with .get() instead of indexing: word_counts is a defaultdict, so
    # word_counts[word] would silently insert an entry for every unseen word.
    word_count = word_counts.get(word, (0, 0))[label]
    total_words_in_class = total_words_per_class[label]  # total number of words in that class
    vocab_size = len(vocab)  # number of distinct words (for Laplace smoothing)

    return (word_count + 1) / (total_words_in_class + vocab_size)


### Predict

In [38]:
def predict(text):
    """Classify a preprocessed review with the Naive Bayes model.

    Each class is scored as ``log P(class)`` plus the sum of
    ``log P(word | class)`` over the words of ``text`` that are in ``vocab``;
    the class with the higher score wins.

    Args:
        text: Preprocessed review whose tokens are separated by whitespace.

    Returns:
        int: 1 if the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to 0).
    """
    # Start from the *log* priors: the per-word terms added below are log
    # probabilities, so adding the raw priors would mix two different scales.
    log_prob_0 = np.log(prior_A)
    log_prob_1 = np.log(prior_B)

    for word in text.split():
        # Words never seen in training are skipped for both classes.
        if word in vocab:
            log_prob_0 += np.log(compute_prob(word, 0))
            log_prob_1 += np.log(compute_prob(word, 1))

    return 1 if log_prob_1 > log_prob_0 else 0

In [39]:
# Classify every held-out review.
predictions = list(map(predict, X_test))

# Evaluate
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)

print("Classification Report: \n", report)
print("Confusion Matrix: \n", matrix)
print("Accuracy: \n", accuracy)

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.88      0.86      6157
           1       0.88      0.85      0.86      6343

    accuracy                           0.86     12500
   macro avg       0.86      0.86      0.86     12500
weighted avg       0.86      0.86      0.86     12500

Confusion Matrix: 
 [[5399  758]
 [ 981 5362]]
Accuracy: 
 0.86088


In [40]:
# Add predictions to DataFrame for analysis
# Start from the true reviews/labels (which share the test index) and attach
# the predicted labels as a third column.
test_results = pd.DataFrame({
    'review': X_test,
    'actual_sentiment': y_test,
}).assign(predicted_sentiment=predictions)

# Display results
test_results.head(20)

Unnamed: 0,review,actual_sentiment,predicted_sentiment
33553,really liked summerslam due look arena curtain...,1,1
9427,many television show appeal quite many differe...,1,1
199,film quickly get major chase scene ever increa...,0,0
12447,jane austen would definitely approve onegwynet...,1,1
39489,expectation somewhat high went see movie thoug...,0,0
42724,ive watched movie fairly regular basis life ne...,1,1
10822,story hope highlighted tragic reality youth fa...,1,1
49498,okay didnt get purgatory thing first time watc...,1,0
4144,disappointed series lot cool graphic thats lev...,0,0
36958,first 30 minute tinseltown finger teetering re...,0,0
