In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the preprocessed reviews; later cells read its 'lemmatized_text' and 'Rating' columns.
data = pd.read_csv('../artifacts/preprocessed_reviews.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data["lemmatized_text"].tail(10)

## Building vocabulary

In [None]:
from collections import Counter
vocab = Counter()

In [None]:
# Empty reviews come back from the CSV as NaN. A bare astype(str) would turn
# them into the literal word "nan", which would then be counted as a token,
# so replace them with an empty string first.
data['lemmatized_text'] = data['lemmatized_text'].fillna('').astype(str)

# Rebuild the counter from scratch so re-running this cell does not
# double-count every word.
vocab = Counter()
for sentence in data['lemmatized_text']:
    vocab.update(sentence.split())


In [None]:
# Keep only the words counted more than 10 times in `vocab`.
tokens = [word for word, count in vocab.items() if count > 10]

In [None]:
len(tokens)

In [None]:
def save_vocabulary(lines, filename):
    """Write ``lines`` to ``filename`` as UTF-8 text, one entry per line.

    Entries are joined with a newline character and no trailing newline is
    added, so an empty ``lines`` produces an empty file. An existing file is
    overwritten.

    Args:
        lines: Iterable of strings to store.
        filename: Path of the file to create or overwrite.
    """
    text = '\n'.join(lines)
    # The context manager closes the handle even if the write raises.
    with open(filename, 'w', encoding="utf-8") as handle:
        handle.write(text)

# Persist the filtered token list next to the other artifacts.
save_vocabulary(lines=tokens, filename='../artifacts/vocabulary.txt')

In [None]:
data.head()

In [None]:
# Features are the lemmatized review texts; the target is the 'Rating' column.
x, y = data['lemmatized_text'], data['Rating']

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
x_train.shape

In [None]:
x_test.shape

### Vectorization

In [None]:
import numpy as np
from joblib import Parallel, delayed

def vectorize_sentence(sentence, vocabulary):
    """Return a binary bag-of-words vector for ``sentence``.

    The result is a ``float32`` array with one entry per vocabulary token, in
    vocabulary order: 1.0 when the token occurs among the whitespace-separated
    words of ``sentence``, 0.0 otherwise. A non-string ``sentence`` yields an
    all-zero vector.
    """
    if not isinstance(sentence, str):
        return np.zeros(len(vocabulary), dtype=np.float32)

    # A set makes each membership test constant time.
    present = set(sentence.split())
    return np.array([term in present for term in vocabulary], dtype=np.float32)

def vectorizer_parallel_chunked(ds, vocabulary, n_jobs=-1, chunk_size=1000):
    """Encode every sentence in ``ds`` with ``vectorize_sentence``, chunk by chunk.

    Args:
        ds: Sized, sliceable collection of sentences (list, array, pandas
            Series, ...).
        vocabulary: Sized collection of tokens passed to ``vectorize_sentence``.
        n_jobs: Worker count forwarded to ``joblib.Parallel``.
        chunk_size: Number of sentences dispatched per ``Parallel`` call;
            must be at least 1.

    Returns:
        ``np.float32`` array of shape ``(len(ds), len(vocabulary))``. Empty
        input gives a ``(0, len(vocabulary))`` array.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    total = len(ds)
    # Allocate the result once and fill it in place, instead of keeping every
    # row in a Python list and copying them all again at the end. This also
    # keeps the output two-dimensional when there is nothing to encode.
    matrix = np.zeros((total, len(vocabulary)), dtype=np.float32)
    if total == 0:
        return matrix

    # pandas objects are cut explicitly by position; their labels need not be
    # 0..n-1 (e.g. after a shuffled train/test split).
    rows = ds.iloc if hasattr(ds, "iloc") else ds

    for start in range(0, total, chunk_size):
        stop = min(start + chunk_size, total)
        vectors_chunk = Parallel(n_jobs=n_jobs)(
            delayed(vectorize_sentence)(sentence, vocabulary)
            for sentence in rows[start:stop]
        )
        matrix[start:stop] = vectors_chunk
        print(f"Processed {stop} / {total}")

    return matrix


In [None]:
# Encode the training texts against `tokens`: 4 workers, 500 sentences per batch.
vectorized_x_train = vectorizer_parallel_chunked(
    ds=x_train,
    vocabulary=tokens,
    n_jobs=4,
    chunk_size=500,
)


In [None]:
vectorized_x_train

In [None]:
# Encode the held-out texts with the same vocabulary and batching as the training set.
vectorized_x_test = vectorizer_parallel_chunked(
    ds=x_test,
    vocabulary=tokens,
    n_jobs=4,
    chunk_size=500,
)


In [None]:
y_train.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample only the training split. The seed makes the synthetic rows, and
# therefore every model fitted on them, reproducible between runs.
smote = SMOTE(random_state=42)
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def _report_scores(label, y_act, y_pred, average):
    """Print accuracy, precision, recall and F1 (rounded to 3 decimals) under ``label``."""
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred, average=average), 3)
    rec = round(recall_score(y_act, y_pred, average=average), 3)
    f1 = round(f1_score(y_act, y_pred, average=average), 3)

    print(f'{label} Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')


# Function to calculate training scores
def training_scores(y_act, y_pred, average='binary'):
    """Print the metrics for predictions made on the training data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to precision, recall and F1.
            The default ``'binary'`` is scikit-learn's own default and only
            accepts two-class labels; pass e.g. ``'macro'`` or ``'weighted'``
            for multiclass targets.
    """
    _report_scores('Training', y_act, y_pred, average)


# Function to calculate validation scores
def validation_scores(y_act, y_pred, average='binary'):
    """Print the metrics for predictions made on the held-out data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to precision, recall and F1.
            The default ``'binary'`` is scikit-learn's own default and only
            accepts two-class labels; pass e.g. ``'macro'`` or ``'weighted'``
            for multiclass targets.
    """
    _report_scores('Validation', y_act, y_pred, average)


In [None]:
# Fit logistic regression on the SMOTE-balanced training matrix.
lr = LogisticRegression().fit(vectorized_x_train_smote, y_train_smote)

# Predict on the resampled training rows and on the held-out test rows.
y_train_pred, y_test_pred = (
    lr.predict(vectorized_x_train_smote),
    lr.predict(vectorized_x_test),
)

# Report both sets of metrics so over-fitting is visible at a glance.
training_scores(y_act=y_train_smote, y_pred=y_train_pred)
validation_scores(y_act=y_test, y_pred=y_test_pred)


## Naive Bayes

In [None]:
# Multinomial naive Bayes, trained on the same resampled data as the other models.
mnb = MultinomialNB().fit(vectorized_x_train_smote, y_train_smote)

# Predictions for the training rows, then for the held-out rows.
y_train_pred = mnb.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

y_test_pred = mnb.predict(vectorized_x_test)
validation_scores(y_test, y_test_pred)


## Decision Tree

In [None]:
# Seed the tree so repeated runs grow the same tree and report the same scores.
dt = DecisionTreeClassifier(random_state=42)

# Train the model
dt.fit(vectorized_x_train_smote, y_train_smote)

# Make predictions
y_train_pred = dt.predict(vectorized_x_train_smote)
y_test_pred = dt.predict(vectorized_x_test)

# Evaluate training and validation performance
training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

## Random Forest

In [None]:
# Seed the forest: bootstrapping and feature sampling are random, so without a
# fixed seed the scores below change on every run.
rf = RandomForestClassifier(random_state=42)

# Train the model
rf.fit(vectorized_x_train_smote, y_train_smote)

# Make predictions
y_train_pred = rf.predict(vectorized_x_train_smote)
y_test_pred = rf.predict(vectorized_x_test)

# Evaluate training and validation performance
training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

## Support Vector Machine

In [None]:
# Support vector classifier with scikit-learn's default settings.
svm = SVC().fit(vectorized_x_train_smote, y_train_smote)

# Score the resampled training rows first, then the held-out rows.
y_train_pred = svm.predict(vectorized_x_train_smote)
y_test_pred = svm.predict(vectorized_x_test)

training_scores(y_act=y_train_smote, y_pred=y_train_pred)
validation_scores(y_act=y_test, y_pred=y_test_pred)

In [None]:
import pickle

# Serialize the fitted logistic-regression model `lr` for use outside the notebook.
with open('../static/model/model.pickle', mode='wb') as file:
    file.write(pickle.dumps(lr))