### Load Data:

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import LancasterStemmer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.model_selection import train_test_split


In [2]:
 # Load raw data
amazon_data = pd.read_csv('/MLOps/Sentiment/data/test_amazon.csv')
amazon_test = pd.read_csv('/MLOps/Sentiment/data/test_amazon.csv')
movie_data = pd.read_csv("/MLOps/Sentiment/data/train.csv")

In [3]:
amazon_data.loc[len(amazon_data)] = amazon_data.columns
amazon_data.columns = ["polarity","title",'review']
amazon_data["polarity"] = amazon_data["polarity"].astype(int)
amazon_data["polarity"] = amazon_data["polarity"].replace({1:"Negative",2:"Positive"})
amazon_data["datatype"] = "Train"
amazon_data["reviewtype"] = "Product"
amazon_data = amazon_data.drop("title", axis=1)
# amazon_data = pd.concat([amazon_data[amazon_data["polarity"]=="Positive"].head(100000),amazon_data[amazon_data["polarity"]=="Negative"].head(100000)]).reset_index(drop=True)
amazon_data

Unnamed: 0,polarity,review,datatype,reviewtype
0,Positive,Despite the fact that I have only played a sma...,Train,Product
1,Negative,I bought this charger in Jul 2003 and it worke...,Train,Product
2,Positive,Check out Maha Energy's website. Their Powerex...,Train,Product
3,Positive,Reviewed quite a bit of the combo players and ...,Train,Product
4,Negative,I also began having the incorrect disc problem...,Train,Product
...,...,...,...,...
399995,Negative,My son recieved this as a birthday gift 2 mont...,Train,Product
399996,Negative,"I bought this toy for my son who loves the ""Th...",Train,Product
399997,Positive,This is a compilation of a wide range of Mitfo...,Train,Product
399998,Negative,This DVD will be a disappointment if you get i...,Train,Product


In [4]:
amazon_test.loc[len(amazon_test)] = amazon_test.columns
amazon_test.columns = ["polarity","title",'review']
amazon_test["polarity"] = amazon_test["polarity"].astype(int)
amazon_test["polarity"] = amazon_test["polarity"].replace({1:"Negative",2:"Positive"})
amazon_test["datatype"] = "Test"
amazon_test["reviewtype"] = "Product"
amazon_test = amazon_test.drop("title", axis=1)
# amazon_test = pd.concat([amazon_test[amazon_test["polarity"]=="Positive"].head(20000),amazon_test[amazon_test["polarity"]=="Negative"].head(20000)]).reset_index(drop=True)
amazon_test

Unnamed: 0,polarity,review,datatype,reviewtype
0,Positive,Despite the fact that I have only played a sma...,Test,Product
1,Negative,I bought this charger in Jul 2003 and it worke...,Test,Product
2,Positive,Check out Maha Energy's website. Their Powerex...,Test,Product
3,Positive,Reviewed quite a bit of the combo players and ...,Test,Product
4,Negative,I also began having the incorrect disc problem...,Test,Product
...,...,...,...,...
399995,Negative,My son recieved this as a birthday gift 2 mont...,Test,Product
399996,Negative,"I bought this toy for my son who loves the ""Th...",Test,Product
399997,Positive,This is a compilation of a wide range of Mitfo...,Test,Product
399998,Negative,This DVD will be a disappointment if you get i...,Test,Product


In [5]:
movie_data = movie_data.drop("Unnamed: 0", axis=1)
movie_data.columns = ["review","polarity"]
movie_data["polarity"] = movie_data["polarity"].astype(str)
movie_data["reviewtype"] = "Movies"

# split them equally so the case balance is preserved in split
tta, ta = train_test_split(movie_data, test_size=0.15, stratify=movie_data['polarity'], random_state=42)
tta['datatype'] = 'Train'
ta['datatype'] = 'Test'

movie_data_stratified = pd.concat([tta, ta]).reset_index(drop=True)
movie_data_stratified = movie_data_stratified[["polarity","review","datatype","reviewtype"]]
movie_data_stratified

Unnamed: 0,polarity,review,datatype,reviewtype
0,neg,No idea how this is rated as high as it is (5....,Train,Movies
1,neg,I just blew four dollars renting this movie! W...,Train,Movies
2,neg,"As others have mentioned, this movie is simila...",Train,Movies
3,pos,Begotten is black and white distorted images. ...,Train,Movies
4,pos,First of all - I'm not one to go all sappy ove...,Train,Movies
...,...,...,...,...
39995,pos,Paul Verhoeven's De Vierde Man (The Fourth Man...,Test,Movies
39996,neg,"This movie could have been an impressing epic,...",Test,Movies
39997,pos,10/10 for this film.<br /><br />i'm a british ...,Test,Movies
39998,neg,"First off, the title character is not even the...",Test,Movies


In [6]:
data = pd.concat([amazon_data,amazon_test,movie_data_stratified]).reset_index(drop=True)
data

Unnamed: 0,polarity,review,datatype,reviewtype
0,Positive,Despite the fact that I have only played a sma...,Train,Product
1,Negative,I bought this charger in Jul 2003 and it worke...,Train,Product
2,Positive,Check out Maha Energy's website. Their Powerex...,Train,Product
3,Positive,Reviewed quite a bit of the combo players and ...,Train,Product
4,Negative,I also began having the incorrect disc problem...,Train,Product
...,...,...,...,...
839995,pos,Paul Verhoeven's De Vierde Man (The Fourth Man...,Test,Movies
839996,neg,"This movie could have been an impressing epic,...",Test,Movies
839997,pos,10/10 for this film.<br /><br />i'm a british ...,Test,Movies
839998,neg,"First off, the title character is not even the...",Test,Movies


In [7]:
data.groupby(["reviewtype","polarity","datatype"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,review
reviewtype,polarity,datatype,Unnamed: 3_level_1
Movies,neg,Test,3000
Movies,neg,Train,17000
Movies,pos,Test,3000
Movies,pos,Train,17000
Product,Negative,Test,200000
Product,Negative,Train,200000
Product,Positive,Test,200000
Product,Positive,Train,200000


### Split Data

In [8]:
train_text = data[data['datatype']=="Train"]["review"].tolist()
train_labels = data[data['datatype']=="Train"]['polarity'].tolist()

test_text = data[data['datatype']=="Test"]["review"].tolist()
test_labels = data[data['datatype']=="Test"]['polarity'].tolist()

### Text cleaning and preprocessing:

In [9]:
# Initialize the lemmatizer and stopword list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text_nltk(text_list):
    '''Function to preprocess a list of texts by cleaning, lemmatizing, and removing unnecessary elements using NLTK.'''
    processed_texts = []

    for text in text_list:
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [
            lemmatizer.lemmatize(token) for token in tokens
            if token not in stop_words               # Remove stopwords
            and token not in string.punctuation      # Remove punctuation
            and token.isalpha()                      # Keep only alphabetic words (no digits or symbols)
        ]
        
        processed_texts.append(" ".join(tokens))

    return processed_texts

# Preprocess the training and test data
train_text_processed = preprocess_text_nltk(train_text)
test_text_processed = preprocess_text_nltk(test_text)

KeyboardInterrupt: 

### Vectorizer

In [60]:
# TF-IDF Vectorization
max_feature_num = 500
vectorizer = TfidfVectorizer(max_features=max_feature_num)

In [61]:
# Fit and transform training data, and transform test data
train_vec = vectorizer.fit_transform(train_text_processed)
test_vec = vectorizer.transform(test_text_processed)

In [62]:
# Check the shape of train_vec to confirm it's 2D
print("Shape of train_vec:", train_vec.shape)
print("Shape of test_vec:", test_vec.shape)

Shape of train_vec: (3634000, 500)
Shape of test_vec: (406000, 500)


### Model Training

In [63]:
# Model Training 
clf = MultinomialNB().fit(train_vec, train_labels)

In [64]:
# Predict on training data
train_pred = clf.predict(train_vec)

# Predict on test data
test_pred = clf.predict(test_vec)

In [65]:
# Accuracy
train_accuracy = accuracy_score(train_labels, train_pred)
test_accuracy = accuracy_score(test_labels, test_pred)

In [66]:
# Classification Report (Precision, Recall, F1-Score)
train_classification_report = classification_report(train_labels, train_pred)
test_classification_report = classification_report(test_labels, test_pred)

In [67]:
# Confusion Matrix
train_confusion_matrix = confusion_matrix(train_labels, train_pred)
test_confusion_matrix = confusion_matrix(test_labels, test_pred)

In [68]:
# Print the Results
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("\nTraining Classification Report:")
print(train_classification_report)

print("\nTest Classification Report:")
print(test_classification_report)

Training Accuracy: 0.7756518987341772
Test Accuracy: 0.7735443349753695

Training Classification Report:
              precision    recall  f1-score   support

    Negative       0.78      0.77      0.78   1800000
    Positive       0.77      0.78      0.78   1800000
         neg       0.82      0.41      0.54     17000
         pos       0.80      0.39      0.53     17000

    accuracy                           0.78   3634000
   macro avg       0.79      0.59      0.66   3634000
weighted avg       0.78      0.78      0.78   3634000


Test Classification Report:
              precision    recall  f1-score   support

    Negative       0.78      0.77      0.77    200000
    Positive       0.77      0.78      0.78    200000
         neg       0.83      0.41      0.55      3000
         pos       0.81      0.40      0.53      3000

    accuracy                           0.77    406000
   macro avg       0.80      0.59      0.66    406000
weighted avg       0.77      0.77      0.77    4060

### Creating a Re-Usable Model called Pickle

In [69]:
# save model and other necessary modules
all_info_want_to_save = {
    'model': clf,
    'vectorizer': vectorizer
}
save_path = open("models/sample_trained_model.pickle","wb")
pickle.dump(all_info_want_to_save, save_path)


### Demonstrate

In [5]:
# Function to load the model and vectorizer, and make predictions
def predict_polarity(model_path, user_query):
    saved_model_dic = pickle.load(open(model_path, "rb"))
    saved_clf = saved_model_dic['model']
    saved_vectorizer = saved_model_dic['vectorizer']
    
    # Preprocess the user query
    preprocessed_query = preprocess_text_nltk([user_query])
    
    # Transform the query text using the saved vectorizer
    query_vec = saved_vectorizer.transform(preprocessed_query)
    
    # Predict the polarity
    prediction = saved_clf.predict(query_vec)
    
    return prediction[0]

In [9]:
# Main function to interact with the user
def main():
    # User input query
    user_query = input("Enter your query: ")
    
    # Load the model and make prediction
    polarity = predict_polarity("/MLOps/Sentiment/models/sample_trained_model.pickle", user_query)
    
    # #Vader
    # analyzer = SentimentIntensityAnalyzer()
    # scores = analyzer.polarity_scores(user_query)

    # Output the polarity
    print(f"Polarity of your query: {polarity}")
    # print(scores)

if __name__ == "__main__":
    main()

NameError: name 'preprocess_text_nltk' is not defined