In [5]:
#Step 1. Prepare the data (Cleaning and tokenizing)

import pandas as pd
import csv
import re
import nltk 
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
# from google.colab import drive
# drive.mount('/content/drive')

data = pd.read_csv('IMDB_Dataset.csv', error_bad_lines = 'skip', encoding='utf-8' )
stopwordlist = stopwords.words('english')

# Data cleaning function
def clean_text(text):
    text = re.sub(r'[^\w\s]', ' ', text)  # Remove special characters except alphanumeric and whitespace
    text = text.lower()  # Convert to lowercase
    text = nltk.word_tokenize(text)  # Tokenize text
    text = [word for word in text if word not in stopwordlist]  # Remove stopwords
    return ' '.join(text)

# data['review'] = data['review'].astype(str)
data['review'] = data['review'].apply(clean_text)
data.head()


Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake thinks zombie...,negative
4,petter mattei love time money visually stunnin...,positive


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

def build_sentiment_analyzer(data, review_col='review', sentiment_col='sentiment'):
    """
    This function builds and returns a sentiment analysis model using logistic regression for binary classification.

    Args:
      data (pandas.DataFrame): The DataFrame containing your data.
      review_col (str, optional): The column name containing the review text. Defaults to 'review'.
      sentiment_col (str, optional): The column name containing the sentiment label. Defaults to 'sentiment'.

    Returns:
      tuple: A tuple containing the trained Logistic Regression model and the TfidfVectorizer object.
    """

    # Feature extraction (text to numbers)
    vectorizer = TfidfVectorizer(max_features=10000)  # Limit features to 10000 most important words
    X_features = vectorizer.fit_transform(data[review_col])  # Convert reviews to numerical features

    # Prepare labels (assuming sentiment is binary: positive or negative)
    y = data[sentiment_col].map({'positive': 1, 'negative': 0})  # Map labels to 1 (positive) and 0 (negative)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

    # Train the model (logistic regression for binary sentiment classification)
    model = LogisticRegression()
    model.fit(X_train, y_train)


    #Evaluating Model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print("Model accuracy:", accuracy.round(3))
    print("Model precision:", precision.round(3))
    print("Model recall:", recall.round(3))

    return model, vectorizer




#Creating Model
model, vectorizer = build_sentiment_analyzer(data)


Model accuracy: 0.895
Model precision: 0.883
Model recall: 0.912


In [19]:
def predict_sentiment(model, vectorizer, review):
    """
    This function predicts the sentiment of a new review using the trained model and vectorizer.

    Args:
      model (sklearn.linear_model.LogisticRegression): The trained sentiment analysis model.
      vectorizer (sklearn.feature_extraction.text.TfidfVectorizer): The TfidfVectorizer object used for feature extraction.
      review (str): The review text for which to predict sentiment.

    Returns:
      str: The predicted sentiment label (positive or negative).
    """

    # Convert the review text to features
    X_new = vectorizer.transform([review])

    # Predict sentiment (probability of being positive)
    predicted_proba = model.predict_proba(X_new)[0][1]  # Get probability of positive class

    # Define a threshold (adjust as needed)
    threshold = 0.5  # You can adjust this threshold based on your needs

    # Classify based on threshold
    predicted_sentiment = "positive" if predicted_proba > threshold else "negative"

    return predicted_sentiment


  # Predict sentiment for a new review
review = "This movie was fantastic!"
predicted_sentiment = predict_sentiment(model, vectorizer, review)

print(review + "\n"+ "Predicted sentiment:", predicted_sentiment)

review = "This movie was  horrible"
predicted_sentiment = predict_sentiment(model, vectorizer, review)
print(review + "\n"+ "Predicted sentiment:", predicted_sentiment)


This movie was fantastic!
Predicted sentiment: positive
This movie was  horrible
Predicted sentiment: negative


In [1]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, precision_score, recall_score

# def build_lstm_sentiment_analyzer(data, review_col='review', sentiment_col='sentiment', max_len=100):
#     """
#     This function builds and returns a sentiment analysis model using an LSTM network,
#     directly extracting preprocessed data and sentiment labels from the DataFrame.

#     Args:
#       data (pandas.DataFrame): The DataFrame containing your preprocessed text data and sentiment labels.
#       review_col (str, optional): The column name containing the preprocessed review text. Defaults to 'review'.
#       sentiment_col (str, optional): The column name containing the sentiment labels. Defaults to 'sentiment'.
#       max_len (int, optional): The maximum sequence length for padding. Defaults to 100.

#     Returns:
#       tuple: A tuple containing the trained LSTM model.
#     """

#     # Extract preprocessed data and sentiment labels
#     preprocessed_data = data[review_col]
#     sentiment_labels = data[sentiment_col]

#     # Train-test split
#     X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, sentiment_labels, test_size=0.2, random_state=42)

#     # Build the LSTM model (unchanged)
#     model = Sequential()
#     model.add(Embedding(max_features=10000, embedding_dim=128, input_length=max_len))  # Embedding layer
#     model.add(LSTM(64))  # LSTM layer with 64 units
#     model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))  # Train the model

#     # Evaluate the model (unchanged)
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred.round())
#     precision = precision_score(y_test, y_pred.round())
#     recall = recall_score(y_test, y_pred.round())

#     print("Model Accuracy:", accuracy.round(3))
#     print("Model Precision:", precision.round(3))
#     print("Model Recall:", recall.round(3))

#     return model

# lstm_model = build_lstm_sentiment_analyzer(data)


In [2]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# def predict_sentiment_lstm(model, max_len=100, review):
#     """
#     This function predicts the sentiment of a new review using the trained LSTM model.

#     Args:
#       model (tensorflow.keras.models.Model): The trained LSTM sentiment analysis model.
#       max_len (int): The maximum sequence length used during training.
#       review (str): The review text for which to predict sentiment.

#     Returns:
#       str: The predicted sentiment label (positive or negative).
#     """

#     # Preprocess the review (similar to your original preprocessing steps)
#     # ... (your preprocessing steps here)

#     # Convert the review text to a sequence (assuming you tokenized before)
#     review_sequence = [word for word in review.split()]  # Assuming tokenization

#     # Pad the sequence (if necessary)
#     padded_sequence = pad_sequences([review_sequence], maxlen=max_len, padding='post')

#     # Predict sentiment
#     predicted_proba = model.predict(padded_sequence)[0][1]  # Get probability of positive class

#     # Define a threshold (adjust as needed)
#     threshold = 0.5  # You can adjust this threshold based on your needs

#     # Classify based on threshold
#     predicted_sentiment = "positive" if predicted_proba > threshold else "negative"

#     return predicted_sentiment


In [None]:
from flask import Flask, render_template, request

app = Flask(__name__)

@app.route('/', methods=["POST", "GET"])
def home(result=None):
    if request.method == "POST":
        review = request.form["review"]
        if (request.form["algorithm"] == "simple_regression"):
            
            model, vectorizer = build_sentiment_analyzer(data)
            sentiment = predict_sentiment(model, vectorizer, review)

            # Render the template with the predicted sentiment
            return render_template("index.html", result=sentiment)
        else:
            return render_template("index.html", result='Algorithm not yet supported')

    else:  # GET request
        return render_template("index.html")
    


if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [24/Jun/2024 01:02:50] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jun/2024 01:02:52] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jun/2024 01:04:04] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jun/2024 01:04:37] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jun/2024 01:05:18] "[37mGET / HTTP/1.1[0m" 200 -
