<a href="https://colab.research.google.com/github/bhavani772/SE-lab/blob/main/aia_case_study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the data from the CSV file
# NOTE(review): the column names printed below look like a data row
# ('2', 'Great CD', '<review text>'), so the file presumably has no header
# line and its first sample is being consumed as the header. Confirm, and if
# so consider header=None with explicit names (this would rename the columns
# used below).
df = pd.read_csv('/content/test.csv')

# Print the column names to debug
print("Columns in the dataset:", df.columns.tolist())

# Check for missing values in the reviews
print("Missing values in the dataset:\n", df.isnull().sum())

# Use column "2" for labels (1 for bad, 2 for good)
# Use the column "Great CD" for the first review
# Use the third column for additional reviews
#
# Fill missing values *before* concatenating: str + NaN is NaN, so filling
# afterwards would blank out the whole review and throw away whichever half
# was actually present. Assigning the result (instead of calling
# fillna(inplace=True) on a column selection) also avoids the pandas
# chained-assignment warning, which becomes a silent no-op in pandas 3.0.
# astype(str) guards against a column whose values were not all parsed as
# strings.
title_text = df['Great CD'].fillna('').astype(str)
body_text = df[df.columns[2]].fillna('').astype(str)
df['review_text'] = title_text + ' ' + body_text

# Convert labels from 1 and 2 to 'Negative' and 'Positive'
df['sentiment'] = df['2'].replace({1: 'Negative', 2: 'Positive'})

# Split the data into training and testing sets (80% / 20%, fixed seed so the
# split is reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(
    df['review_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Create TF-IDF vectorizer; the vocabulary is learned from the training split
# only and then reused unchanged for the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vectorized)

# Evaluate the model's performance (precision/recall/F1 are averaged over the
# classes, weighted by how many test samples each class has)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Function for predicting the sentiment of user input
def classify_review() -> None:
    """Interactively classify reviews typed by the user.

    Repeatedly prompts for a review, vectorizes it with the module-level
    ``vectorizer`` and prints the label predicted by the module-level
    ``model``. The loop ends when the user types ``exit`` (any letter case,
    surrounding whitespace ignored) or when the input stream is closed.
    Blank input is skipped and the prompt is shown again.
    """
    while True:
        try:
            new_review = input("Enter a review (or type 'exit' to quit): ")
        except EOFError:
            # Input stream is closed (e.g. piped stdin ran out): stop cleanly
            # instead of crashing with a traceback.
            break

        # Ignore stray whitespace so that " exit " still quits
        new_review = new_review.strip()

        if new_review.casefold() == 'exit':
            break

        if not new_review:
            # Blank input contains no tokens to score; ask again
            continue

        # Transform the new review
        new_review_vectorized = vectorizer.transform([new_review])

        # Predict sentiment
        prediction = model.predict(new_review_vectorized)[0]

        # Map the prediction back to 1 or 2
        sentiment_label = 2 if prediction == 'Positive' else 1

        print(f"Predicted sentiment: {sentiment_label} ({'Good' if sentiment_label == 2 else 'Bad'})")

# Call the function to classify user reviews.
# Guarded so the blocking input() loop only starts when this file is run
# directly (notebook cell or script), not when it is imported.
if __name__ == "__main__":
    classify_review()


Columns in the dataset: ['2', 'Great CD', 'My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"']
Missing values in the dataset:
 2                                                                                                                                                                                                                                                                                                                                                                                                            

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['review_text'].fillna('', inplace=True)


Accuracy: 0.8973
Precision: 0.8973000420601682
Recall: 0.8973
F1-score: 0.8973000159827563
Enter a review (or type 'exit' to quit): EXIT
