In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, get_linear_schedule_with_warmup
from datasets import Dataset
from statsmodels.tsa.arima.model import ARIMA
import re
import logging
 
# Fetch the NLTK stop-word corpus; stopwords.words('english') reads it in a later cell.
nltk.download('stopwords')


In [None]:
# Load the raw reviews and discard every row that has no review text.
df = pd.read_csv("amazon_reviews.csv").dropna(subset=['review'])

def assign_sentiment(rating):
    """Map a star rating to a sentiment code.

    :param rating: Numeric rating of a review.
    :return: 1 (positive) for ratings of 4 or more, 0 (neutral) for exactly 3,
             and -1 (negative) for every other value.
    """
    if rating >= 4:
        return 1  # Positive
    # Exactly 3 is neutral; everything else falls through to negative.
    return 0 if rating == 3 else -1

# Derive the three-way sentiment code (-1 / 0 / 1) from each star rating.
df['sentiment'] = df['rating'].map(assign_sentiment)

# English stop words, held in a set so membership checks during cleaning are cheap.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def preprocess_text(text):
    """Normalise a review for bag-of-words style models.

    Strips every character that is not an ASCII letter or whitespace,
    lower-cases the result, and removes tokens found in ``stop_words``.

    :param text: Raw review string.
    :return: Remaining tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    kept_tokens = filter(
        lambda token: token.isalpha() and token not in stop_words,
        letters_only.split(),
    )
    return ' '.join(kept_tokens)

# Clean every review once and keep the result next to the raw text.
df['cleaned_review'] = df['review'].map(preprocess_text)

# Quick look at the first rows after labelling and cleaning.
print(df.head())


In [None]:
# One checkpoint name shared by the tokenizer and the classifier so they cannot drift apart.
MODEL_NAME = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Sequence-classification head with three output classes.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


In [None]:
# Shift the {-1, 0, 1} sentiment codes to contiguous class ids {0, 1, 2}
# (0 = negative, 1 = neutral, 2 = positive) for the 3-label classification head.
# The previous mapping sent both -1 and 0 to class 0, so negative and neutral
# reviews were indistinguishable and class 2 was never used.
df['bert_sentiment'] = df['sentiment'] + 1

# preserve_index=False avoids carrying the (non-contiguous, post-dropna) DataFrame
# index into the dataset as an extra `__index_level_0__` column.
hf_dataset = Dataset.from_pandas(df[['cleaned_review', 'bert_sentiment']], preserve_index=False)

def tokenize_function(examples):
    """Tokenize a batch of cleaned reviews with the notebook-level ``tokenizer``.

    :param examples: Mapping holding the texts under the key 'cleaned_review'.
    :return: Whatever the tokenizer returns for those texts, padded to
             ``max_length`` and truncated to 128 tokens.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples['cleaned_review'], **encoding_options)

# Tokenize in batches and drop the raw text column once it has been encoded.
hf_dataset = hf_dataset.map(tokenize_function, batched=True, remove_columns=['cleaned_review'])
# Rename the target column to "labels" for training.
hf_dataset = hf_dataset.rename_column("bert_sentiment", "labels")
# Fixed seed so the 80/20 split (and therefore every reported metric) is
# reproducible across kernel restarts; 42 matches the seed used elsewhere here.
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

print(hf_dataset)


In [None]:
# Fine-tuning configuration, collected in one place so it is easy to tweak.
training_config = {
    "output_dir": "./results",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "num_train_epochs": 3,
    # Evaluate and checkpoint once per epoch.
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    # Best checkpoint is chosen by evaluation loss and loaded at the end.
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
}
training_args = TrainingArguments(**training_config)

def compute_metrics(pred):
    """Print a classification report and return the accuracy for one evaluation pass.

    :param pred: Object exposing ``label_ids`` (true labels) and ``predictions``
                 (per-class scores; the arg-max over the last axis is the predicted class).
    :return: Dict with the single key 'accuracy'.
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)

    print("Classification Report:\n", classification_report(true_labels, predicted_labels))

    return dict(accuracy=accuracy_score(true_labels, predicted_labels))

# Wire the model, training configuration, tokenized train/test splits and the
# metric callback into a Trainer.
# NOTE(review): newer transformers releases reportedly deprecate the `tokenizer=`
# argument — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)
# Run fine-tuning; evaluation and checkpointing follow the strategies set in training_args.
trainer.train()


In [None]:
batch_size = 32
results = []

# Device selection, cache clearing and the model transfer do not depend on the
# batch, so they happen once here instead of on every loop iteration.
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    torch.cuda.empty_cache()
model.to(device)
# Switch to inference mode so dropout is disabled and predictions are deterministic.
model.eval()

# Materialise the texts once; slicing a plain list is unambiguous positional slicing.
review_texts = df['cleaned_review'].tolist()

for i in range(0, len(review_texts), batch_size):
    batch = review_texts[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring class per review, moved back to the CPU as plain ints.
    predictions = torch.argmax(outputs.logits, dim=1).cpu().tolist()
    results.extend(predictions)

# Replace the training labels in this column with the model's predicted class ids.
df['bert_sentiment'] = results

print(df[['cleaned_review', 'bert_sentiment']].head())



In [None]:
# Configure root logging at INFO level and create a logger named after this module.
# NOTE(review): `logger` is not referenced by any cell visible here — confirm it is still needed.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Number of reviews per predicted sentiment class.
sentiment_counts = df['bert_sentiment'].value_counts()

# Draw the bars, then label the returned Axes directly instead of going through pyplot state.
ax = sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Parse the date column so reviews can be aggregated on a calendar axis.
df['date'] = pd.to_datetime(df['date'])

# Average predicted sentiment per calendar day. Resampling (instead of grouping on
# the raw timestamps) folds intraday timestamps into one point per day and yields a
# DatetimeIndex with an explicit daily frequency, so ARIMA can produce dated
# forecasts. Days without any review are NaN and are treated as missing observations.
daily_sentiment = df.resample('D', on='date')['bert_sentiment'].mean()

# Plot only the days that actually have reviews so the line stays connected.
observed_days = daily_sentiment.dropna()

plt.figure(figsize=(12, 6))
plt.plot(observed_days.index, observed_days.values)
plt.title("Average Sentiment Over Time")
plt.xlabel("Date")
plt.ylabel("Average Sentiment")
plt.grid()
plt.show()

# ARIMA(1, 1, 1) on the regular daily series.
arima_model = ARIMA(daily_sentiment, order=(1, 1, 1))
arima_result = arima_model.fit()
print(arima_result.summary())

# Ten-day-ahead forecast, indexed by date thanks to the daily frequency.
forecast = arima_result.forecast(steps=10)
print("Forecasted Sentiments:", forecast)


In [None]:
# 1. Statistical Analysis of Sentiments
sentiment_array = df['bert_sentiment'].values
print("Sentiment Statistics:")
# Each entry pairs the printed label with the NumPy reducer that produces the value.
for stat_label, stat_function in (
    ("Mean Sentiment:", np.mean),
    ("Median Sentiment:", np.median),
    ("Standard Deviation:", np.std),
):
    print(stat_label, stat_function(sentiment_array))

# 2. Binning Sentiments
# Three equal-width bins over the observed range; the result is a (counts, bin_edges) tuple.
sentiment_bins = np.histogram(df['bert_sentiment'], bins=3)
print("Sentiment Distribution Bins:", sentiment_bins)

# 3. Advanced Feature Engineering
def extract_review_length(reviews):
    """Count the whitespace-separated tokens of each review.

    :param reviews: Iterable of review strings.
    :return: NumPy array holding one token count per review, in input order.
    """
    word_counts = []
    for review in reviews:
        word_counts.append(len(review.split()))
    return np.array(word_counts)

# Token count of every cleaned review, stored as a new feature column.
df['review_length'] = extract_review_length(df['cleaned_review'])

# Correlation between review length and sentiment
# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the length/sentiment entry.
length_sentiment_correlation = np.corrcoef(df['review_length'], df['bert_sentiment'])[0, 1]
print("Correlation between Review Length and Sentiment:", length_sentiment_correlation)

# 4. Random Sampling for Model Validation
np.random.seed(42)
# Sampling without replacement cannot draw more rows than exist, so cap the
# sample size at the frame length instead of raising on small datasets.
sample_size = min(100, len(df))
random_sample_indices = np.random.choice(len(df), size=sample_size, replace=False)
sample_reviews = df.iloc[random_sample_indices]

# 5. One-hot Encoding of Sentiments
# Row i of the 3x3 identity matrix is the one-hot vector for class id i; index with
# a plain integer array so the lookup does not depend on pandas index semantics.
sentiment_one_hot = np.eye(3)[df['bert_sentiment'].to_numpy()]

In [None]:
# Features are the cleaned review texts; targets are the rating-derived sentiment codes.
X, y = df['cleaned_review'], df['sentiment']

# Hold out 20% of the reviews for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the vocabulary and IDF weights from the training texts only, then apply
# the same fitted transformation to the held-out texts.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Classical baselines trained on the TF-IDF features, keyed by display name.
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

for name, clf in classifiers.items():
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)

    print(f"\n{name} Classifier Results:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Use the classes the estimator was trained on both for the matrix layout and
    # for the tick labels: the matrix always has one row/column per class (even if
    # a class is absent from the test split) and the axes show the real sentiment
    # codes instead of positional 0/1/2 indices.
    class_labels = clf.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title(f'{name} Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

def predict_sentiment_tfidf(reviews, vectorizer, classifier):
    """Print the predicted sentiment name for each review.

    :param reviews: Iterable of review strings.
    :param vectorizer: Object whose ``transform`` turns the reviews into features.
    :param classifier: Object whose ``predict`` returns one label (1, 0 or -1) per review.
    :return: None; results are written to standard output.
    """
    sentiment_map = {1: "Positive", 0: "Neutral", -1: "Negative"}

    predicted_labels = classifier.predict(vectorizer.transform(reviews))

    for review_text, label in zip(reviews, predicted_labels):
        print(f"Review: {review_text}")
        print(f"Predicted Sentiment: {sentiment_map[label]}\n")

# Example usage: score three hand-written reviews with one of the fitted TF-IDF classifiers.
new_reviews = [
    "The camera quality is amazing and the battery life is impressive!",
    "The product was okay, but the delivery was late.",
    "I am very disappointed with the purchase. It broke within a week.",
]

# The classifier is picked by name here, not by comparing the scores printed above.
best_model = classifiers['Decision Tree']
predict_sentiment_tfidf(reviews=new_reviews, vectorizer=tfidf_vectorizer, classifier=best_model)

In [None]:
def generate_wordcloud(text_series, title, sentiment_type=None):
    """
    Generate a WordCloud visualization for a given text series.

    When there are no words to draw (for example a sentiment class with no
    reviews), a message is printed and no figure is produced instead of raising.

    :param text_series: Pandas Series of text data
    :param title: Title for the WordCloud plot
    :param sentiment_type: Optional filter for specific sentiment type; compared
        against the notebook-level ``df['sentiment']`` column
    :return: None; the figure is shown as a side effect
    """
    if sentiment_type is not None:
        # Align the labels to the rows of text_series by index so the filter also
        # works when text_series is only a subset of df; rows without a label are dropped.
        sentiment_labels = df['sentiment'].reindex(text_series.index)
        text_series = text_series[sentiment_labels == sentiment_type]

    text = ' '.join(text_series)

    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=stop_words,
            max_words=50000,
            colormap='viridis'
        ).generate(text)
    except ValueError as error:
        # WordCloud refuses to render when no words remain after filtering.
        print(f"Skipping word cloud '{title}': {error}")
        return

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.show()

# (title, sentiment filter) for each word cloud: overall first, then
# positive (1), neutral (0) and negative (-1) reviews.
wordcloud_specs = [
    ('Overall Most Frequent Words', None),
    ('Most Frequent Words in Positive Reviews', 1),
    ('Most Frequent Words in Neutral Reviews', 0),
    ('Most Frequent Words in Negative Reviews', -1),
]

for cloud_title, sentiment_code in wordcloud_specs:
    generate_wordcloud(df['cleaned_review'], cloud_title, sentiment_type=sentiment_code)

In [None]:
# Persist the enriched frame (without the index) and report where it went.
output_path = "output_with_sentiments.csv"
df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")


# EXAMPLE USAGE

In [None]:
# Example usage: the same three-review demo as in the TF-IDF modelling cell,
# kept as a standalone cell so the classifier choice is easy to change.
new_reviews = [
    "The camera quality is amazing and the battery life is impressive!",
    "The product was okay, but the delivery was late.",
    "I am very disappointed with the purchase. It broke within a week.",
]

# Other options: 'Naive Bayes', 'Random Forest'
best_model = classifiers['Decision Tree']
predict_sentiment_tfidf(reviews=new_reviews, vectorizer=tfidf_vectorizer, classifier=best_model)

In [None]:
# Parse the date column (a no-op if an earlier cell already converted it).
df['date'] = pd.to_datetime(df['date'])

# Mean predicted sentiment per calendar month, as a two-column frame (date, bert_sentiment).
monthly_trends = df.resample('M', on='date')['bert_sentiment'].mean().reset_index()

print("Monthly Sentiment Trends:")
print(monthly_trends)

if len(monthly_trends) < 10:
    print("Warning: Not enough data points for reliable ARIMA forecasting")
else:
    try:
        # Use a dedicated name for the ARIMA model: binding it to `model` would
        # overwrite the fine-tuned BERT classifier and break any re-run of the
        # inference cell.
        monthly_arima_model = ARIMA(monthly_trends['bert_sentiment'], order=(1, 1, 1))
        model_fit = monthly_arima_model.fit()

        print("\nARIMA Model Summary:")
        print(model_fit.summary())

        # Forecast next 12 months
        forecast_steps = 12
        forecast = model_fit.forecast(steps=forecast_steps)

        # Build the future month-end dates: start at the last observed month and
        # drop that first element so only the following months remain.
        last_date = monthly_trends['date'].iloc[-1]
        forecast_dates = pd.date_range(start=last_date, periods=forecast_steps+1, freq='M')[1:]
        forecast_df = pd.DataFrame({
            'date': forecast_dates,
            'forecasted_sentiment': forecast
        })

        plt.figure(figsize=(12, 6))

        plt.plot(monthly_trends['date'], monthly_trends['bert_sentiment'],
                 label='Historical Sentiment', color='blue')

        plt.plot(forecast_df['date'], forecast_df['forecasted_sentiment'],
                 label='Forecasted Sentiment', color='red', linestyle='--')

        # The confidence band is best-effort: a failure here is reported but must
        # not prevent the main forecast plot from being shown.
        try:
            forecast_ci = model_fit.get_forecast(steps=forecast_steps)
            conf_int = forecast_ci.conf_int()

            conf_int_array = conf_int.to_numpy() if hasattr(conf_int, 'to_numpy') else np.array(conf_int)

            plt.fill_between(
                forecast_dates,
                conf_int_array[:, 0],
                conf_int_array[:, 1],
                color='pink',
                alpha=0.3,
                label='95% Confidence Interval'
            )
        except Exception as ci_error:
            print("Could not plot confidence interval:", ci_error)

        plt.title('Monthly Sentiment Trends with ARIMA Forecast')
        plt.xlabel('Date')
        plt.ylabel('Average Sentiment Score')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        print("\nForecast for Next 12 Months:")
        print(forecast_df)

    except Exception as model_error:
        print("Error in ARIMA modeling:", model_error)
        print("Suggestion: Try different ARIMA order or check data preprocessing")

# Plain historical trend, shown regardless of whether the forecast succeeded.
# `monthly_trends` is always defined at the top of this cell, so no existence check is needed.
plt.figure(figsize=(12, 6))
plt.plot(monthly_trends['date'], monthly_trends['bert_sentiment'])
plt.title('Monthly Sentiment Trends')
plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.show()