In [None]:
import pandas as pd
from transformers import pipeline
import nltk
# sent_tokenize needs the Punkt sentence-boundary data. Recent NLTK releases look
# it up under 'punkt_tab' while older ones use 'punkt'; fetch both so the notebook
# runs on either. nltk.download() reports a failed download without raising.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

# Raw reviews; the scoring step below reads the 'Title' and 'Review' columns.
file_path = './imdb_reviews_unique.csv'
reviews_df = pd.read_csv(file_path)

# Name the checkpoint explicitly instead of relying on the library default for the
# "sentiment-analysis" task: the default may change between transformers releases,
# and analyze_sentiment() depends on this model's 'POSITIVE' / 'NEGATIVE' labels.
# truncation=True is forwarded to the tokenizer so over-long inputs are cut to the
# model's maximum length instead of raising.
sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
)

def _text_or_empty(value):
    """Return ``value`` as a string, mapping missing values (None / float NaN) to ''."""
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself; pandas uses it for empty CSV cells.
    if isinstance(value, float) and value != value:
        return ""
    return str(value)


def analyze_sentiment(title, review, max_length=512):
    """Average positive-sentiment score over the sentences of a review.

    The title and review body are joined with ". ", split into sentences with
    ``sent_tokenize``, and every sentence is cut into chunks of at most
    ``max_length`` characters. Each non-blank chunk is scored by the module-level
    ``sentiment_model``. A chunk labelled 'POSITIVE' contributes its score, any
    other label contributes ``1 - score``, so each value is treated as the
    strength of the positive class.

    Args:
        title: Review title. Missing values (None / NaN) and blank strings are skipped.
        review: Review body. Missing values (None / NaN) and blank strings are skipped.
        max_length: Maximum number of characters per chunk sent to the model.

    Returns:
        The mean of the per-chunk scores, or 0 when there is nothing to score.
        NOTE(review): 0 is also the most negative value a chunk can produce, so
        rows without text are indistinguishable from strongly negative ones.
        Kept for backward compatibility; confirm whether NaN would suit the
        downstream averages better.
    """
    # Skip missing/blank pieces so a lone "." is never scored as a sentence.
    # When both pieces are real text this is exactly title + ". " + review.
    pieces = [text for text in (_text_or_empty(title), _text_or_empty(review)) if text.strip()]
    if not pieces:
        return 0

    full_text = ". ".join(pieces)
    scores = []

    for sentence in sent_tokenize(full_text):
        for start in range(0, len(sentence), max_length):
            part = sentence[start:start + max_length]
            if not part.strip():
                continue
            result = sentiment_model(part)[0]
            if result['label'] == 'POSITIVE':
                scores.append(result['score'])
            else:
                scores.append(1 - result['score'])

    return sum(scores) / len(scores) if scores else 0

# Score every review from its title and body. A plain zip over the two columns
# avoids building a row object per record (as apply(axis=1) does) and also works
# on an empty frame; the resulting list is assigned in row order.
reviews_df['Sentiment_Score'] = [
    analyze_sentiment(title, review)
    for title, review in zip(reviews_df['Title'], reviews_df['Review'])
]

# Persist the scored reviews.
# NOTE(review): the path is under /content/drive, presumably a Colab Google Drive
# mount; the write fails if that directory does not exist. Confirm.
output_path = "/content/drive/My Drive/5243data/reviews_df.csv"
reviews_df.to_csv(output_path, index=False)

# Preview the result. This must be the cell's last expression: a bare
# expression in the middle of a cell is evaluated and silently discarded.
reviews_df.head()


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# --- Half-yearly sentiment trend ------------------------------------------------
# Parse the review dates so year/month can be read from each timestamp.
reviews_df['Date'] = pd.to_datetime(reviews_df['Date'])


def _to_half_year(ts):
    """Encode a timestamp as year (months 1-6) or year + 0.5 (months 7-12)."""
    return ts.year + (0.5 if ts.month > 6 else 0)


reviews_df['Year-Half_Numeric'] = reviews_df['Date'].apply(_to_half_year)

# Mean sentiment per half-year bucket.
score_by_time = (
    reviews_df
    .groupby('Year-Half_Numeric')['Sentiment_Score']
    .mean()
    .reset_index()
)

# 3-bucket moving average; the first two buckets have no full window and stay NaN.
score_by_time['Moving_Avg'] = score_by_time['Sentiment_Score'].rolling(window=3).mean()

# Straight-line trend fitted on the per-bucket means (one column of x values).
X = score_by_time['Year-Half_Numeric'].values.reshape(-1, 1)
y = score_by_time['Sentiment_Score'].values

model = LinearRegression()
model.fit(X, y)
score_by_time['Trend'] = model.predict(X)

# Plot the raw means, the moving average and the fitted trend on one axis.
fig, ax = plt.subplots(figsize=(12, 6))
half_years = score_by_time['Year-Half_Numeric']
ax.plot(half_years, score_by_time['Sentiment_Score'], marker='o', label='Average Sentiment Score')
ax.plot(half_years, score_by_time['Moving_Avg'], marker='o', linestyle='--', color='red', label='Moving Average')
ax.plot(half_years, score_by_time['Trend'], linestyle='-', color='green', label='Trend Line')

# One tick per half-year bucket, rotated so the labels do not overlap.
plt.xticks(half_years, rotation=45)
ax.set_xlabel('Year-Half')
ax.set_ylabel('Average Sentiment Score')
ax.set_title('Sentiment Score Trend Analysis')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): 'tille' looks like a typo for 'title'; left unchanged because the
# name has to match the file on disk.
reviews_df_path = './tille+review.csv'
imdb_reviews_path = './imdb_reviews_2(1).csv'

reviews_df = pd.read_csv(reviews_df_path)
imdb_reviews = pd.read_csv(imdb_reviews_path)

# Attach each author's rating to the scored reviews.
# NOTE(review): the join key is 'Author' only. If an author appears more than once
# on either side, the merge pairs every review with every rating by that author and
# multiplies rows. Confirm that authors are unique or add a second key.
merged_df = pd.merge(reviews_df, imdb_reviews[['Author', 'Rating']], on='Author', how='left')

# Keep only rows that carry a rating. Filter once and take an explicit copy;
# calling dropna(inplace=True) on a filtered slice, as before, is the
# chained-assignment pattern that triggers SettingWithCopyWarning.
has_rating = merged_df['Rating'].notna() & (merged_df['Rating'] != 'No Rating')
merged_df = merged_df.loc[has_rating].copy()

# Ratings look like "<score>/<max>"; keep the part before the slash. astype(str)
# keeps the .str accessor usable when pandas parsed the column as numeric (or when
# no rows are left), where it would otherwise raise AttributeError.
merged_df['Rating'] = pd.to_numeric(
    merged_df['Rating'].astype(str).str.split('/').str[0],
    errors='coerce',  # anything unparsable becomes NaN and is ignored by the statistics below
)

# Linearly rescale the sentiment score: 0 -> 1 and 1 -> 10.
merged_df['Transformed_Sentiment_Score'] = (merged_df['Sentiment_Score'] * 9) + 1

# Pearson correlation matrix (DataFrame.corr default) and Spearman rank correlation.
transformed_correlation = merged_df[['Transformed_Sentiment_Score', 'Rating']].corr()

spearman_corr_transformed, spearman_p_value_transformed = stats.spearmanr(
    merged_df['Transformed_Sentiment_Score'],
    merged_df['Rating'],
    nan_policy='omit',
)

print("Correlation Matrix:\n", transformed_correlation)
print("\nSpearman Correlation Coefficient: ", spearman_corr_transformed)
print("Spearman P-value: ", spearman_p_value_transformed)

print("\nDescriptive Statistics:\n", merged_df[['Transformed_Sentiment_Score', 'Rating']].describe())

# Side-by-side boxplots of the two variables.
fig, (ax_sentiment, ax_rating) = plt.subplots(1, 2, figsize=(12, 6))

sns.boxplot(data=merged_df, y='Transformed_Sentiment_Score', ax=ax_sentiment)
ax_sentiment.set_title('Boxplot of Transformed Sentiment Score')

sns.boxplot(data=merged_df, y='Rating', ax=ax_rating)
ax_rating.set_title('Boxplot of Rating')

fig.tight_layout()
plt.show()