In [None]:
# prompt: Mount drive

from google.colab import drive
# Attach Google Drive at /content/drive so later cells can read and write files under it.
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np

# Read the review dataset from Drive, discard rows whose 'review_text' is missing,
# then preview the first rows.
df = pd.read_csv(
    os.path.join('/content/drive/MyDrive/', 'Project', 'dataset.csv')
).dropna(subset=['review_text'])
df.head()


Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


In [None]:
# Rows with a missing 'review_text' are excluded; every other column may still hold NaN.
clean_df = df.dropna(axis='index', how='any', subset=['review_text'])

In [None]:
# Keep rows whose 'review_score' and 'review_votes' are both greater than zero.
filtered_df = clean_df[clean_df['review_score'].gt(0) & clean_df['review_votes'].gt(0)]


In [None]:
# Draw up to 1000 rows at random. The fixed seed makes the same rows come back on
# every run, and capping n at the frame length avoids the ValueError that sample()
# raises when asked for more rows than exist.
sampled_df = filtered_df.sample(n=min(1000, len(filtered_df)), random_state=42)


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re

# Fetch each NLTK resource this notebook asks for, in the listed order.
for _nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4', 'stopwords'):
    nltk.download(_nltk_resource)


# Matches any single character that is neither an ASCII letter nor whitespace.
# Built once here so every cleaning call reuses the same compiled pattern.
compiled_re = re.compile(r'[^a-zA-Z\s]')

# Shared WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

def preprocess_reviews(df, column_name):
    """Add a cleaned, lower-cased version of a text column to ``df``.

    For every string in ``df[column_name]`` all characters other than ASCII
    letters and whitespace are removed, the text is lower-cased, split with
    ``word_tokenize`` and re-joined with single spaces. No stop-word removal
    or lemmatization is performed. Non-string values are carried over unchanged.

    Args:
        df: DataFrame holding the text column. It is modified in place.
        column_name: Name of the column to clean.

    Returns:
        The same ``df`` object, now carrying a ``'processed_' + column_name``
        column.
    """
    def clean_text(text):
        # Strip everything but letters/whitespace before tokenizing.
        text = compiled_re.sub('', text).lower()
        # Tokenize and re-join so tokens end up separated by exactly one space.
        return " ".join(word_tokenize(text))

    df['processed_' + column_name] = df[column_name].apply(
        lambda value: clean_text(value) if isinstance(value, str) else value
    )

    return df

# Clean the 'review_text' column of the sampled reviews.
processed_df = preprocess_reviews(df=sampled_df, column_name='review_text')


In [None]:
from transformers import pipeline
import pandas as pd

# Load the EmoRoBERTa classification pipeline. return_all_scores=True asks for a
# score for every label instead of only the best one.
emotion = pipeline('sentiment-analysis', model='arpanghoshal/EmoRoBERTa', return_all_scores=True)

# Reuse the rows that were already preprocessed. Drawing a second random sample
# here would leave sampled_df holding different rows from processed_df, so the
# index-aligned assignment of the 'sentiment' column below would be NaN for every
# row that is not in both samples.
sampled_df = processed_df.copy()

def perform_sentiment_analysis(review_text, classifier=None, max_chars=512):
    """Classify a review in consecutive fixed-size character chunks.

    The text is cut into slices of at most ``max_chars`` characters (slices are
    measured in characters, not model tokens, and may split a word) and each
    slice is passed to the classifier separately.

    Args:
        review_text: Text to classify. Any value that is not a string (for
            example ``None`` or NaN) produces an empty result instead of an error.
        classifier: Callable invoked once per chunk; it must return an iterable
            whose items are appended to the result. Defaults to the
            notebook-level ``emotion`` pipeline.
        max_chars: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list with the classifier's output items for every chunk, in chunk
        order. Empty when ``review_text`` is empty or not a string. The scores
        are not aggregated here.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    if not isinstance(review_text, str):
        # Missing values would otherwise fail on len(); treat them as "no text".
        return []
    if classifier is None:
        classifier = emotion

    results = []
    for start in range(0, len(review_text), max_chars):
        # The slice is already at most max_chars long, so no further truncation is needed.
        results.extend(classifier(review_text[start:start + max_chars]))

    return results

# Classify every preprocessed review and keep the raw per-chunk output in 'sentiment'.
# The assignment aligns on the row index of processed_df.
sampled_df['sentiment'] = processed_df['processed_review_text'].map(perform_sentiment_analysis)

# Show the frame including the new column.
print(sampled_df)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


KeyboardInterrupt: 

In [None]:
# Inspect the 'sentiment' value stored for the first row of sampled_df.

print(sampled_df["sentiment"].head(1))


In [None]:
import pandas as pd

# Function to calculate average sentiment scores
def calculate_average_sentiments(sentiments_list):
    """Average the score of each label across several score lists.

    Args:
        sentiments_list: Iterable of iterables of dicts, each dict providing a
            'label' and a 'score' entry.

    Returns:
        Dict mapping every label that occurred to the mean of its scores,
        ordered by first occurrence. Empty when no scores were supplied.
    """
    # label -> [running total of scores, number of occurrences]
    running = {}

    for segment_scores in sentiments_list:
        for entry in segment_scores:
            label = entry['label']
            score = entry['score']
            slot = running.get(label)
            if slot is None:
                running[label] = [score, 1]
            else:
                slot[0] += score
                slot[1] += 1

    return {label: total / occurrences for label, (total, occurrences) in running.items()}

# Average each row's per-chunk scores and add one column per emotion label.
# Only non-empty lists are averaged. A missing value (NaN) is truthy, so a plain
# truthiness check would pass it on to calculate_average_sentiments and fail when
# iterating; such rows simply end up NaN in every label column.
sentiment_averages = pd.DataFrame(
    [
        calculate_average_sentiments(cell) if isinstance(cell, list) and cell else {}
        for cell in sampled_df['sentiment']
    ],
    index=sampled_df.index,
)
for label in sentiment_averages.columns:
    # Both frames share the same row order, so assign positionally rather than
    # re-aligning on the index.
    sampled_df[label] = sentiment_averages[label].to_numpy()


In [None]:
# Show sampled_df as the cell output to check the columns added by the previous cell.
sampled_df

In [None]:
# Remove the raw per-chunk classifier output before exporting.
# errors="ignore" keeps this cell re-runnable: a second run would otherwise raise
# KeyError because the column is already gone.
sampled_df = sampled_df.drop(columns="sentiment", errors="ignore")

In [None]:
# Write the scored sample to Drive without the row index.
sampled_df.to_csv(
    path_or_buf=os.path.join('/content/drive/MyDrive/', 'Project', 'sentiment_1000_dataset.csv'),
    index=False,
)


In [None]:
import matplotlib.pyplot as plt

# Label columns to summarise and, in later cells, to cluster on.
attributes = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
]

# Column-wise mean of the selected label columns.
averages = sampled_df[attributes].mean()

# Bar chart of the means, one bar per label.
fig, ax = plt.subplots(figsize=(10, 6))
averages.plot(kind='bar', ax=ax)
ax.set_title('Average Scores of Sentiments')
ax.set_ylabel('Average Score')
ax.set_xlabel('Sentiments')
# Slanted, right-anchored tick labels keep the label names legible.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()  # keep the rotated labels inside the figure

plt.show()


In [None]:
!pip install kneed

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from kneed import KneeLocator


# Feature matrix for clustering: the selected label columns.
sentiments = sampled_df[attributes]

# Elbow method: fit K-Means for every cluster count from 1 to max_clusters and
# record the inertia (within-cluster sum of squares) of each fit.
max_clusters = 10  # upper bound of the cluster counts to try
cluster_range = range(1, max_clusters + 1)
wcss = []
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(sentiments)
    wcss.append(kmeans.inertia_)


# Locate the knee of the decreasing, convex inertia curve.
kneedle = KneeLocator(x=cluster_range, y=wcss, curve='convex', direction='decreasing')
optimal_clusters = kneedle.knee

print(f"Optimal number of clusters: {optimal_clusters}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import seaborn as sns

# Fit the final model with the cluster count found by the knee search and
# assign a cluster id to every row of the feature matrix.
num_clusters = optimal_clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(sentiments)

# One centroid row per cluster, one column per label in 'attributes'.
centroids = kmeans.cluster_centers_
centroids_df = pd.DataFrame(centroids, columns=attributes)

# Annotated heatmap of the centroid values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(centroids_df, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
ax.set_title("Heatmap of Cluster Centroids")
ax.set_xlabel("Attributes")
ax.set_ylabel("Cluster")
plt.show()



In [None]:
# Attach each row's cluster id and write the result over the earlier export.
sampled_df = sampled_df.assign(cluster=clusters)
sampled_df.to_csv(
    path_or_buf=os.path.join('/content/drive/MyDrive/', 'Project', 'sentiment_1000_dataset.csv'),
    index=False,
)
