In [1]:
# prompt: Mount drive
# Mount Google Drive inside the Colab runtime at /content/drive. The later
# cells read and write their files under /content/drive/MyDrive/Project.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [31]:
import torch
import os
from transformers import pipeline
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re
from collections import Counter
import torch.nn as nn

# Fetch the NLTK resources requested by this notebook, one after another and
# in the same order as they were originally listed.
for _nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4', 'stopwords'):
    nltk.download(_nltk_resource)

class Net(nn.Module):
    """Small feed-forward network: Linear -> ReLU -> Linear.

    The hidden layer is fixed at 128 units. The attribute names ``fc1``,
    ``relu`` and ``fc2`` are part of the saved-model contract (they become the
    ``state_dict`` keys), so they must not be renamed.

    NOTE(review): ``Classifier`` loads ``cluster_model.pt`` with ``torch.load``;
    presumably that file holds a pickled instance of this class, which is why
    the class has to be defined before loading -- confirm.
    """

    def __init__(self, input_size, output_size):
        """Create the two linear layers.

        Args:
            input_size: number of input features per sample.
            output_size: number of output scores per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, output_size)

    def forward(self, x):
        """Return the output-layer scores for the input ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

class Classifier():
    """Map review text to a cluster index via averaged emotion scores.

    A review is cleaned, cut into 512-character segments, scored by the
    EmoRoBERTa pipeline, averaged per emotion label, and the resulting vector
    is passed through the model loaded from ``cluster_model.pt``.
    """

    def __init__(self):
        """Load the cluster model from Drive and build the emotion pipeline."""
        # NOTE(review): torch.load unpickles the file, so only load checkpoints
        # you trust. Newer PyTorch releases may need an explicit
        # ``weights_only=False`` to load a whole pickled module -- confirm
        # against the installed version.
        self.model = torch.load(os.path.join('/content/drive','MyDrive', 'Project', 'cluster_model.pt'))
        # The model is only used for inference here, so put it in eval mode.
        self.model.eval()
        self.nlp = pipeline('sentiment-analysis', model='arpanghoshal/EmoRoBERTa', return_all_scores=True)
        # Matches every character that is neither an ASCII letter nor whitespace.
        self.compiled_re = re.compile(r'[^a-zA-Z\s]')

    def preprocess_reviews(self, text):
        """Strip non-letter characters, lowercase, and re-join the tokens.

        Args:
            text: raw review string.

        Returns:
            The tokens of the cleaned text joined by single spaces. This is an
            empty string when the review has no ASCII letters.
        """
        # The stopword set that used to be built here was never applied to the
        # tokens, so it has been dropped (it was rebuilt on every call).
        cleaned = self.compiled_re.sub('', text).lower()
        return " ".join(word_tokenize(cleaned))

    def average_sentiments(self, sentiments_list):
        """Average the score of each label across several score lists.

        Args:
            sentiments_list: iterable of lists of ``{'label': ..., 'score': ...}``
                dicts, one inner list per analysed segment.

        Returns:
            dict mapping each label to the mean of its scores; ``{}`` when
            ``sentiments_list`` is empty.
        """
        totals = {}
        counts = {}
        for sentiments in sentiments_list:
            for sentiment in sentiments:
                label = sentiment['label']
                totals[label] = totals.get(label, 0) + sentiment['score']
                counts[label] = counts.get(label, 0) + 1
        return {label: totals[label] / counts[label] for label in totals}

    def predict(self, review_text):
        """Predict the cluster index for a single review.

        Args:
            review_text: raw review string.

        Returns:
            The predicted index as a Python int, or ``None`` when the review
            contains nothing to analyse after preprocessing (this case used to
            fail inside the model with a shape error).
        """
        review_text = self.preprocess_reviews(review_text)
        if not review_text:
            # No segments means no emotion scores and therefore no input
            # vector for the model; report "no cluster" instead of crashing.
            return None

        # Fixed-width 512-character slices; a boundary can fall inside a word.
        # NOTE(review): presumably chosen to stay below the pipeline's input
        # length limit -- confirm.
        segments = [review_text[i:i + 512] for i in range(0, len(review_text), 512)]

        # Each call yields one list of label/score dicts per input string.
        results = []
        for segment in segments:
            results.extend(self.nlp(segment, return_all_scores=True))

        averaged = self.average_sentiments(results)

        # The feature order is the alphabetical order of the emotion labels.
        sentiment_vector = [averaged[label] for label in sorted(averaged)]
        sentiment_tensor = torch.tensor(sentiment_vector).float().unsqueeze(0)  # add batch dimension

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            net_output = self.model(sentiment_tensor)

        _, predicted_index = torch.max(net_output, dim=1)

        # The argmax index is shifted down by one before it is returned.
        # NOTE(review): presumably the model was trained on labels offset by
        # +1 -- confirm against the training code.
        return predicted_index.item() - 1

    def predict_multiple(self, review_texts):
        """Predict one cluster index for a collection of reviews by majority vote.

        Reviews for which ``predict`` returns ``None`` do not take part in the
        vote. Ties go to the index that was predicted first.

        Args:
            review_texts: iterable of raw review strings.

        Returns:
            The most frequent predicted index, or ``None`` when there are no
            reviews or none of them could be classified (an empty list used to
            raise ``IndexError``). ``Recommender.recommend`` accepts ``None``
            as "no cluster".
        """
        votes = Counter()
        for review_text in review_texts:
            prediction = self.predict(review_text)
            if prediction is not None:
                votes[prediction] += 1

        if not votes:
            return None
        return votes.most_common(1)[0][0]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
# Build the classifier; Classifier.__init__ loads the cluster model from Drive
# and creates the EmoRoBERTa pipeline.
classifier = Classifier()

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [33]:
# Smoke test: predicted cluster index for a single short review.
classifier.predict('this is a test')

16

In [34]:
# Smoke test: most common predicted cluster index across two short reviews.
classifier.predict_multiple(['this is a test', 'this is another test'])

16

In [35]:
import pickle


# Save the classifier object (model and pipeline included) to Drive using pickle.
# NOTE(review): pickle stores classes by reference, so loading this file
# presumably requires `Classifier` and `Net` to be defined first -- confirm in
# the code that consumes classifier.pkl.
with open(os.path.join('/content/drive','MyDrive', 'Project', 'classifier.pkl'), 'wb') as file:
    pickle.dump(classifier, file)

print("Classifier saved successfully.")




Classifier saved successfully.


In [37]:
class Recommender():
    """Recommend items from precomputed association rules.

    Two rule sources are loaded from Drive: ``cluster_rules`` (indexed by
    cluster, one rules table per cluster) and ``universal_rules`` (a single
    rules table). Each rules table is used as a DataFrame with the columns
    ``antecedents``, ``consequents``, ``confidence`` and ``lift``.
    """

    def __init__(self):
        """Load both rule sets from the project folder on Drive."""
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; only load rule files you produced yourself.
        with open(os.path.join('/content/drive/MyDrive/', 'Project', 'cluster_rules.pkl'), 'rb') as file:
            self.cluster_rules = pickle.load(file)

        with open(os.path.join('/content/drive/MyDrive/', 'Project', 'rules.pkl'), 'rb') as file:
            self.universal_rules = pickle.load(file)

    def recommend_helper(self, rules, items, n_recommendations):
        """Return the best-ranked consequents of the rules triggered by ``items``.

        Args:
            rules: rules table (see the class docstring for the columns used).
            items: collection of items already chosen; a rule applies when all
                of its antecedents are contained in it.
            n_recommendations: maximum number of items to return.

        Returns:
            list of at most ``n_recommendations`` distinct items, none of which
            is in ``items``, ordered by the rule that produced them (highest
            confidence first, then highest lift).
        """
        # astype(bool) keeps the mask boolean even when the table has no rows.
        applies = rules['antecedents'].apply(lambda antecedent: antecedent.issubset(items)).astype(bool)
        ranked = rules[applies].sort_values(by=['confidence', 'lift'], ascending=False)

        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # ranking survives. The previous list(set(...)) discarded it, which
        # made the top-n slice arbitrary and different from run to run.
        ordered = dict.fromkeys(
            item
            for consequent in ranked['consequents']
            for item in consequent
            if item not in items
        )
        return list(ordered)[:n_recommendations]

    def recommend(self, items, cluster, n_recommendations=5):
        """
        Recommend new items based on a set of input items.

        Parameters:
        - items: a set of items for which to find recommendations
        - cluster: index into the cluster-specific rules, or None to use only
          the universal rules
        - n_recommendations: the maximum number of recommendations to return

        Returns: a set of at most n_recommendations recommended items. When a
        cluster is given, cluster-specific recommendations take priority and
        universal ones fill the remaining slots. (Previously the two lists
        were simply unioned, so up to 2 * n_recommendations items came back.)
        """
        universal = self.recommend_helper(self.universal_rules, items, n_recommendations)
        if cluster is None:
            return set(universal)

        clustered = self.recommend_helper(self.cluster_rules[cluster], items, n_recommendations)
        merged = list(dict.fromkeys(clustered + universal))
        return set(merged[:n_recommendations])

# Example usage: recommendations for a basket holding only 'bioshock', using
# the rules of cluster 1 together with the universal rules.
items = {'bioshock'}
recommender = Recommender()
recommender.recommend(items, 1)


{'grand theft auto v',
 'orion prelude',
 'rising stormred orchestra 2 multiplayer',
 'rocket league',
 'torchlight ii'}

In [38]:
# Save the recommender object (both loaded rule sets included) to Drive using pickle.
with open(os.path.join('/content/drive','MyDrive', 'Project', 'recommender.pkl'), 'wb') as file:
    pickle.dump(recommender, file)

print("Recommender saved successfully.")


Recommender saved successfully.
