# Predict the review sentiment of the game

I wrote this script locally for now, but I will run it on Google Colab.

I will split the data again for this specific task, so that every split (train, validation and test) has a balanced distribution of the sentiment labels.

I'm using a pre-trained CNN model VGG-16.  
- 16 layers in total, of which 13 convolutional layers and 3 fully connected layers (dense layers)
- the convolutional layers are composed of 3x3 filters
- softmax layer in the end for classification

The data contains 6 sentiment classes (and also missing values).

Inspiration:
- https://keras.io/api/applications/#usage-examples-for-image-classification-models

In [5]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import os
import tensorflow

from tensorflow.keras.preprocessing.image import ImageDataGenerator #for data augmentation
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout #the layers we need for the CNN
from tensorflow.keras.models import Sequential #Sequential will contain all the different CNN layers
from tensorflow.keras.applications import VGG16 #VGG-16

#for the metadata
import json
import csv

## Metadata file for sentiment prediction 

This section does not need to be run again: the metadata files have already been created.

Using Grégoire's code, I created new metadata files because I need the sentiment labels.
I did it for the test, train and validation sets separately 

In [17]:
# Input/output locations used to build the sentiment metadata of ONE split.
# Change folder_path and csv_file_path to process the train or test split.

# Path to your folder containing images
folder_path = "../datasets/validation"

# Path to your JSON file
json_file_path = "../datasets/dataset.json"

# Path to save the CSV file
csv_file_path = "../datasets/metadata2_validation.csv"

# Load the JSON data. JSON is UTF-8 by specification, so the encoding is given
# explicitly: without it open() falls back to the platform locale (e.g. cp1252
# on Windows) and non-ASCII game titles would be mis-decoded or raise an error.
# NOTE(review): despite its name, labels_dict is iterated below as a sequence of
# game records with "screenshots", "sentiment" and "title" keys - confirm.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    labels_dict = json.load(json_file)

In [18]:
# Collect one [image name, sentiment, game title] row for every
# (image file, game record) pair where the image is listed in the
# record's screenshots.
data = []

for image_name in os.listdir(folder_path):
    # Skip anything that is not a .jpg/.png file
    if not image_name.endswith(('.jpg', '.png')):
        continue

    # Scan every game record; an image contributes a row for each record
    # whose "screenshots" entry contains its filename
    data.extend(
        [image_name, game["sentiment"], game["title"]]
        for game in labels_dict
        if image_name in game["screenshots"]
    )

In [19]:
# Write the collected rows to a CSV file.
# newline='' is what the csv module requires to avoid blank lines on Windows.
# The encoding is set to UTF-8 explicitly: the file is read back with
# pd.read_csv, which assumes UTF-8 by default, while open() would otherwise use
# the platform locale and could fail on (or garble) non-ASCII game titles.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write header
    csv_writer.writerow(['Image Name', 'Sentiment', 'Game Name'])
    # Write data
    csv_writer.writerows(data)

print("CSV file created successfully.")

CSV file created successfully.


## Explore

I check out how many sentiment classes there are and if this is the same for all datasets.

There are 6 sentiment classes:
- Overwhelmingly Positive
- Very Positive
- Mostly Positive
- Positive
- Mixed
- Mostly Negative
- (missing)

In [21]:
#Read the metadata of the three splits
df_train = pd.read_csv("../datasets/metadata2_train.csv")
df_test = pd.read_csv("../datasets/metadata2_test.csv")
df_validation = pd.read_csv("../datasets/metadata2_validation.csv")

#Distinct values of the 'Sentiment' column per split
#(unique() also returns NaN, so missing labels are counted as one extra value)
sentiment_classes_train = df_train['Sentiment'].unique()
sentiment_classes_test = df_test['Sentiment'].unique()
sentiment_classes_validation = df_validation['Sentiment'].unique()

#Report the distinct values in the order train, test, validation
for split_classes in (sentiment_classes_train,
                      sentiment_classes_test,
                      sentiment_classes_validation):
    print("Number of classes for 'Sentiment':", len(split_classes))
    print("Classes:", split_classes)


Number of classes for 'Sentiment': 7
Classes: ['Mostly Positive' nan 'Very Positive' 'Mixed' 'Positive'
 'Overwhelmingly Positive' 'Mostly Negative']
Number of classes for 'Sentiment': 7
Classes: ['Mostly Positive' nan 'Very Positive' 'Mixed' 'Positive'
 'Overwhelmingly Positive' 'Mostly Negative']
Number of classes for 'Sentiment': 7
Classes: ['Mostly Positive' nan 'Very Positive' 'Mixed' 'Positive'
 'Overwhelmingly Positive' 'Mostly Negative']


## Data augmentation

In [None]:
# Define paths to dataset folders
# Each *_dir folder is passed to flow_from_directory when the image generators are built.
train_dir = "../datasets/train"
test_dir = "../datasets/test"
val_dir = "../datasets/validation"
# NOTE(review): this points to metadata.csv, not to the metadata2_*.csv files created above,
# and it is not used in the cells shown here - confirm whether it is still needed.
metadata_file = "../datasets/metadata.csv"

I added some data augmentation, but apparently it is better to start simple and add complexity later.

In [None]:
# Image generators for the three splits.
#
# Fixes compared to the first version:
# - `subset` is removed: it only selects a part of ONE directory when
#   `validation_split` is set on the ImageDataGenerator. The splits already
#   live in separate folders, `subset='test'` is not an accepted value
#   (ValueError) and `subset='validation'` without `validation_split`
#   selects 0 images.
# - Random augmentation is applied to the training images only; validation
#   and test images are just rescaled so that evaluation is deterministic.
#
# NOTE(review): flow_from_directory infers the labels from one sub-folder per
# class. The metadata cells above list image files directly inside the split
# folder; if the folders are flat, the images must first be sorted into class
# sub-folders (or flow_from_dataframe used with the metadata2_*.csv files).
# NOTE(review): the ImageNet weights of VGG-16 were trained with
# tensorflow.keras.applications.vgg16.preprocess_input rather than a 1/255
# rescale - consider switching for the transfer-learning model.

#Training generator: rescaling + light augmentation
datagen = ImageDataGenerator(rescale=1/255, #each pixel value in the images will be divided by 255, normalizing them to the range [0, 1]
                             rotation_range=10, #random 10 degree rotation to the left or right
                             width_shift_range=0.1, #10% width shift
                             height_shift_range=0.1, #10% height shift
                             shear_range=0.1, #another type of transformation
                             zoom_range=0.1, #10% zoom
                             horizontal_flip=True)

#Evaluation generator: rescaling only, no random transformations
eval_datagen = ImageDataGenerator(rescale=1/255)

train_generator = datagen.flow_from_directory(train_dir,
                                              target_size=(224,224), #input size for VGG-16
                                              batch_size=4,
                                              class_mode="categorical")

test_generator = eval_datagen.flow_from_directory(test_dir,
                                                  target_size=(224,224),
                                                  batch_size=4,
                                                  class_mode="categorical",
                                                  shuffle=False) #keep file order so predictions line up with test_generator.classes

validation_generator = eval_datagen.flow_from_directory(val_dir,
                                                        target_size=(224,224),
                                                        batch_size=4,
                                                        class_mode="categorical",
                                                        shuffle=False)
                                                   


## Model building

In [None]:
#base model
#A small CNN trained from scratch, used as a baseline for the VGG-16 model.
#Fixes: every layer is now added on its own line (two statements on one line
#without a separator is a SyntaxError) and the optimizer is referenced through
#`tensorflow`, the name that is actually imported (`tf` was never defined).
model = Sequential()

#Convolutional block 1: two 3x3 convolutions with 64 filters
model.add(Conv2D(64, (3, 3), activation='relu',
                 input_shape=(224, 224, 3)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.2))

#Convolutional block 2: two 3x3 convolutions with 128 filters
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.2))

#Classification head
model.add(Flatten()) #convert the 2D feature maps into a 1D vector
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax')) #6 sentiment classes

#Adam optimizer with a learning rate 10x smaller than the Keras default (0.001)
opt = tensorflow.keras.optimizers.Adam(learning_rate=0.0001)

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
#summary of the NN architecture 
#prints every layer of `model` with its output shape and number of parameters
model.summary()

In [None]:
#VGG-16 model
#include_top=False leaves out the 3 fully connected layers at the top of the network
#(the original ImageNet classifier), so only the convolutional base is loaded and a new
#classification head for the sentiment classes can be stacked on top of it.
#weights='imagenet' loads the weights that were pre-trained on ImageNet.
  
model_vgg16 = VGG16(input_shape=(224,224,3), include_top=False, weights='imagenet')


In [None]:
#Freeze the pre-trained network: trainable=False is set on every layer of VGG-16,
#so their weights are not updated during training and keep the values
#that were learned on the ImageNet dataset.

for vgg_layer in model_vgg16.layers:
    vgg_layer.trainable = False


In [None]:
#VGG-16 architecture 
#prints the layers of the loaded VGG-16 model with their output shapes and parameter counts
model_vgg16.summary()

In [None]:
#Transfer-learning model: the pre-trained VGG-16 model followed by a new
#classification head, passed to Sequential as one list of layers
model = Sequential([
    model_vgg16,                     #using the pre-trained VGG-16 model
    Flatten(),                       #feature maps -> 1D vector
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(6, activation='softmax'),  #6 sentiment classes
])

model.summary()

## Training the model

In [None]:
#TODO: the compile settings still need to be checked
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

#TODO: decide on the number of epochs
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=5,
)
#number of epochs?

In [None]:
#Save the trained model to a file in the current working directory (the .h5 extension selects the HDF5 format)
model.save('sentiment_VGG-16.h5')