# Import libraries

In [20]:
import pandas as pd
pd.options.mode.chained_assignment = None  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import datetime as dt 
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import emoji
# Seed NumPy's global random number generator.
seed = 0
np.random.seed(seed) 

# Load dataset

In [3]:
# Read the review export into a DataFrame (the path is relative to the working directory).
df = pd.read_csv('app_review.csv')

In [4]:
# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,9b1e9713-f88e-4547-be41-b87d840089cc,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"Great game, and I enjoy playing it, but now it...",2,1,2.6.0,2024-10-29 13:45:27,,,2.6.0
1,057d6353-31e5-4cf1-9a6d-a9947631bece,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,For me the game itself is very well-made in te...,5,0,2.6.0,2024-10-29 13:38:53,,,2.6.0
2,a78877c7-5ea4-4689-a72f-e8ead2ac8c3a,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,This game is as marvelous as Genshin Impact! T...,5,5,2.6.0,2024-10-29 13:31:21,,,2.6.0
3,59a0b3c8-1b2a-4877-84fb-c76b79b5a924,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Not fold friendly. Cant aim at enemies at the ...,3,0,2.6.0,2024-10-29 12:48:45,,,2.6.0
4,d135b884-683a-4daa-9b2e-d8ef29c69df4,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Other than storage this game is amazing I love...,5,0,,2024-10-29 11:12:16,,,


In [4]:
# Column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37018 entries, 0 to 37017
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              37018 non-null  object
 1   userName              37018 non-null  object
 2   userImage             37018 non-null  object
 3   content               37015 non-null  object
 4   score                 37018 non-null  int64 
 5   thumbsUpCount         37018 non-null  int64 
 6   reviewCreatedVersion  26993 non-null  object
 7   at                    37018 non-null  object
 8   replyContent          1191 non-null   object
 9   repliedAt             1191 non-null   object
 10  appVersion            26993 non-null  object
dtypes: int64(2), object(9)
memory usage: 3.1+ MB


In [5]:
# (rows, columns) of the raw data.
df.shape

(37018, 11)

In [5]:
# Discard reviews without text, then collapse reviews whose text is repeated
# (drop_duplicates keeps the first occurrence of each duplicated text).
clean_df = (
    df
    .dropna(subset=['content'])
    .drop_duplicates(subset=['content'])
)

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33428 entries, 0 to 37017
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              33428 non-null  object
 1   userName              33428 non-null  object
 2   userImage             33428 non-null  object
 3   content               33428 non-null  object
 4   score                 33428 non-null  int64 
 5   thumbsUpCount         33428 non-null  int64 
 6   reviewCreatedVersion  24382 non-null  object
 7   at                    33428 non-null  object
 8   replyContent          1178 non-null   object
 9   repliedAt             1178 non-null   object
 10  appVersion            24382 non-null  object
dtypes: int64(2), object(9)
memory usage: 3.1+ MB


In [6]:
# (rows, columns) after removing missing and duplicated review texts.
clean_df.shape

(33428, 11)

In [10]:
import torch
from transformers import pipeline

# Pin the checkpoint the pipeline used to pick implicitly, so the labels stay
# reproducible even if the library's default sentiment model changes.
SENTIMENT_MODEL = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'
SENTIMENT_REVISION = '714eb0f'
SENTIMENT_BATCH_SIZE = 32

# Use the GPU when one is available, otherwise fall back to the CPU instead of failing.
sentiment_device = "cuda" if torch.cuda.is_available() else "cpu"

sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model=SENTIMENT_MODEL,
    revision=SENTIMENT_REVISION,
    device=sentiment_device,
)

# Score all reviews in one batched call rather than one pipeline call per row.
# truncation=True clips reviews that exceed the model's maximum input length
# instead of raising an error on them.
review_texts = clean_df['content'].tolist()
predictions = sentiment_pipeline(
    review_texts,
    batch_size=SENTIMENT_BATCH_SIZE,
    truncation=True,
)

# The pipeline returns one {'label': ..., 'score': ...} dict per input, in input order.
clean_df['sentiment'] = [prediction['label'] for prediction in predictions]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [11]:
# Spot-check the predicted label next to the review text.
clean_df[['content','sentiment']].head()

Unnamed: 0,content,sentiment
0,"Great game, and I enjoy playing it, but now it...",NEGATIVE
1,For me the game itself is very well-made in te...,POSITIVE
2,This game is as marvelous as Genshin Impact! T...,POSITIVE
3,Not fold friendly. Cant aim at enemies at the ...,NEGATIVE
4,Other than storage this game is amazing I love...,POSITIVE


In [12]:
# Write clean_df to CSV; index=False leaves the row index out of the file.
clean_df.to_csv('clean_app_review.csv', index=False)

# Text Preprocessing

In [None]:
# Uncomment and run this cell once if the next cell fails because NLTK data is
# missing. clean_text uses the 'stopwords' corpus; word_tokenize additionally
# needs NLTK's tokenizer data ('punkt', or 'punkt_tab' on recent NLTK releases
# -- TODO confirm which one the installed version asks for).
# import nltk
# nltk.download('stopwords')

In [None]:
# Lookup structures for clean_text, built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on the first call."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


# Function to clean text
def clean_text(text):
    """Normalise one review for text analysis.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, delete emojis, strip surrounding whitespace,
    tokenize with ``word_tokenize``, drop English stop words, and join the
    remaining tokens with single spaces.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text; an empty string when no token survives.

    NOTE(review): apostrophes are removed before the stop-word filter runs, so
    contractions such as "don't" reach the filter as "dont" and are kept (see
    "dont" / "cant" in the sample output) -- confirm this is intended.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation (deleted outright, so "well-made" becomes "wellmade")
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Remove surrounding white space
    text = text.strip()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords, using the cached set instead of rebuilding it per review
    stop_words = _english_stop_words()
    kept_tokens = [word for word in tokens if word not in stop_words]

    # Join tokens back to string
    return ' '.join(kept_tokens)

# Clean every review and store the result in a new 'cleaned_content' column
raw_reviews = clean_df['content']
clean_df['cleaned_content'] = raw_reviews.map(clean_text)

# Show raw and cleaned text side by side for the first rows
preview_columns = ['content', 'cleaned_content']
clean_df[preview_columns].head()

Unnamed: 0,content,cleaned_content
0,"Great game, and I enjoy playing it, but now it...",great game enjoy playing starting crash often
1,For me the game itself is very well-made in te...,game wellmade terms graphics audio gameplay re...
2,This game is as marvelous as Genshin Impact! T...,game marvelous genshin impact characters uniqu...
3,Not fold friendly. Cant aim at enemies at the ...,fold friendly cant aim enemies edge theres roo...
4,Other than storage this game is amazing I love...,storage game amazing loved lot


In [41]:
# Display the cleaned review text (pandas abbreviates the long Series output).
clean_df['cleaned_content']

0            great game enjoy playing starting crash often
1        game wellmade terms graphics audio gameplay re...
2        game marvelous genshin impact characters uniqu...
3        fold friendly cant aim enemies edge theres roo...
4                           storage game amazing loved lot
                               ...                        
37012                                               aboard
37014                                       beautiful game
37015        dont mind ill downloading giving first rating
37016                                   soro soro jikan da
37017                                                  omg
Name: cleaned_content, Length: 33222, dtype: object

In [39]:
# List the reviews whose text was removed entirely by the cleaning step
is_empty_after_cleaning = clean_df['cleaned_content'].eq('')
clean_df.loc[is_empty_after_cleaning]


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sentiment,sentiment_score,sentiment_subjectivity,sentiment_label,cleaned_content


In [42]:
# Keep only the reviews that still contain text after cleaning
has_cleaned_text = clean_df['cleaned_content'].ne('')
clean_df = clean_df.loc[has_cleaned_text]

## Text Labelling

In [43]:
# labelling with textblob
from textblob import TextBlob

def sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity

def sentiment_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of ``text``."""
    return TextBlob(text).sentiment.subjectivity

# One polarity value and one subjectivity value per cleaned review.
cleaned_reviews = clean_df['cleaned_content']
clean_df['sentiment_score'] = cleaned_reviews.map(sentiment_score)
clean_df['sentiment_subjectivity'] = cleaned_reviews.map(sentiment_subjectivity)

def sentiment_label(score):
    """Turn a polarity score into a label.

    Returns 'Positive' for scores above zero, 'Negative' for scores below
    zero, and 'Neutral' for everything else.
    """
    if score > 0:
        return 'Positive'
    return 'Negative' if score < 0 else 'Neutral'
    
# Label every review from its polarity score.
polarity_scores = clean_df['sentiment_score']
clean_df['sentiment_label'] = polarity_scores.map(sentiment_label)



In [44]:
# Compare each cleaned review with its TextBlob polarity and the label derived from it.
clean_df[['cleaned_content','sentiment_score','sentiment_label']]

Unnamed: 0,cleaned_content,sentiment_score,sentiment_label
0,great game enjoy playing starting crash often,0.20000,Positive
1,game wellmade terms graphics audio gameplay re...,0.16000,Positive
2,game marvelous genshin impact characters uniqu...,0.35119,Positive
3,fold friendly cant aim enemies edge theres roo...,0.19375,Positive
4,storage game amazing loved lot,0.30000,Positive
...,...,...,...
37012,aboard,0.00000,Neutral
37014,beautiful game,0.22500,Positive
37015,dont mind ill downloading giving first rating,-0.12500,Negative
37016,soro soro jikan da,0.00000,Neutral


In [46]:
# Count how many reviews received each sentiment label (Positive / Negative / Neutral).
clean_df['sentiment_label'].value_counts()

sentiment_label
Positive    18315
Negative     8604
Neutral      6303
Name: count, dtype: int64