In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import joblib

Load the dataset into a pandas DataFrame

In [55]:
df = pd.read_csv('Reviews.csv')
#df = df.sample(frac=0.5, random_state=42) #reduce samples (memory issues)

Get sentiment using SentimentIntensityAnalyzer for the dataset

In [56]:
#sentiment annotation function
def get_sentiment(text):
    '''
    Sentiment annotation function
    arguments : text = text to be processed
    '''
    
    #calculate sentiment based on intensity
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)

    #assign label used for each classification
    if sentiment_scores['compound'] >= 0.1:
        return 'positive'
    elif sentiment_scores['compound'] <= -0.1:
        return 'negative'
    else:
        return 'neutral'

df['sentiment'] = df['content'].apply(lambda x: get_sentiment(str(x)) if isinstance(x, float) else get_sentiment(x))

In [57]:
# Print the selected columns in table format
selected_columns = ['content', 'sentiment']
sample_df = df[selected_columns].sample(n=5)

print(sample_df)

                                                  content sentiment
56640   I really like the game but I also think that i...  positive
111054  The reason I rate this game one star is becaus...  negative
3015    Do not get this game something is in the cats ...   neutral
54987   The game is good in some what ways... But when...  negative
46804                                                Good  positive


Apply preprocessing to the dataset

In [58]:

#uses stopwords library from NLTK
stop = set(nltk.corpus.stopwords.words('english'))

#pre-processing function
def pre_processing(text):
    '''
    Pre-processing text function
    arguments : text = text to be pre-processed
    '''
    
    #lowercasing & remove any symbols
    text = str(text).lower()
    text = re.sub(r'\W',' ', text)
    
    #handle negation
    text = re.sub(r'\bnot\b', 'not_', text)
    text = re.sub(r'\bno\b', 'no_', text)
    
    #split words in the text & remove stopwords
    words = text.split()
    text = ' '.join([word for word in words if word not in stop])
    
    return text

#apply the pre-processing to the dataset
df['content'] = df['content'].apply(pre_processing)

In [68]:
sample_text = "This app is good!!."
print("Before: ", sample_text)
print("After: ", pre_processing(sample_text))

sample_text = "Nice. Not bad."
print("Before: ", sample_text)
print("After: ", pre_processing(sample_text))

Before:  This app is good!!.
After:  app good
Before:  Nice. Not bad.
After:  nice not_ bad


Split the data into features and labels and test split

In [59]:
#split dataset into training and testing
features_train, features_test, labels_train, labels_test = train_test_split(
    df['content'], 
    df['sentiment'], 
    test_size=0.1, 
    random_state=42
)

Transform text into numerical data using TFIDF Vectorizer

In [60]:
#define vectorizer
vectorizer = TfidfVectorizer()

#transform both training and testing into numerical data
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test)

Train the Decision Tree model

In [61]:
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)

tree_depth = clf.tree_.max_depth
print("Depth of the decision tree:", tree_depth)

Depth of the decision tree: 2945


Export model & TF-IDF parameter using Joblib

In [77]:
joblib.dump(clf, 'decision_tree_model.joblib')
joblib.dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']

In [72]:
clf = joblib.load('decision_tree_model.joblib')
vectorizer =joblib.load('vectorizer.joblib')

Testing the model

In [73]:
y_pred = clf.predict(features_test)

Evaluate the model

In [74]:
accuracy = accuracy_score(labels_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8463489287275907


In [75]:
report = classification_report(labels_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    negative       0.63      0.62      0.62      1727
     neutral       0.76      0.76      0.76      1598
    positive       0.91      0.91      0.91      8110

    accuracy                           0.85     11435
   macro avg       0.76      0.76      0.76     11435
weighted avg       0.85      0.85      0.85     11435



Manual input test

In [76]:
review_input = input("Enter a review: ")
review_preprocessed = pre_processing(review_input)
print("Processed Text: ", review_preprocessed)
review_preprocessed = vectorizer.transform([review_preprocessed])
print("Vectorized Text: ", review_preprocessed)
sentiment = clf.predict(review_preprocessed)
print("Sentiment:", sentiment)

Processed Text:  not_ good
Vectorized Text:    (0, 22956)	0.735979734276324
  (0, 14896)	0.6770035677413757
Sentiment: ['negative']
