In [5]:
# sentiment_analysis_svc.py

import re
import string
import pandas as pd
import numpy as np
from collections import Counter

# Text processing
from textblob import TextBlob
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

def main():
    # Import dataset
    raw_reviews = pd.read_json('Data.json')
    print("The shape of the data is (row, column):", raw_reviews.shape)

    # Preprocessing and cleaning
    process_reviews = raw_reviews.copy()
    
    # Handle missing values
    process_reviews['reviewText'] = process_reviews['reviewText'].fillna('Missing')
    
    # Combine review text and summary
    process_reviews['reviews'] = process_reviews['reviewText'] + ' ' + process_reviews['summary']
    process_reviews = process_reviews.drop(['reviewText', 'summary'], axis=1)
    
    # Create sentiment column
    def f(row):
        if row['overall'] == 3.0:
            val = 'Neutral'
        elif row['overall'] == 1.0 or row['overall'] == 2.0:
            val = 'Negative'
        elif row['overall'] == 4.0 or row['overall'] == 5.0:
            val = 'Positive'
        else:
            val = -1
        return val
    
    process_reviews['sentiment'] = process_reviews.apply(f, axis=1)
    
    # Remove unnecessary columns
    process_reviews = process_reviews.drop(['reviewerName', 'unixReviewTime', 'reviewTime', 'helpful'], axis=1)
    
    # Text cleaning
    def review_cleaning(text):
        text = str(text).lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        return text
    
    process_reviews['reviews'] = process_reviews['reviews'].apply(lambda x: review_cleaning(x))
    
    # Remove stopwords
    stop_words = ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each', 
                 'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
                 'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above', 
                 'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't", 
                 'very', 'should', 'any', 'y', 'isn', 'who', 'a', 'they', 'to', 'too', "should've", 'has', 'before',
                 'into', 'yours', "it's", 'do', 'against', 'on', 'now', 'her', 've', 'd', 'by', 'am', 'from', 
                 'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
                 'his', 'himself', 'ourselves', 'was', 'through', 'out', 'below', 'own', 'myself', 'theirs', 
                 'me', 'why', 'once', 'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
                 'at', 'after', 'its', 'which', 'there', 'our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
                 'over', 'again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all']
    
    process_reviews['reviews'] = process_reviews['reviews'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    
    # Encode target variable
    label_encoder = preprocessing.LabelEncoder() 
    process_reviews['sentiment'] = label_encoder.fit_transform(process_reviews['sentiment']) 
    
    # Prepare features for modeling
    review_features = process_reviews[['reviews']].reset_index(drop=True)
    
    # Stemming
    ps = PorterStemmer()
    corpus = []
    for i in range(0, len(review_features)):
        review = re.sub('[^a-zA-Z]', ' ', review_features['reviews'][i])
        review = review.split()
        review = [ps.stem(word) for word in review if not word in stop_words]
        review = ' '.join(review)
        corpus.append(review)
    
    review_features['reviews'] = corpus
    
    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(2, 2))
    X = tfidf_vectorizer.fit_transform(review_features['reviews'])
    y = process_reviews['sentiment']
    
    # Handle class imbalance with SMOTE
    print(f'Original dataset shape: {Counter(y)}')
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    print(f'Resampled dataset shape: {Counter(y_res)}')
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.25, random_state=0)
    
    # SVC Model
    svc = SVC()
    print("SVC Test Accuracy:", cross_val_score(svc, X, y, cv=10, scoring='accuracy').mean())
    
    # Train final model
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    print('Accuracy of SVC classifier on test set: {:.2f}'.format(svc.score(X_test, y_test)))
    
    # Classification metrics
    cm = metrics.confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(cm, classes=['Negative', 'Neutral', 'Positive'])
    plt.show()
    
    print("Classification Report:\n", classification_report(y_test, y_pred))

if __name__ == "__main__":
    main()


invalid escape sequence '\['


invalid escape sequence '\S'


invalid escape sequence '\w'


invalid escape sequence '\['


invalid escape sequence '\S'


invalid escape sequence '\w'


invalid escape sequence '\['


invalid escape sequence '\S'


invalid escape sequence '\w'


invalid escape sequence '\['


invalid escape sequence '\S'


invalid escape sequence '\w'



ValueError: Trailing data