In [None]:
# Pandas : Pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labelled" data both easy and intuitive.
# Matplotlib & Seaborn : These packages provide a high-level interface for drawing plots and statistical graphics.
# BeautifulSoup : BeautifulSoup is a Python library used for parsing XML and HTML.
# standard library
import re
import warnings

# third-party: data handling, plotting and HTML parsing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup

# NLTK: tokenization, stop words and stemming
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# gensim: Word2Vec embeddings
from gensim.models import Word2Vec

# scikit-learn: classifier and evaluation metrics used in the modelling cells
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# download the NLTK resources needed for sentence tokenization and stop-word removal
nltk.download('punkt')
nltk.download('stopwords')

warnings.filterwarnings("ignore")


In [None]:
# Load the mobile review training data from a CSV file in the working directory.
# Later cells read the 'rating' and 'body' columns of this frame.
df = pd.read_csv('training-dataset.csv')
df.head()

# Step 2 - Cleansing data

In [None]:
# function to clean a raw review: strip HTML, keep letters only, lower-case,
# and optionally remove stop words and/or stem
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''
    Return the NLTK English stop words as a frozenset, fetching them from NLTK only once
    '''
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_text(raw_text, remove_stopwords=False, stemming=False, split_text=False):
    '''
    Convert a raw review to a cleaned review

    Parameters
    ----------
    raw_text : str
        Review text, possibly containing HTML markup.
    remove_stopwords : bool
        When truthy, drop NLTK English stop words.
    stemming : bool
        When truthy, stem every remaining word with the English Snowball stemmer.
    split_text : bool
        When truthy, return the list of words instead of a single string.

    Returns
    -------
    list of str when split_text is truthy, otherwise the words joined by single spaces.
    '''
    text = BeautifulSoup(raw_text, 'lxml').get_text()  # remove html
    # every character that is not an ASCII letter (digits, punctuation, ...) becomes a space
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()  # lower-case and split on whitespace

    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # the flags are tested for truthiness (not `is True`) so that values such as 1
    # or numpy booleans behave the same way remove_stopwords already does
    if stemming:
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in words]

    if split_text:
        return words

    return " ".join(words)

In [None]:
# function to split a text into sentences and clean each sentence into a word list
def parse_sent(text, tokenizer, remove_stopwords=False):
    '''
    Parse text into sentences

    Parameters
    ----------
    text : str
        Text to split; leading/trailing whitespace is stripped first.
    tokenizer : object
        Anything with a tokenize(str) method returning an iterable of sentence strings.
    remove_stopwords : bool
        Forwarded to clean_text().

    Returns
    -------
    list of list of str : one cleaned word list per non-empty sentence.
    '''
    sentences = []
    for raw_sentence in tokenizer.tokenize(text.strip()):
        # plain truthiness test; the previous `len(...) is not 0` compared identity
        # with an int literal, which only works by accident of small-int caching
        if raw_sentence:
            sentences.append(clean_text(raw_sentence, remove_stopwords, split_text=True))
    return sentences

# Step 3 - Exploratory Data Analysis

In [None]:
# Summary of the data
print("Summary statistics of numerical features : \n", df.describe())
# Total number of reviews
total_reviews = len(df)
print("\nTotal number of reviews: ", total_reviews)
# Percentage of neutral (rating 3), positive (> 3) and negative (< 3) reviews,
# counting only rows whose 'body' is not missing
for template, rating_mask in (
        ("\nPercentage of reviews with neutral sentiment : {:.2f}%", df['rating'] == 3),
        ("\nPercentage of reviews with positive sentiment : {:.2f}%", df['rating'] > 3),
        ("\nPercentage of reviews with negative sentiment : {:.2f}%", df['rating'] < 3),
):
    share = df.loc[rating_mask, "body"].count() / total_reviews * 100
    print(template.format(share))

In [None]:
# Bar chart of the number of reviews per rating value, ordered by rating
rating_counts = df['rating'].value_counts().sort_index()
plt.figure(figsize=(12, 8))
rating_counts.plot(kind='bar')
plt.title('Distribution of Rating')
plt.xlabel('Rating')
plt.ylabel('Count')

In [None]:
# Histogram of review length in characters (missing bodies skipped),
# restricted to reviews shorter than 1500 characters
REVIEW_LENGTH = df["body"].dropna().map(len)
short_reviews = REVIEW_LENGTH[REVIEW_LENGTH < 1500]
plt.figure(figsize=(12, 8))
short_reviews.hist()
plt.title("Distribution of Review Length")
plt.xlabel('Review length (Number of character)')
plt.ylabel('Count')

# Step 4 - Data Pre-processing

In [None]:
# Count missing values per column before any rows are dropped
df.isna().sum()

In [None]:
# Drop every row that has a missing value in any column.
# inplace=True mutates df itself, so earlier cells re-run after this one will see the reduced frame.
df.dropna(inplace=True)

In [None]:
# Re-check the per-column missing-value counts after dropping rows
df.isna().sum()

In [None]:
# Encode 4s and 5s as 1 (positive sentiment) and 1s, 2s and 3s as 0 (negative sentiment)
# Note: rating 3 is reported as "neutral" in the summary statistics cell but is folded into class 0 here.
df['sentiment'] = np.where(df['rating'] > 3, 1, 0)
df.head()

In [None]:
# Class balance of the binary sentiment label
sns.countplot(x='sentiment', data=df)

In [None]:
# Keep only the review text and its binary sentiment label for modelling
train_df = df[['body', 'sentiment']]

In [None]:
# Cleaned text (one space-joined string per review) for every training review
train_clean = [clean_text(body) for body in train_df['body']]
print('Show a cleaned review in the training set : \n', train_clean[30])

In [None]:
# Split review text into parsed sentences using NLTK's punkt tokenizer
# nltk.download

In [None]:
# Load NLTK's English punkt sentence tokenizer from the downloaded 'punkt' resource.
# NOTE(review): recent NLTK releases distribute punkt as 'punkt_tab' rather than a pickle;
# confirm this resource path still loads on the installed NLTK version.
TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Parse each review in the training set into sentences.
# The raw review text is used here: clean_text() replaces all punctuation with
# spaces, so the already-cleaned strings in train_clean no longer contain the
# sentence boundaries the punkt tokenizer looks for. parse_sent() cleans each
# sentence itself after splitting.
SENTENCES = []
for review in train_df['body']:
    SENTENCES.extend(parse_sent(review, TOKENIZER))

In [None]:
# Report how many sentences were parsed and show one example
# (index 10 raises IndexError if fewer than 11 sentences were parsed)
print('%d parsed sentence in the training set\n'  %len(SENTENCES))
print('Show a parsed sentence in the training set : \n', SENTENCES[10])

In [None]:
# Word2Vec hyper-parameters, passed to the Word2Vec constructor in the training cell
NUM_FEATURES = 300  # embedding dimension (vector_size)
MIN_WORD_COUNT = 5  # passed as min_count
NUM_WORKERS = 4  # passed as workers
CONTEXT = 5  # passed as window
DOWNSAMPLING = 1e-3  # passed as sample

In [None]:
print("Training Word2Vec model ...\n")
# Train Word2Vec embeddings on the parsed training sentences
W2V = Word2Vec(
    SENTENCES,
    workers=NUM_WORKERS,
    vector_size=NUM_FEATURES,
    min_count=MIN_WORD_COUNT,
    window=CONTEXT,
    sample=DOWNSAMPLING,
)
# Unit-normalise the word vectors in place. This replaces the call to
# Word2Vec.init_sims(replace=True), which gensim 4 marks as deprecated.
W2V.wv.unit_normalize_all()
# NOTE(review): the file name advertises a min word count and context of 10, but
# the model is trained with MIN_WORD_COUNT and CONTEXT from the config cell.
# The name is kept so that anything loading this file keeps working.
W2V.save("w2v_300features_10minwordcounts_10context")  # save trained word2vec model

In [None]:
# Size of the learned vocabulary and its first ten entries
vocabulary = W2V.wv.index_to_key
print("Number of words in the vocabulary list : %d \n" %len(vocabulary))
# message fixed: it previously read "vocalbulary list  vocabulary list"
print("Show first 10 words in the vocabulary list: \n", vocabulary[0:10])

In [None]:
# function to obtain a single feature vector for one review from the word2vec model
def make_feature_vec(text, model, num_feat):
    '''
    Transform a review to a feature vector by averaging the vectors of the words
    that appear both in that review and in the model's vocabulary.

    Parameters
    ----------
    text : iterable of str
        The words of one review.
    model : object
        Trained model exposing model.wv.key_to_index (word -> index mapping)
        and model.wv[word] (vector lookup), as gensim 4 Word2Vec does.
    num_feat : int
        Length of the returned vector; expected to match the model's vector size.

    Returns
    -------
    numpy.ndarray of shape (num_feat,): the mean word vector, or all zeros when
    no word of the review is in the vocabulary.
    '''
    feat_vector = np.zeros((num_feat,), dtype="float32")
    # membership test against the model's own word -> index dict; this avoids
    # copying the whole vocabulary into a new set on every call
    vocabulary = model.wv.key_to_index
    nwords = 0
    for word in text:
        if word in vocabulary:
            nwords += 1
            feat_vector = np.add(feat_vector, model.wv[word])
    if nwords:  # leave the zero vector untouched when nothing matched
        feat_vector = np.divide(feat_vector, nwords)
    return feat_vector

In [None]:
# function to build the matrix of averaged feature vectors, one row per review
def get_avg_feature_vecs(texts, model, num_feat):
    '''
    Transform all reviews to feature vectors using make_feature_vec()

    Returns a float32 array of shape (len(texts), num_feat) whose i-th row is
    the feature vector of the i-th review in texts.
    '''
    review_feature_vectors = np.zeros((len(texts), num_feat), dtype="float32")
    for row, words in enumerate(texts):
        review_feature_vectors[row] = make_feature_vec(words, model, num_feat)
    return review_feature_vectors

In [None]:
# Get feature vectors for training set: tokenize each review without stop
# words, then average the Word2Vec vectors of its words
X_TRAIN_CLEANED = [
    clean_text(body, remove_stopwords=True, split_text=True)
    for body in train_df['body']
]
TRAIN_VECTOR = get_avg_feature_vecs(X_TRAIN_CLEANED, W2V, NUM_FEATURES)
print("Training set : %d feature vectors with %d dimensions" %TRAIN_VECTOR.shape)

In [None]:
# Longest training review, measured in whitespace-separated tokens of the raw text.
# NOTE(review): max_length is not referenced by any later cell shown in this notebook.
max_length = max(len(body.split()) for body in train_df['body'])

# Step 5 - Applying Algorithm and Train the model on training data

In [None]:
# Random Forest Classifier trained on the averaged Word2Vec review vectors.
# random_state is fixed so that, for a given TRAIN_VECTOR, repeated runs
# build the same forest and report the same scores.
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(TRAIN_VECTOR, train_df['sentiment'])

# Step 6 - Preparing Test Data


In [None]:
# Load the mobile review test data from a CSV file in the working directory.
# Later cells read the 'rating' and 'body' columns of this frame.
test_df = pd.read_csv('testing-dataset.csv')
test_df.head()

In [None]:
# Drop every test row that has a missing value in any column (mutates test_df in place)
test_df.dropna(inplace=True)

In [None]:
# Encode 4s and 5s as 1 (positive sentiment) and 1s, 2s and 3s as 0 (negative sentiment),
# using the same rule as for the training frame
test_df['sentiment'] = np.where(test_df['rating'] > 3, 1, 0)
test_df.head()

In [None]:
# Cleaned text (one space-joined string per review) for every test review.
# NOTE(review): test_clean is not referenced by any later cell shown in this notebook.
test_clean = [clean_text(body) for body in test_df['body']]

In [None]:
# Get feature vectors for validation set, built exactly like the training
# vectors (stop words removed, word vectors averaged per review)
X_TEST_CLEANED = [
    clean_text(body, remove_stopwords=True, split_text=True)
    for body in test_df['body']
]
TEST_VECTOR = get_avg_feature_vecs(X_TEST_CLEANED, W2V, NUM_FEATURES)
print("Validation set : %d feature vectors with %d dimensions" %TEST_VECTOR.shape)

# Step 7 - Scoring the model on Test Dataset

In [None]:
# Predict a sentiment class for every test review vector with the trained forest
PREDICTIONS = RF.predict(TEST_VECTOR)

# Step 8 - Evaluating the model

In [None]:
# function to evaluate the model
def model_evaluation(predict, y_true=None):
    '''
    Print model evaluation to predicted result

    Parameters
    ----------
    predict : array-like
        Predicted labels, in the same order as y_true.
    y_true : array-like, optional
        True labels. When omitted, test_df['sentiment'] from the notebook
        namespace is used, which keeps existing one-argument calls working.

    Prints accuracy, AUC, the classification report and the confusion matrix.
    '''
    if y_true is None:
        y_true = test_df['sentiment']
    print("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_true, predict)))
    # NOTE(review): when `predict` holds hard class labels rather than scores or
    # probabilities, this AUC is computed from a single operating point
    print("\nAUC score : {:.4f}".format(roc_auc_score(y_true, predict)))
    print("\nClassification report : \n", metrics.classification_report(y_true, predict))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_true, predict))

# Print the evaluation metrics for the Random Forest predictions on the test set
model_evaluation(PREDICTIONS)

In [None]:
# Heat map of the confusion matrix: rows are true labels, columns are predicted labels
CONFUSION_MATRIX = metrics.confusion_matrix(test_df['sentiment'], PREDICTIONS)
class_names = ["Negative", "Positive"]
sns.heatmap(
    CONFUSION_MATRIX,
    annot=True,
    fmt='g',
    xticklabels=class_names,
    yticklabels=list(class_names),
)
plt.ylabel('True label')
plt.xlabel('Predicted label')

In [None]:
# function to evaluate the model
# NOTE(review): an earlier cell already defines model_evaluation; this later
# definition replaces it when the notebook runs top to bottom. Consider
# keeping only one of the two.
def model_evaluation(predict, y_true=None):
    '''
    Print model evaluation to predicted result

    Parameters
    ----------
    predict : array-like
        Predicted labels, in the same order as y_true.
    y_true : array-like, optional
        True labels. When omitted, test_df['sentiment'] from the notebook
        namespace is used, which keeps existing one-argument calls working.

    Prints accuracy, AUC, the classification report and the confusion matrix.
    '''
    if y_true is None:
        y_true = test_df['sentiment']
    print("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_true, predict)))
    # NOTE(review): when `predict` holds hard class labels rather than scores or
    # probabilities, this AUC is computed from a single operating point
    print("\nAUC score : {:.4f}".format(roc_auc_score(y_true, predict)))
    print("\nClassification report : \n", metrics.classification_report(y_true, predict))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_true, predict))

# Print the evaluation metrics for the Random Forest predictions
# (repeats the output of the earlier evaluation cell)
model_evaluation(PREDICTIONS)