In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import nltk
import string

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os

from sklearn.utils import shuffle

from sklearn.manifold import TSNE

In [4]:
#defining all the required functions here
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

def cleanHTML(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext

def cleanPunctuation(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r'', sentence)
    return cleaned


#removing stop words
stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initializing the Snowball Stemmer

final_dataset_path = '/home/monodeepdas112/Datasets/amazon-fine-food-reviews/final.sqlite'
initial_dataset_path = '/home/monodeepdas112/Datasets/amazon-fine-food-reviews/database.sqlite'

In [9]:
if os.path.isfile(final_dataset_path):
    #loading the cleaned dataset if present
    con = sqlite3.connect(final_dataset_path)
    data = pd.read_sql_query('select * from Reviews', con)
    
    with open('positive_words.pkl', 'rb') as f:
        all_positive_words = pickle.load(f)
    with open('negitive_words.pkl', 'rb') as f:
        all_negative_words = pickle.load(f)
else:
    #cleaning the dataset and making the finally cleaned dataset if the cleaned dataset is not present
    con = sqlite3.connect(initial_dataset_path)
    filtered_data = pd.read_sql_query('select * from Reviews where score != 3', con)
    
    #replacing the score column to contain only positive or negative rather than continuous range of values
    actualScore = filtered_data['Score']
    positiveNegativeScore = actualScore.map(partition)
    filtered_data['Score'] = positiveNegativeScore
    
    #data deduplication
    sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
    data = sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
    
    #removing some wrong data points
    data = data[data.HelpfulnessNumerator<=data.HelpfulnessDenominator]
    
    #text pre-processing
    final_string=[]
    all_positive_words=[] # store words from +ve reviews here
    all_negative_words=[] # store words from -ve reviews here.
    for i, sent in enumerate(tqdm(data['Text'].values)):
        filtered_sentence=[]
        sent = cleanHTML(sent)#removing HTML tags
        for w in sent.split():
            for cleaned_words in cleanPunctuation(w).split():
                if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                    if(cleaned_words.lower() not in stop):
                        s=(sno.stem(cleaned_words.lower()))
                        filtered_sentence.append(s)
                        if (final['Score'].values)[i] == 'positive':
                            all_positive_words.append(s) #list of all words used to describe positive reviews
                        if(final['Score'].values)[i] == 'negative':
                            all_negative_words.append(s) #list of all words used to describe negative reviews reviews
        str1 = ' '.join(filtered_sentence) #final string of cleaned words
        final_string.append(str1)
    
    #############---- storing the data into final.sqlite file ------########################
    data['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review 
    data['CleanedText']=final['CleanedText']
    
    conn = sqlite3.connect(final_dataset_path)
    c=conn.cursor()
    conn.text_factory = str
    data.to_sql('Reviews', conn,  schema=None, if_exists='replace', \
                 index=True, index_label=None, chunksize=None, dtype=None)
    conn.close()
    
    #saving the dictionaries
    with open('positive_words.pkl', 'wb') as f:
        pickle.dump(all_positive_words, f)
    with open('negitive_words.pkl', 'wb') as f:
        pickle.dump(all_negative_words, f)

In [12]:
data.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read sendak book watch realli rosi movi i...
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn month learn poem throug...
3,138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,great littl book read nice rhythm well good re...
4,138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,book poetri month year goe month cute littl po...


In [43]:
min_data_points = 1000

In [44]:
pos_data = data[data.Score=='positive'].loc[:min_data_points, ['CleanedText', 'Score']]
neg_data = data[data.Score=='negative'].loc[:min_data_points, ['CleanedText', 'Score']]
part_data = pos_data.append(neg_data)
part_data = shuffle(part_data)

In [45]:
part_data.head(100)

Unnamed: 0,CleanedText,Score
248,cours alreadi dvd upgrad halloween forget bran...,positive
476,use trap mani year work never break alway catc...,positive
11,author wrote wild thing carol king wrote great...,positive
279,movi top notch lot way great famili movi may a...,positive
409,beetlejuic funni love come dvd thank maitland ...,positive
891,averag baker use cupcak ice far extrem pleas o...,positive
652,found video review help quick figur set trap h...,positive
100,dog anyth treat smell bad mani treat easi brea...,positive
768,product use catch cluster fli pest septemb jan...,negative
283,movi tim burton three name come lot discuss co...,positive


## T-SNE with Bag of Words

In [48]:
#Generating the Count Vectors for the cleaned Text with bi-grams and unigrams
cnt_vec = CountVectorizer(ngram_range=(1,2))
count_vectors = cnt_vec.fit_transform(part_data['CleanedText'].values)
print("the shape of out text BOW vectorizer ",count_vectors.get_shape())
print("the number of unique words ", count_vectors.get_shape()[1])

the shape of out text BOW vectorizer  (1001, 37251)
the number of unique words  37251


In [None]:
#setting up t-SNE
features = count_vectors.todense()
labels = part_data['Score'].values

model = TSNE(n_components=1, random_state=0, perplexity=50, n_iter=5000)
tsne_data = model.fit_transform(features)

In [None]:
tsne_data.shape

In [57]:
#creating a new dataframe which help in plotting the result data
tsne_data = np.vstack((tsne_data.T, labels)).T
tsne_df = pd.DataFrame(data=tsne_data, columns=('Count_vect', 'label'))

#plotting the results
sns.FacetGrid(tsne_df, hue='label', height=3)
plt.show()

ValueError: Shape of passed values is (6, 1001), indices imply (2, 1001)