In [55]:
import contractions
import numpy as np
import pandas as pd
import pickle 
import plotly.express as px
import spacy
import time

In [24]:
# Load the spaCy pipeline used by every later cell via the global `nlp`.
# The large English model ("en_core_web_lg") must be installed beforehand;
# per the original note, the larger model is required for vectorisation.
nlp = spacy.load("en_core_web_lg")

In [3]:
# Read review file
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that exact location exists — consider a configurable data directory.
file_dir = "C:/Users/mnelo/Documents/masters/ANLP/AT3/Reviews.csv"
data = pd.read_csv(file_dir)
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [36]:
# Basic processing. 
# Spacy adjusts for punctuation, upper case, etc. 
# Stop words are not removed as they may also add to sentence context. 

def text_processing(df):
    """Add a cleaned version of the review text to ``df`` as a ``processed`` column.

    Steps, in order:
      1. Expand contractions with ``contractions.fix``.
      2. Replace every HTML tag (e.g. ``<br />``) with a single space, so the
         words on either side of a tag are not glued together.
      3. Collapse runs of whitespace into one space and trim both ends, which
         removes the padding step 2 can leave behind.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Text`` column; its values are expected to be strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, mutated in place, with the new ``processed`` column.
    """
    # Replace contractions
    expanded = df['Text'].apply(contractions.fix)

    # Remove HTML tags, keeping a separator where each tag used to be
    without_tags = expanded.str.replace(r'<[^>]+>', ' ', regex=True)

    # Normalise the whitespace introduced above
    df['processed'] = without_tags.str.replace(r'\s+', ' ', regex=True).str.strip()

    return df

# text_processing adds the 'processed' column to the frame it is given and returns that same frame.
data = text_processing(data)

In [157]:
# Spacy's nlp function (corpus processing) is very time consuming because of vectorisations. 
# Generator below to process data in chunks.
def nlp_chunking(df, column, chunk_size=10000):
    """Lazily run the global ``nlp`` callable over ``df[column]`` in consecutive chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the values to process.
    column : str
        Name of the column whose values are passed to ``nlp`` one at a time.
    chunk_size : int, default 10000
        Maximum number of rows per yielded chunk; must be positive.

    Yields
    ------
    pandas.Series
        ``nlp`` applied to every value of one chunk, carrying the chunk's
        original index labels. Chunks do not overlap and together cover each
        row exactly once; an empty frame yields nothing.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive (raised when iteration starts,
        because this is a generator).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(df)
    for start in range(0, total_rows, chunk_size):
        # `end` is an absolute position, so the positional slice is [start, end).
        end = min(start + chunk_size, total_rows)
        yield df.iloc[start:end][column].apply(nlp)

In [None]:
# WARNING - Entire dataframe cannot be processed at once. Will not fit in memory. 
# Breaking down by 100k rows is still quite time consuming. 
# Chunks are appended one by one, so chunks finished before an interruption stay in nlp_results.
# NOTE(review): this passes the raw 'Text' column, not the 'processed' column created
# by text_processing — confirm that skipping the cleaned text is intended.
nlp_results = []
for chunk in nlp_chunking(data[100000:200000], 'Text'):
    nlp_results.append(chunk) 

In [169]:
# If above cell has been run, option to save down pickle file for later use.
# NOTE(review): only the list elements from index 9 onward are written; judging by the
# preview output further down (rows 190000-199999) that is presumably just the final
# 10k-row chunk — confirm the magic index.
# NOTE(review): hard-coded drive path; it must exist on the machine running this cell.
with open('F:/nlp_result_4.pickle', 'wb') as handle:
    pickle.dump(nlp_results[9:], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [172]:
# Optional: Load previous pickle file. 
# Replaces nlp_results with whatever list was saved by the cell above.
# NOTE(review): unpickling can execute arbitrary code — only load files you created yourself.
with open('F:/nlp_result_4.pickle', 'rb') as handle:
    nlp_results = pickle.load(handle)

In [193]:
# Stitch the per-chunk Series back into a single Series and preview the first rows.
pd.concat(nlp_results).head()

190000    (The, Orville, Redenbacher, brand, has, meant,...
190001    (I, 've, popped, gallons, of, Orville, Redenba...
190002    (This, is, the, best, popping, corn, ever, ., ...
190003    (My, dog, loves, to, chew, ,, does, n't, yours...
190004    (I, 'll, be, honest, ,, the, Chinese, Elm, Bon...
                                ...                        
199995    (Could, n't, find, this, one, in, our, local, ...
199996    (This, was, the, first, kind, my, family, and,...
199997    (I, have, healthy, 45, year, old, teeth, and, ...
199998    (You, might, as, well, eat, the, box, it, came...
199999    (Let, me, set, the, scene, for, you, :, three,...
Name: Text, Length: 10000, dtype: object

In [200]:
# Add scores from original dataset to NLP results.
# Rows are matched on the index labels carried by the processed documents instead of
# a hard-coded label range, so this cell works for whichever chunks are loaded.
nlp_docs = pd.concat(nlp_results)
score_columns = ['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score']
nlp_df = pd.concat([data.loc[nlp_docs.index, score_columns], nlp_docs], axis=1)
# Share of helpful votes; NaN where a review received no votes (0 / 0).
nlp_df['helpfulness'] = nlp_df['HelpfulnessNumerator'] / nlp_df['HelpfulnessDenominator']

# Create some summary stats
nlp_df['sentences'] = nlp_df['Text'].apply(lambda doc: list(doc.sents))
nlp_df['num_sent'] = nlp_df['sentences'].apply(len)
sentence_lengths = nlp_df['sentences'].apply(lambda sents: [len(sent) for sent in sents])
# A document without sentences gets 0 / NaN instead of raising on max() of an empty list.
nlp_df['max_sent_len'] = sentence_lengths.apply(lambda lengths: max(lengths, default=0))
nlp_df['mean_sent_len'] = sentence_lengths.apply(lambda lengths: np.mean(lengths) if lengths else np.nan)

# Extract Entity names
nlp_df['ner'] = nlp_df['Text'].apply(lambda doc: list(doc.ents))
nlp_df['ner_len'] = nlp_df['ner'].apply(len)
nlp_df

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Text,helpfulness,sentences,num_sent,max_sent_len,mean_sent_len,ner,ner_len
190000,0,0,5,"(The, Orville, Redenbacher, brand, has, meant,...",,"[(The, Orville, Redenbacher, brand, has, meant...",6,54,27.833333,"[(decades), (today), (Orville), (first), (two)...",6
190001,0,1,2,"(I, 've, popped, gallons, of, Orville, Redenba...",0.000000,"[(I, 've, popped, gallons, of, Orville, Redenb...",7,33,22.428571,"[(Orville, Redenbacher), (Presto, PowerPop), (...",10
190002,0,1,5,"(This, is, the, best, popping, corn, ever, ., ...",0.000000,"[(This, is, the, best, popping, corn, ever, .)...",18,31,15.222222,"[(two), (1/3), (Orville, Redenbacher), (Salt),...",5
190003,3,3,4,"(My, dog, loves, to, chew, ,, does, n't, yours...",1.000000,"[(My, dog, loves, to, chew, ,, does, n't, your...",8,81,23.625000,"[(USA), (matter?<br), (Whole, Dog, Journal), (...",6
190004,0,1,5,"(I, 'll, be, honest, ,, the, Chinese, Elm, Bon...",0.000000,"[(I, 'll, be, honest, ,, the, Chinese, Elm, Bo...",6,21,15.000000,"[(Chinese), (9GreenBox), (countless, years)]",3
...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0,5,"(Could, n't, find, this, one, in, our, local, ...",,"[(Could, n't, find, this, one, in, our, local,...",2,11,10.000000,[],0
199996,1,2,1,"(This, was, the, first, kind, my, family, and,...",0.500000,"[(This, was, the, first, kind, my, family), (a...",13,36,20.076923,"[(first), (those, after, hour), (the, morning)...",7
199997,4,7,1,"(I, have, healthy, 45, year, old, teeth, and, ...",0.571429,"[(I, have, healthy, 45, year, old, teeth, and,...",5,27,17.400000,"[(45, year, old)]",1
199998,0,7,2,"(You, might, as, well, eat, the, box, it, came...",0.000000,"[(You, might, as, well, eat, the, box, it, cam...",2,70,41.000000,[],0


## EDA

In [208]:
# Distribution of the number of sentences per review.
fig = px.histogram(
    nlp_df,
    x='num_sent',
    labels={'num_sent': 'Number of Sentences'},
    color_discrete_sequence=px.colors.qualitative.Dark24,
    title='Number of Sentences per Review (sample size 10k)',
)
fig.update_layout(bargap=0.1, template='plotly_dark', yaxis_title='Count')
fig.show()

In [206]:
# Horizontal box plots of sentence count, one box per review score.
fig = px.box(
    nlp_df,
    x='num_sent',
    y='Score',
    orientation='h',
    labels={'num_sent': 'Number of Sentences'},
    color_discrete_sequence=px.colors.qualitative.Dark24,
    title='Number of Sentences vs Score (sample size 10k)',
)
fig.update_layout(template='plotly_dark')
fig.show()

In [209]:
# Sentence count against the helpfulness ratio of each review.
fig = px.scatter(
    nlp_df,
    x='num_sent',
    y='helpfulness',
    labels={'num_sent': 'Number of Sentences'},
    color_discrete_sequence=px.colors.qualitative.Dark24,
    title='Number of Sentences vs Helpfulness (sample size 10k)',
)
fig.update_layout(template='plotly_dark')
fig.show()

In [184]:
# Distribution of the longest sentence found in each review.
fig = px.histogram(
    nlp_df,
    x='max_sent_len',
    labels={'max_sent_len': 'Sentence Length'},
    color_discrete_sequence=px.colors.qualitative.Dark24[1:],
    title='Longest Sentences per Review (sample size 10k)',
)
fig.update_layout(bargap=0.1, template='plotly_dark', yaxis_title='Count')
fig.show()

In [210]:
# Horizontal box plots of the longest sentence length, one box per review score.
fig = px.box(
    nlp_df,
    x='max_sent_len',
    y='Score',
    orientation='h',
    labels={'max_sent_len': 'Sentence Length'},
    color_discrete_sequence=px.colors.qualitative.Dark24[1:],
    title='Longest Sentences vs Score (sample size 10k)',
)
fig.update_layout(template='plotly_dark')
fig.show()

In [211]:
# Longest sentence length against the helpfulness ratio of each review.
fig = px.scatter(
    nlp_df,
    x='max_sent_len',
    y='helpfulness',
    labels={'max_sent_len': 'Sentence Length'},
    color_discrete_sequence=px.colors.qualitative.Dark24[1:],
    title='Longest Sentences vs Helpfulness (sample size 10k)',
)
fig.update_layout(template='plotly_dark')
fig.show()

In [185]:
# Distribution of the average sentence length per review.
# The axis label names the statistic explicitly ("Average ..."), matching the
# box plot of the same column, so it cannot be mistaken for the longest-sentence charts.
fig = px.histogram(nlp_df, x='mean_sent_len',
                   labels={'mean_sent_len': 'Average Sentence Length'},
                   color_discrete_sequence=px.colors.qualitative.Dark24[2:],
                   title='Average Sentence Length per Review (sample size 10k)')
fig.update_layout(bargap=0.1, template='plotly_dark', yaxis_title='Count')
fig.show()

In [212]:
# Horizontal box plots of average sentence length, one box per review score.
fig = px.box(
    nlp_df,
    x='mean_sent_len',
    y='Score',
    orientation='h',
    labels={'mean_sent_len': 'Average Sentence Length'},
    color_discrete_sequence=px.colors.qualitative.Dark24[2:],
    title='Average Sentence Length vs Score (sample size 10k)',
)
fig.update_layout(template='plotly_dark')
fig.show()

In [213]:
# Average sentence length against the helpfulness ratio of each review.
# The axis label names the statistic explicitly ("Average ..."), matching the
# box plot of the same column.
fig = px.scatter(nlp_df, x='mean_sent_len', y='helpfulness',
                 labels={'mean_sent_len': 'Average Sentence Length'},
                 color_discrete_sequence=px.colors.qualitative.Dark24[2:],
                 title='Average Sentence Length vs Helpfulness (sample size 10k)')
fig.update_layout(template='plotly_dark')
fig.show()

In [186]:
# Distribution of the number of named entities detected per review.
fig = px.histogram(
    nlp_df,
    x='ner_len',
    labels={'ner_len': 'Number of Entities'},
    color_discrete_sequence=px.colors.qualitative.Dark24[3:],
    title='Entities per Review (sample size 10k)',
)
fig.update_layout(bargap=0.1, template='plotly_dark', yaxis_title='Count')
fig.show()

In [214]:
# Horizontal box plots of entity count, one box per review score.
fig = px.box(
    nlp_df,
    x='ner_len',
    y='Score',
    orientation='h',
    labels={'ner_len': 'Number of Entities'},
    color_discrete_sequence=px.colors.qualitative.Dark24[3:],
    title='Entities Mentioned vs Score (sample size 10k)',
)
fig.update_layout(template='plotly_dark')
fig.show()

In [215]:
# Entity count against the helpfulness ratio of each review.
# The x axis shows 'ner_len' (entities per review), so it is labelled as an
# entity count, consistent with the other entity charts.
fig = px.scatter(nlp_df, x='ner_len', y='helpfulness',
                 labels={'ner_len': 'Number of Entities'},
                 color_discrete_sequence=px.colors.qualitative.Dark24[3:],
                 title='Entities Mentioned vs Helpfulness (sample size 10k)')
fig.update_layout(template='plotly_dark')
fig.show()

In [216]:
# Placeholder for analysis of lemmatised text with stop words removed
def nlp_process(series):
    """Return the lemmas of the non-stop-word tokens of each sentence.

    Parameters
    ----------
    series : iterable
        Iterable of sentences; each sentence is itself an iterable of tokens
        exposing ``lemma_`` and ``is_stop`` attributes.

    Returns
    -------
    list[list]
        One inner list per sentence, in input order, holding ``lemma_`` of every
        token whose ``is_stop`` is falsy. Sentences with no such token give ``[]``.
    """
    return [
        [token.lemma_ for token in sentence if not token.is_stop]
        for sentence in series
    ]

# For each review: one list of lemmas (stop words dropped) per sentence.
nlp_df['processed'] = nlp_df['sentences'].apply(nlp_process)

## Clustering

In [221]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA

In [218]:
# One vector per review (the `.vector` attribute of each processed document),
# stacked into a 2-D frame with one row per review.
# np.vstack builds the matrix in a single step; `.apply(pd.Series)` creates a
# Series object per row and is far slower for the same values.
vectors = nlp_df['Text'].apply(lambda x: x.vector)
vec_df = pd.DataFrame(np.vstack(vectors.to_numpy()), index=vectors.index)

# n=3 for visualisation. For modeling use a higher n.
# random_state keeps the projection reproducible in case scikit-learn selects a
# randomized SVD solver for data of this size.
pca = PCA(n_components=3, random_state=42)
pca_fitted = pca.fit_transform(vec_df)

In [219]:
# Name the three principal components so they can be used as plot axes.
# NOTE: no index is passed, so rows are numbered from 0 (see output) rather than
# carrying the review index labels of nlp_df.
pca_df = pd.DataFrame(pca_fitted, columns=['x', 'y', 'z'])
pca_df

Unnamed: 0,x,y,z
0,2.732997,-1.820448,-1.805919
1,4.142956,-1.747589,-1.315306
2,4.958202,-3.451770,-1.976848
3,2.022873,-1.050215,-3.301380
4,-2.314983,-2.446839,-1.943270
...,...,...,...
9995,-0.564707,5.191981,-1.508976
9996,-0.071814,1.828592,0.217747
9997,-7.248748,1.072634,-1.509653
9998,-3.913288,-3.546699,2.032552


In [233]:
from sklearn.neighbors import NearestNeighbors

In [235]:
# Use nearest neighbors to find a good epsilon input

# Fit Model
# The same points are used for fitting and querying, so each point's closest
# match is itself; with n_neighbors=2, column 1 holds the distance to the
# nearest *other* point.
nn_model = NearestNeighbors(n_neighbors=2)
nn = nn_model.fit(pca_df)
distances, _indices = nn.kneighbors(pca_df)

# Sort and plot distances
distances = np.sort(distances, axis=0)
# A bare array is plotted in wide form, where the axes are named 'index' and
# 'value'; label those (the earlier 'ner_len' key matched nothing in this figure).
fig = px.line(distances[:, 1],
              labels={'index': 'Points (sorted by distance)',
                      'value': 'Distance to Nearest Neighbour'},
              color_discrete_sequence=px.colors.qualitative.Dark24,
              title='Distance of Nearest Neighbours')
# Single trace, so the auto-generated legend adds no information.
fig.update_layout(template='plotly_dark', showlegend=False)
fig.show()

In [244]:
# Without domain knowledge, pick min_samples >= dimensions + 1; here dimensions + 2 is used.
n_dimensions = pca_df.shape[1]
min_samples = n_dimensions + 2

# Density-based clustering on the three PCA components
dbscan = DBSCAN(eps=1.5, min_samples=min_samples)
clustering = dbscan.fit(pca_df)

# Attach the cluster labels, order rows by the numeric label, and only then turn the
# labels into strings (presumably so plotly treats them as discrete categories).
chart_df = (
    pca_df
    .assign(labels=clustering.labels_)
    .sort_values(['labels'])
    .astype({'labels': str})
)

In [257]:
# Plot the clustered points in PCA space, coloured by cluster label
fig = px.scatter_3d(
    chart_df,
    x='x',
    y='y',
    z='z',
    color='labels',
    opacity=0.7,
)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()