# Word vectors

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_lg')

In [None]:
# access vectors of tokens
nlp(u'lion').vector

In [None]:
# doc and span objects have corresponding vectors themselves, derived from the averages of individual token vectors
nlp(u'The quick brown fox jumped').vector

In [None]:
# 300 dim vectors
nlp(u'lion').vector.shape

In [None]:
nlp(u'The quick brown fox jumped').vector.shape

In [None]:
# vectors are of type numpy.ndarray
type(nlp(u'lion').vector)

In [None]:
# check pairwise similarity between tokens
tokens = nlp(u'lion cat pet')

# compare every token with every token (itself included) and print the similarity
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

In [None]:
# pairwise similarity for a second set of words
tokens = nlp(u'like love hate despise')

for token1 in tokens:
    # blank lines separate the output group of each first token
    print('\n')
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

In [None]:
# number of vectors
len(nlp.vocab.vectors)

In [None]:
nlp.vocab.vectors.shape

In [None]:
# for each token print: its text, whether it has a vector, its vector norm,
# and whether it is out-of-vocabulary
tokens = nlp(u'dog cat nargle John')

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
    

In [None]:
# L2 norm of the vector: square root of the sum of the squares of all 300 dimensions
nlp('cat').vector_norm

In [None]:
# compute new vectors
from scipy import spatial


def cosine_similarity(x, y):
    """Return the cosine similarity of vectors x and y (1 - cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# king - man + woman = new vector for ~ queen, princess, highness, ...
new_vector = king - man + woman

# Score every lowercase, alphabetic vocab entry that has a vector against
# new_vector; each result is a (lexeme, similarity) pair.
# NOTE(review): this only sees the lexemes nlp.vocab yields when iterated;
# newer spaCy versions appear to fill the vocab lazily - confirm the candidate
# set is as large as expected.
computed_similarities = [
    (word, cosine_similarity(new_vector, word.vector))
    for word in nlp.vocab
    if word.has_vector and word.is_lower and word.is_alpha
]
                

In [None]:
# sort by similarity (item[1]) in descending order; negating the key reverses
# the default ascending sort
computed_similarities = sorted(computed_similarities, key=lambda item:-item[1])

In [None]:
# print top 10 similar words
print([t[0].text for t in computed_similarities[:10]])

# Sentiment analysis

In [None]:
import nltk

In [None]:
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [None]:
# takes a string and returns a dict of scores in four categories: negative, neutral, positive (each 0 to 1) and compound (-1 to 1)
string = "I liked it"
sid.polarity_scores(string)

In [None]:
string = "I loved it"
sid.polarity_scores(string)

In [None]:
string = "I LOVED it!!!"
sid.polarity_scores(string)

In [None]:
# compound is negative for an overall negative sentiment
string = "I loved it, but hated it at the same time"
sid.polarity_scores(string)

In [None]:
import pandas as pd

df = pd.read_csv('../../pythongyak/UPDATED_NLP_COURSE/TextFiles/amazonreviews.tsv', sep='\t')

In [None]:
df.head()

In [None]:
len(df)

In [None]:
df['label'].value_counts()

In [None]:
#check for null values
df.isnull().sum()

In [None]:
len(df)

In [None]:
# delete if there are any
df.dropna(inplace=True)

In [None]:
# check for empty or whitespace-only review strings
# Iterate the 'review' column directly so the check keeps working no matter
# how many columns df has (e.g. when this cell is re-run after new columns
# were added). `not rv.strip()` also catches the truly empty string, which
# `isspace()` reports as False.
blanks = [
    i
    for i, rv in df['review'].items()
    if isinstance(rv, str) and not rv.strip()
]

blanks

In [None]:
# delete if there are any
df.drop(blanks, inplace=True)

In [None]:
len(df)

In [None]:
# check first review
df.iloc[0]['review']

In [None]:
sid.polarity_scores(df.iloc[0]['review'])

In [None]:
# run the sentiment analyzer on every review and keep the resulting score dict in a new column
df['scores'] = df['review'].apply(sid.polarity_scores)

In [None]:
df.head()

In [None]:
# extract compound scores
df['compound'] = df['scores'].apply(lambda score: score['compound'])
df.head()

In [None]:
# label compound scores: 'pos' when compound >= 0 (so a score of exactly 0
# counts as positive), otherwise 'neg'
df['comp_score'] = df['compound'].apply(lambda score: 'pos' if score>=0 else 'neg')
df.head()

In [None]:
# compare predicted labels with actual labels
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

accuracy_score(df['label'],df['comp_score'])

In [None]:
print(classification_report(df['label'],df['comp_score']))

In [None]:
print(confusion_matrix(df['label'],df['comp_score']))

# Project outlines

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('../../pythongyak/UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv', sep='\t')

In [None]:
df.dropna(inplace=True)

In [None]:
# Collect the index label of every empty or whitespace-only review, then drop
# those rows. Reading the 'review' column directly keeps this independent of
# the number of columns in df; `not rv.strip()` also catches the truly empty
# string, which `isspace()` reports as False.
blanks = [
    i
    for i, rv in df['review'].items()
    if isinstance(rv, str) and not rv.strip()
]

df.drop(blanks, inplace=True)

In [None]:
df['label'].value_counts()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [None]:
# score every review with the sentiment analyzer; each cell holds the returned dict
df['scores'] = df['review'].apply(sid.polarity_scores)

In [None]:
df['compound'] = df['scores'].apply(lambda score: score['compound'])

In [None]:
# predicted label: 'pos' when compound >= 0 (a score of exactly 0 counts as positive), otherwise 'neg'
df['pred_label'] = df['compound'].apply(lambda cp: 'pos' if cp>=0 else 'neg')

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# Each report is a heading plus the metric that compares the true labels
# with the predicted ones.
reports = (
    ('Accuracy score:', accuracy_score),
    ('Classification report:', classification_report),
    ('Confusion matrix:', confusion_matrix),
)

for position, (title, metric) in enumerate(reports):
    # blank lines between consecutive reports, none before the first
    if position:
        print('\n')
    print(title)
    print(metric(df['label'],df['pred_label']))