In [4]:
import json
import pandas as pd
import numpy as np
import os
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

from sklearn.cluster import KMeans

from nltk.tokenize import word_tokenize,RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

### Load the data, filter on English podcasts and insert into dataframe

In [6]:
cd ..

/home/jdh/Desktop


In [7]:
# Set directories of main dataset and metadata
directory_main = 'podcast_data_no_audio/transcripts/podcasts-transcripts/2'
directory_meta = "podcast_data_no_audio/metadata/metadata.tsv"

# Get metadata and filter out non-english podcasts
dftest = pd.read_csv(directory_meta, sep='\t')
dftest = dftest[dftest['language'].isin(["['en']","['en-US']", "['en-AU']", "['en-CA']", "['en-GB']",
       "['en-NZ']"])]

# Create list of english podcasts
english_id = dftest['episode_filename_prefix'].values
# Set copy of the ids: membership is tested once per transcript file, and a
# lookup in the numpy array would scan every id each time.
english_id_set = set(english_id)

df = pd.DataFrame()
df_textlist = []  # one concatenated transcript per English episode
df_idlist = []    # matching episode ids (file name without ".json")

# Walk through all subdirs and find jsons
for subdir, dirs, files in os.walk(directory_main):

    for filename in files:
        if not filename.endswith(".json"):
            continue

        # The file name without its extension is the episode id used in the metadata
        p_id = filename.replace('.json', '')
        if p_id not in english_id_set:
            continue

        filepath = os.path.join(subdir, filename)
        with open(filepath, encoding='utf-8') as f:
            data = json.load(f)

        # Only the first alternative of every result is used; entries that
        # carry no 'transcript' key contribute an empty string.
        utterances = [
            result['alternatives'][0].get('transcript', '')
            for result in data['results']
        ]
        utterancestr = ''.join(utterances)

        df_textlist.append(utterancestr.strip())
        df_idlist.append(p_id)

### Create text to train model with

In [32]:
# Base name of the saved model; the file on disk gets a ".model" suffix
model_name = "test"
model_format = f"{model_name}.model"

In [8]:
# Create tokenized text
# The two lists were filled in parallel while reading the transcripts, so row i
# of `text` belongs to row i of `id`.
df['text'] = df_textlist
df['id'] = df_idlist

# Tokens are maximal runs of word characters (\w+); punctuation is dropped
tokenizer = RegexpTokenizer(r'\w+')
df['text_tokenized'] = df['text'].apply(tokenizer.tokenize)
text = df['text_tokenized'].values

# Learn word pairs from the tokenized transcripts and merge them into single
# tokens (they appear joined by '_' in the vocabulary, e.g. 'extremely_important'
# in the nearest-word output further down).
phrases = Phrases(text, min_count=1)
bigram = Phraser(phrases)
# Corpus with bigrams applied; this is what Word2Vec is trained on below
sentences = bigram[text]



- min_count = 5 
>ignore words that appear fewer than 5 times in the corpus; this removes the most unusual tokens from training, such as 'ssssuuuuuuuppppppeeeeeerrrr', which actually stands for 'super' and doesn't need an embedding of its own

- window = 4 
>the Word2Vec model learns to predict a given word from up to 4 words to its left and up to 4 words to its right

- size = 500 
>size of the hidden layer used to predict the surroundings of the embedded word, which is also the number of dimensions of the trained embeddings

- sample = 1e-5 
>threshold for randomly downsampling the most frequent words in the surroundings of the embedded word

- negative = 20 
>number of negative words (words that should not have been predicted for the selected pair of words) whose weights are updated on each training example, in addition to the weights of the positive word



In [33]:
# Leave one core free, but never request fewer than one worker thread:
# cpu_count() - 1 evaluates to 0 on a single-core machine.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# NOTE: `size` is the gensim < 4.0 spelling of the embedding dimension
# (renamed `vector_size` in gensim 4.0).
modelw2v = Word2Vec(min_count=5,
                     window=4,
                     size=500,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=n_workers)

# First pass over the corpus: collect the vocabulary (applies min_count)
modelw2v.build_vocab(sentences)
print("Vocab building done!")

# Second stage: train the embeddings for 5 epochs over the same corpus
modelw2v.train(sentences, total_examples=modelw2v.corpus_count, epochs=5, report_delay=1)
print("Training done!")

Vocab building done!
Training done!


In [34]:
# Save the current model for use later
# (model_format is a bare file name, so the file lands in the current working directory)
modelw2v.save(model_format)

# Load the model to use now
# Only the word vectors (.wv) are kept; the clustering below works on these
word_vectors = Word2Vec.load(model_format).wv

### K-means algorithm

In [None]:
# Initiate the K-means algorithm and find n clusters
# Two clusters are fitted on the word vectors (cast to double). random_state is
# given as an explicit integer seed; a boolean True is coerced to this same
# value 1, so the clustering itself is unchanged.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors.astype('double'))
# Provisional labelling only: which cluster is the "positive" one is not known
# at this point; a later cell reassigns both centers via positive_cluster_index.
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [21]:
# Find words that are close to the cluster center
# Lists the 10 vocabulary entries most similar to center 1 (whole vocabulary searched).
# Presumably used to decide by eye which cluster holds the positive words - swap the
# index to 0 to inspect the other cluster.
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)

[('Understand', 0.8800795078277588),
 ('internalize', 0.8799031972885132),
 ('compromising', 0.8732283115386963),
 ('extremely_important', 0.8664716482162476),
 ('limiting_belief', 0.8576003313064575),
 ('cognizant', 0.855897068977356),
 ('counterproductive', 0.8553060293197632),
 ('intrinsically', 0.8537331223487854),
 ('counterintuitive', 0.85052490234375),
 ('compartmentalize', 0.8488243818283081)]

In [24]:
# Index of the K-means cluster that is treated as "positive"; the other cluster is
# treated as negative.
# NOTE(review): hard-coded, while K-means cluster numbering is arbitrary - presumably
# picked by hand from the nearest-word listing; re-check after every retraining.
positive_cluster_index = 1
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
# With exactly two clusters, 1 - index selects the remaining one
negative_cluster_center = model.cluster_centers_[1-positive_cluster_index]

In [22]:
# Assign sentiment score according to cluster

# One row per vocabulary word, together with its embedding
words = pd.DataFrame({'words': list(word_vectors.vocab.keys())})
words['vectors'] = words['words'].apply(lambda word: word_vectors[word])

# Stack all embeddings once so K-means is called on the whole vocabulary at once
# instead of once per word; cast to double to match the dtype the model was fit on.
vector_matrix = np.vstack(words['vectors'].tolist()).astype('double')
words['cluster'] = model.predict(vector_matrix)

# +1 for words in the positive cluster, -1 for words in the other one
words['cluster_value'] = np.where(words['cluster'] == positive_cluster_index, 1, -1)
# Inverse distance to the nearest cluster center: words close to a center weigh more
words['closeness_score'] = 1 / model.transform(vector_matrix).min(axis=1)
# Signed score used later as the per-word sentiment weight
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

### Tf-idf weighting

In [None]:
# The cells below address the text as `file_weighting.title` and iterate over rows
# with axis=1, so the transcripts are kept in a one-column DataFrame named `title`
# (a plain Series has neither). The index is inherited from df, whose row labels
# are used further down as row positions in `transformed`.
file_weighting = df[['text']].rename(columns={'text': 'title'})

# word -> signed sentiment coefficient
sentiment_dict = dict(zip(words['words'].values, words['sentiment_coeff'].values))

# lowercase=False: TfidfVectorizer lower-cases its input by default, but the later
# per-token lookup splits the raw, mixed-case text, and the word2vec vocabulary is
# case-sensitive as well - keeping the case makes the tokens line up.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None, lowercase=False)
tfidf.fit(file_weighting.title)
# get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn;
# use whichever this installation provides.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
features = pd.Series(_get_feature_names())
transformed = tfidf.transform(file_weighting.title)

In [None]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for the input sentence x, where each word is assigned its tfidf score

    inspired by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes; its label
        `x.name` is used as the row position in transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix)
    features - names of all words in corpus used in TfidfVectorizer, as a pandas
        Series positionally aligned with the columns of transformed_file

    returns - dict mapping each word stored in that row to its tfidf score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Translate column positions to words in a separate array; the sparse matrix's
    # own integer index array is left untouched instead of being overwritten with strings.
    row_words = features.iloc[vector_coo.col].values
    return dict(zip(row_words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    turn the sentence of row x into the list of tfidf scores of its words, in order

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    raises KeyError when a whitespace-separated word of x.title has no tfidf score
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    tfidf_scores = []
    for token in x.title.split():
        tfidf_scores.append(scores_by_word[f'{token}'])
    return tfidf_scores

def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of a word in sentiment_dict;
    words without an entry score 0
    '''
    try:
        return sentiment_dict[word]
    except KeyError:
        # unknown word: neutral contribution
        return 0

In [None]:
# Per document: the tf-idf score of every whitespace-separated token, in order.
# NOTE(review): `axis=1` and `.title` require file_weighting to be a DataFrame with a
# `title` column - confirm against the cell that creates file_weighting.
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)
# Per document: the sentiment coefficient of the same tokens (0 for tokens that are
# not in sentiment_dict), so both lists have one entry per token.
replaced_closeness_scores = file_weighting.title.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [None]:
# Collect the per-document score lists next to the text and the reference label.
# The four Series are stacked as rows and transposed, giving one row per document.
# NOTE(review): `file_weighting.rate` is read as the reference label, but no `rate`
# column is created in the visible cells - confirm where the labels come from.
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.title, file_weighting.rate]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
# Document score = dot product of sentiment coefficients and tf-idf weights
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
# Positive score -> class 1, otherwise class 0
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
# Binarise the reference label: 1 stays 1, every other value becomes 0
replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

### Predicting and accuracy metrics

In [None]:
# Reference labels and model predictions, one entry per document
y_test = replacement_df['sentiment']
predicted_classes = replacement_df['prediction']

# Rows: true class, columns: predicted class
conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('Confusion Matrix')
display(conf_matrix)

# Same four metrics, evaluated in a fixed order: accuracy, precision, recall, f1
test_scores = tuple(
    metric(y_test, predicted_classes)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('\n \n Scores')
# One row per metric, single column 'scores'
scores = pd.DataFrame(
    {'scores': list(test_scores)},
    index=['accuracy', 'precision', 'recall', 'f1'],
)
display(scores)