In [None]:
import json
import pandas as pd
import numpy as np
import os

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import gensim.downloader

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize,RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive at /content/gdrive (the google.colab module only exists inside Google Colab)
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Settings

## 1 Introduction

A podcast is an audio file containing dialogue between one or more participants that can easily be downloaded and listened to. The term podcast was coined by the columnist and journalist Ben Hammersley in February 2004 as a blend of the terms *'iPod'* and *'broadcasting'*. 
In 2005, the American company *Apple* released a new version of iTunes which provided a centralized platform for podcasts to be uploaded to and downloaded from.


It is a traditional task in A.I. to predict the general tone of a sentence or sequence. For example, a company owner might want to separate positive reviews from negative reviews, but this can be taxing to do by hand. Thus, the owner could decide to have the reviews analyzed by an automated agent. The conventional name for this task is sentiment analysis.


Podcasts are interesting because they come in various shapes and sizes. Some are serious political talk shows while others are completely based on fiction. An interesting task is therefore to detect the general mood throughout a podcast in order to learn more about the general composition of its contents. To that end, this report aims to find sentiment patterns within a single podcast episode and across podcast episodes. 


**- Nice, catchy ending of the introduction**
**- Research question(s)**
The high popularity

RQ: What sentiment patterns can be found in podcast episodes? \\
SubRQ1: What sentiment patterns can be found within a single podcast episode? \\
SubRQ2: What sentiment patterns can be found in podcast episodes in general?


**- Several methods will be examined**

**- Pattern mining**

**- Overview of what will be discussed**


In [None]:
# Set directories of main dataset and metadata
# NOTE(review): these paths are relative ('gdrive/...') while the drive is mounted at the
# absolute path '/content/gdrive' — this presumably relies on the working directory being
# /content; confirm.
directory_main_train = 'gdrive/My Drive/Colab Notebooks/ddp/binary/binary_train.csv'
directory_main_val = 'gdrive/My Drive/Colab Notebooks/ddp/binary/binary_val.csv'
directory_main_full = 'gdrive/My Drive/Colab Notebooks/ddp/binary/binary_full.csv'

# Should the model be saved?
# When save_model is True the trained Word2Vec model is written to "<model_name>.model"
save_model = False
model_name = "test"

### Load the data, filter on English podcasts and insert into dataframe

In [None]:
# Lowercases the text (to normalize) and tokenizes it into punctuation-free word tokens
def clean_data(df):
    r"""Return a copy of ``df`` with lowercased text and a tokenized text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``text``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``text`` is lowercased and ``text_tokenized``
        holds, per row, the list of ``\w+`` matches of the lowercased text
        (so punctuation is dropped from the tokens, not from ``text``).
        All other columns, including any label column, are passed through
        unchanged. Missing text values stay missing in both columns.

    Notes
    -----
    The input frame is left untouched, so re-running a cell that calls this
    function never sees an already-modified frame.
    """
    lowered = df['text'].str.lower()
    # .str.findall applies the same \w+ pattern to every row without a Python-level apply
    return df.assign(text=lowered, text_tokenized=lowered.str.findall(r'\w+'))

In [None]:
# Read the tab-separated training split and normalize/tokenize its text column
df_train = clean_data(pd.read_csv(directory_main_train, sep='\t'))
text = df_train['text_tokenized'].values

# Learn which adjacent word pairs occur often enough to count as a phrase
terms = Phrases(text, min_count=2)
optimized_terms = Phraser(terms)

# Apply the phrase model to the corpus so detected bigrams become single tokens
text_final = optimized_terms[text]
print('Text ready!')

Text ready!


In [None]:
# Build a word2vec model using the vocabulary.
# The corpus is deliberately NOT passed to the constructor: doing so already
# builds the vocabulary and trains once, so the explicit build_vocab/train
# calls below would re-count every word and train a second time.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim >= 4 renamed it to
# `vector_size`) — confirm the installed version.
modelw2v = Word2Vec(size=300)

modelw2v.build_vocab(text_final)
print("Vocab building done!")

modelw2v.train(text_final, total_examples=modelw2v.corpus_count, epochs=30)
print("Training done!")

if save_model:
    model_format = model_name + ".model"

    # Save the current model for use later
    modelw2v.save(model_format)

# The in-memory vectors are exactly what a save/load round trip would give back,
# so they are used directly whether or not the model was saved
word_vectors = modelw2v.wv


# Initiate the K-means algorithm and find n clusters.
# random_state=1 makes the seed explicit: the previous `random_state=True`
# is the integer 1 in Python, so the seed itself is unchanged.
model = KMeans(n_clusters=2, max_iter=10000, random_state=1, n_init=1000).fit(X=word_vectors.vectors.astype('double'))
print('KMeans model ready!')

Vocab building done!
Training done!
KMeans model ready!


In [None]:
# Show the ten vocabulary entries closest to each of the two cluster centres
for _center in model.cluster_centers_[:2]:
    print(word_vectors.similar_by_vector(_center, topn=10, restrict_vocab=None))

[('uk', 0.8665963411331177), ('combat', 0.8614566326141357), ('creatures', 0.853045642375946), ('free_agency', 0.8391095399856567), ('grass', 0.8373022079467773), ('incentives', 0.8346552848815918), ('safety', 0.8314654231071472), ('hollywood', 0.8295350074768066), ('wealth', 0.8280388116836548), ('champagne', 0.8279585838317871)]
[('say_anything', 0.7285462021827698), ('guilty', 0.7254054546356201), ('admit', 0.672544002532959), ('fine', 0.6677496433258057), ('laugh', 0.6655440926551819), ('um', 0.6292319893836975), ('hmm', 0.6247437596321106), ('convinced', 0.6223713755607605), ('uncomfortable', 0.6217079758644104), ('wrong', 0.6200493574142456)]


In [None]:
# Set the cluster positions
# positive_cluster_index selects which of the two K-means clusters is treated as
# "positive"; the remaining one (index 1 - positive_cluster_index) is "negative".
# NOTE(review): the value is hard-coded, presumably chosen by inspecting the
# nearest-word lists of each centre — re-check it whenever the models are retrained.
positive_cluster_index = 0
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[1-positive_cluster_index]

In [None]:
# Create vectors for each word
words = pd.DataFrame(list(word_vectors.vocab.keys()))
words.columns = ['words']
words['vectors'] = words['words'].apply(lambda x: word_vectors[f'{x}'])

# Stack the embeddings once so K-means scores the whole vocabulary in two
# batched calls instead of one predict/transform call per word
word_matrix = np.vstack(words['vectors'].values)

# Assign words to a cluster using Sklearn's predict
words['cluster'] = model.predict(word_matrix)

# Assign words to cluster: +1 for the positive cluster, -1 for the other one
words['cluster_value'] = np.where(words['cluster'] == positive_cluster_index, 1, -1)

# Assign the inverse distance to the closest cluster to each word
# (transform returns the distance to every centre; min over axis 1 keeps the nearest)
words['distance'] = 1 / model.transform(word_matrix).min(axis=1)

# Calculate the sentiment coefficient: signed by cluster, larger when closer to the centre
words['sentiment_coeff'] = words['distance'] * words['cluster_value']

sentiment_dict = dict(zip(words['words'].values, words['sentiment_coeff'].values))

### ---Tf-idf weighting---

In [None]:
# Read the tab-separated validation split and run it through the same cleaning as the training split
df_val = clean_data(pd.read_csv(directory_main_val, sep='\t'))

In [None]:
# Tokenize on whitespace only and skip vector normalization (norm=None)
tfidf = TfidfVectorizer(tokenizer=str.split, norm=None)
tfidf.fit(df_val['text'])

# Sparse document-term matrix for the validation sentences
transformed = tfidf.transform(df_val['text'])

# Term belonging to each column of the matrix, in column order
features = pd.Series(tfidf.get_feature_names())



In [None]:
def create_tfidf_dictionary(x, transformed_file, features):
    """Map each term present in one document to its tf-idf weight.

    Parameters
    ----------
    x : pandas.Series
        One row of the source frame. Only ``x.name`` is used, as the row
        position of the document inside ``transformed_file``.
    transformed_file : scipy sparse matrix
        Document-term matrix with one row per document.
    features : pandas.Series
        Term belonging to each column of ``transformed_file``, in column order.

    Returns
    -------
    dict
        ``{term: weight}`` for the stored (non-zero) entries of the document's
        row; empty when the row has none.
    """
    vector_coo = transformed_file[x.name].tocoo()
    # Resolve column positions to terms in a separate array; the sparse
    # matrix's own integer ``col`` index array is left untouched.
    terms = features.iloc[vector_coo.col].values
    return dict(zip(terms, vector_coo.data))


def replace_tfidf_words(x, transformed_file, features):
    """Return the tf-idf weight of every whitespace-separated token of ``x['text']``.

    The weights are looked up in the per-document dictionary produced by
    ``create_tfidf_dictionary`` and returned in token order; a token missing
    from that dictionary raises ``KeyError``.
    """
    weights = create_tfidf_dictionary(x, transformed_file, features)
    return [weights[f'{token}'] for token in x['text'].split()]

# Replaces a word with its respective sentiment value
def replace_sentiment_words(word, sentiment_dict):
    """Look up ``word`` in ``sentiment_dict``; words without an entry score 0."""
    return sentiment_dict.get(word, 0)

# For every validation sentence build two parallel lists, one value per
# whitespace-separated token: its tf-idf weight and its sentiment coefficient
replaced_tfidf_scores = df_val.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)
replaced_closeness_scores = df_val['text'].apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

# Create new dataframe for final calculations
# (built column-by-column; the series share df_val's index, so rows stay aligned)
df_kmeans = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': df_val['text'],
    'sentiment_score': df_val['sentiment_score'],
})

# Take the dot product to determine if a segment is mostly positive or mostly negative
df_kmeans['prediction'] = df_kmeans.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)

# Predict the label (non-negative score -> 1, negative -> 0) and convert to the same datatype
df_kmeans['prediction'] = (df_kmeans['prediction']>=0).astype('int8')
df_kmeans['sentiment_score'] = df_kmeans['sentiment_score'].astype('int8')

### ---Performance Metrics---

In [None]:
# Ground-truth labels and predictions both live in df_kmeans
# (no frame named `df` is created anywhere in this notebook)
y_true_kmeans = df_kmeans['sentiment_score']
y_pred_kmeans = df_kmeans['prediction']

# Display the final scores
print('Confusion Matrix\n',confusion_matrix(y_true_kmeans,y_pred_kmeans))
print(classification_report(y_true_kmeans, y_pred_kmeans))

Confusion Matrix
 [[233 172]
 [438 449]]
              precision    recall  f1-score   support

           0       0.35      0.58      0.43       405
           1       0.72      0.51      0.60       887

    accuracy                           0.53      1292
   macro avg       0.54      0.54      0.51      1292
weighted avg       0.61      0.53      0.54      1292

