# GOAL:
The goal of this notebook is to fit an LDA topic model on the training data and then apply the fitted model to the test set. This completes the dataset-construction part of the pipeline, allowing us to move on to the models.

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
import sys

drive.mount('/content/drive')
# Put the project folder on the import path, then make it the working directory
# so the relative './data/...' and './models/...' paths used in later cells
# resolve inside it.
sys.path.append('/content/drive/My Drive/Courses/CS247/247 Project')
%cd /content/drive/My\ Drive/Courses/CS247/247 Project

Mounted at /content/drive
/content/drive/My Drive/Courses/CS247/247 Project


In [None]:
# Install the notebook's third-party dependencies into the Colab runtime.
# NOTE(review): only emoji is version-pinned; gensim, pyLDAvis and tomotopy are
# unpinned, so a rerun may pick up different releases than the saved outputs used.
# NOTE(review): emoji and pyLDAvis are not imported in any cell visible here —
# confirm they are still needed.
!pip3 install emoji==0.6.0
!pip install --upgrade gensim
!pip install pyLDAvis
!pip install tomotopy

Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49719 sha256=a5d053e7f0ce42ea9e1190cd1059da87a9a8f714747ae695b04941508757c500
  Stored in directory: /root/.cache/pip/wheels/1b/bd/d9/310c33c45a553798a714e27e3b8395d37128425442b8c78e07
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m14.7 MB/s[0m eta [36m0:0

In [None]:
import nltk
# Fetch the NLTK resources used by the preprocessing cells below:
# 'wordnet' for WordNetLemmatizer and 'stopwords' for the English stop-word list.
# NOTE(review): 'omw-1.4' is presumably a companion resource for WordNet — confirm it is required.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
import pandas as pd

import gensim
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Data Processing

In [None]:
import pandas as pd

# Load the processed toxicity data, tag every row with a running integer
# `annotator_id`, and discard the demographic/session columns that are not
# carried forward. The trimmed frame is saved and its columns displayed.
data = (
    pd.read_csv('./data/nlpositionality_toxicity_processed.csv')
    .assign(annotator_id=lambda frame: range(len(frame)))
    .drop(
        columns=[
            'session_id',
            'age',
            'religion',
            'education',
            'country_longest',
            'country_residence',
            'native_language',
        ]
    )
)
data.to_csv('./data/toxicity_processed.csv', index=False)
data.columns

Index(['action', 'litw', 'dynahate', 'perspective', 'rewire', 'hateroberta',
       'gpt4', 'gender', 'ethnicity', 'annotator_id'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, then split 10% of the remaining
# rows off as the validation set. Both splits use the same fixed seed.
train_df, test_df = train_test_split(data, random_state=8, test_size=0.2)
train_df, val_df = train_test_split(train_df, random_state=8, test_size=0.1)

# Persist each split next to the processed dataset.
for split_path, split_frame in (
    ('./data/toxicity_processed_train.csv', train_df),
    ('./data/toxicity_processed_val.csv', val_df),
    ('./data/toxicity_processed_test.csv', test_df),
):
    split_frame.to_csv(split_path, index=False)

# Report the number of rows in each split.
for split_label, split_frame in (("train: ", train_df), ("val: ", val_df), ("test: ", test_df)):
    print(split_label, len(split_frame))

train:  5107
val:  568
test:  1419


In [None]:
# Texts and human labels used to fit the topic model.
# Only the training split is used: the notebook's goal is to fit LDA on the
# training data and then apply it to held-out data, so validation/test rows
# must not influence the vocabulary or the topics.
tweet_text = train_df['action'].tolist()
# Variable name (sic) kept as-is so any later cell that refers to it still works.
humman_label = train_df['litw'].tolist()

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

def tokenization(tweet_list):
  """Lower-case every text in `tweet_list` and split it into `\\w+` tokens.

  Returns a new list holding one token list per input text, in input order.
  """
  word_tokenizer = RegexpTokenizer(r'\w+')
  return [word_tokenizer.tokenize(text.lower()) for text in tweet_list]

def token_filtering(tweet_tokenized_list):
  """Drop tokens of two characters or fewer and purely numeric tokens.

  Each inner token list is replaced in place; the same outer list object is
  returned so the call can also be used in an assignment.
  """
  for position in range(len(tweet_tokenized_list)):
    tweet_tokenized_list[position] = [
        word
        for word in tweet_tokenized_list[position]
        if not (len(word) <= 2 or word.isnumeric())
    ]
  return tweet_tokenized_list

def lemmatization(tweet_corpus_tokenized):
  """Lemmatize every token with NLTK's WordNetLemmatizer.

  `lemmatize` is called with the token only (no part-of-speech argument).
  Each inner token list is replaced in place and the same outer list is
  returned.
  """
  wordnet = WordNetLemmatizer()
  for position, tokens in enumerate(tweet_corpus_tokenized):
    tweet_corpus_tokenized[position] = [wordnet.lemmatize(word) for word in tokens]
  return tweet_corpus_tokenized

def remove_stop_words(tweet_corpus_tokenized):
  """Remove English stop words and tokens containing non-ASCII characters.

  Each inner token list is replaced in place and the same outer list is
  returned.
  """
  # Build a set once: the membership test below runs for every token, and a
  # set lookup avoids scanning the whole stop-word list each time.
  stop_words = set(stopwords.words('english'))
  for idx, tweet in enumerate(tweet_corpus_tokenized):
    tweet_corpus_tokenized[idx] = [
        token
        for token in tweet
        if not contains_non_ascii(token) and token not in stop_words
    ]
  return tweet_corpus_tokenized

def contains_non_ascii(word):
    """Return True if any character of `word` has a code point of 128 or above."""
    return any(ord(symbol) >= 128 for symbol in word)


In [None]:
# Run the preprocessing pipeline: tokenize, then apply each clean-up stage in
# order (short/numeric token filter -> lemmatizer -> stop-word/non-ASCII filter).
tweet_tokenized = tokenization(tweet_text)
for cleanup_stage in (token_filtering, lemmatization, remove_stop_words):
  tweet_tokenized = cleanup_stage(tweet_tokenized)

# LDA Modeling

## Gensim

In [None]:
import gensim
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus

# Build the vocabulary from the preprocessed tweets.
dictionary = Dictionary(tweet_tokenized)
# Prune the vocabulary by document frequency.
# NOTE(review): per the gensim docs, no_below is an absolute document count and
# no_above a fraction of the corpus — confirm 100 is not too aggressive for a
# corpus of this size.
dictionary.filter_extremes(no_below=100, no_above=0.8)

# One bag-of-words vector per tweet, using the pruned vocabulary.
gensim_corpus = [dictionary.doc2bow(tweet) for tweet in tweet_tokenized]
# NOTE(review): indexing the dictionary once looks like the gensim-tutorial idiom
# for populating `id2token` before it is read on the next line — confirm. `temp`
# itself is not used in any visible cell, and this lookup would fail if the
# pruning above left the vocabulary empty.
temp = dictionary[0]
id2word = dictionary.id2token

In [None]:
import gensim
from gensim import models, test
#from gensim.models import LDAModel
from gensim.test import utils
from gensim.test.utils import datapath

# Fit a 6-topic gensim LDA model on the bag-of-words corpus.
# alpha/eta are set to 'auto' so gensim chooses the priors itself rather than
# using fixed values.
lda_model = models.LdaModel(
    corpus=gensim_corpus,
    id2word=id2word,
    chunksize=2000,
    alpha='auto',
    eta='auto',
    iterations=400,
    num_topics=6,
    passes=20,
    # Fixed seed so the learned topics are reproducible across runs; 8 matches
    # the seed used for the train/val/test split.
    random_state=8,
)

In [None]:
# Persist the fitted LDA model to the project's models folder on Drive.
# The absolute path is passed to save() directly: gensim.test.utils.datapath is
# a helper for locating gensim's own bundled test data, and for an absolute
# path it hands the path back unchanged, so wrapping it added nothing.
train_model_save = "/content/drive/My Drive/Courses/CS247/247 Project/models/lda-6"
lda_model.save(train_model_save)

In [None]:
# Print, for every topic, the 15 words returned by show_topics, pipe-separated.
for topic_no, word_weights in lda_model.show_topics(num_words=15, formatted=False):
    top_words = '|'.join(word for word, weight in word_weights)
    print('Topic: {} \nWords: {}'.format(topic_no, top_words))

Topic: 0 
Words: woman|jew|one|british|believe|back|much|gay|wa|nothing|men|man|world|european|rape
Topic: 1 
Words: always|woman|men|idiot|never|take|willing|victim|someone|wear|le|clothing|today|fault|else
Topic: 2 
Words: wa|like|people|would|want|think|shit|see|anyone|get|got|around|make|could|else
Topic: 3 
Words: people|like|get|muslim|fucking|country|white|immigrant|guy|right|make|time|black|actually|work
Topic: 4 
Words: white|people|know|need|great|really|western|dangerous|say|get|word|animal|meme|way|group
Topic: 5 
Words: people|issue|race|also|least|history|poverty|suffering|perceived|land|infamous|brutality|descended|african|africa


# Tomotopy
https://github.com/bab2min/tomotopy
https://bab2min.github.io/tomotopy/v0.12.6/en/

### LDA

In [None]:
import tomotopy as tp
import random

# Draw a fresh seed on every run; it is embedded in the saved file name below
# so a given model file can be traced back to its seed.
seed = random.randint(1, 300000)

# 10-topic tomotopy LDA over the preprocessed tweets, one document per tweet.
mdl = tp.LDAModel(seed=seed, k=10)
for doc_tokens in tweet_tokenized:
  mdl.add_doc(doc_tokens)

# Train in 10 rounds of 10 iterations, logging the per-word log-likelihood
# after each round.
for round_no in range(10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(round_no * 10, mdl.ll_per_word))

# Show the top 10 words of each topic.
for topic_idx in range(mdl.k):
    print('Top 10 words of topic #{}'.format(topic_idx))
    print(mdl.get_topic_words(topic_idx, top_n=10))

mdl.summary()
model_path = './models/tomotopy-lda-{}-{}.bin'.format(mdl.k, seed)
mdl.save(model_path, True)

  mdl.train(10)


Iteration: 0	Log-likelihood: -6.2816301213533565
Iteration: 10	Log-likelihood: -6.060579213647868
Iteration: 20	Log-likelihood: -6.002724317650043
Iteration: 30	Log-likelihood: -5.974921130481547
Iteration: 40	Log-likelihood: -5.957357149571673
Iteration: 50	Log-likelihood: -5.950974164825691
Iteration: 60	Log-likelihood: -5.9438444192035504
Iteration: 70	Log-likelihood: -5.936114216250063
Iteration: 80	Log-likelihood: -5.932787849036342
Iteration: 90	Log-likelihood: -5.928497375949288
Top 10 words of topic #0
[('fuck', 0.01643390581011772), ('get', 0.01465738657861948), ('people', 0.013658096082508564), ('old', 0.011659513227641582), ('white', 0.010993318632245064), ('make', 0.010993318632245064), ('like', 0.01054918859153986), ('planet', 0.009660929441452026), ('ape', 0.009438864886760712), ('jew', 0.008994734846055508)]
Top 10 words of topic #1
[('people', 0.021816875785589218), ('know', 0.01999225653707981), ('muslim', 0.017374327406287193), ('non', 0.013725091703236103), ('thankfu

### Hierarchical LDA

In [None]:
# Disabled hierarchical-LDA experiment, kept for reference only.
# NOTE(review): the saved output of this cell is "TypeError: 'k' is an invalid
# keyword argument for this function", i.e. tp.HLDAModel does not accept `k`.
# The doubly commented loops below also refer to `mdl` (the flat LDA model)
# rather than `mhdl`. Fix both before re-enabling, or delete the cell.
# mhdl = tp.HLDAModel(k=10)
# for each in tweet_tokenized:
#   mhdl.add_doc(each)

# # for i in range(0, 100, 10):
# #     mdl.train(10)
# #     print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# # for k in range(mdl.k):
# #     print('Top 10 words of topic #{}'.format(k))
# #     print(mdl.get_topic_words(k, top_n=10))

# mhdl.summary()
# mhdl.save('./models/tomotopy-hlda.bin', True)

TypeError: 'k' is an invalid keyword argument for this function

# NEXT: Write New Dataset Files w/ LDA Labels
This assigns a topic to each (artist, lyric) pair in the train and test sets. Afterwards, we write the labeled dataset out to CSV files.

In [None]:
# Preprocess the test documents with the same steps used before fitting the
# topic model: tokenize -> drop short/numeric tokens -> lemmatize -> drop stop words.
# `lyric_tokenization` is not defined in this notebook; the tokenizer defined in
# the preprocessing section is named `tokenization`, so that is called here.
# NOTE(review): Y_test is not defined in any visible cell — confirm where it comes from.
Y_test_tokenized = tokenization(Y_test)
Y_test_tokenized = token_filtering(Y_test_tokenized)
Y_test_tokenized = lemmatization(Y_test_tokenized)
Y_test_tokenized = remove_stop_words(Y_test_tokenized)

In [None]:
# write the training csv
with open('/content/drive/Shareddrives/CS260-Project/data/lda-train-6-updated.csv', 'w') as traindata:
  writer = csv.writer(traindata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  line = 0
  for i, artist in enumerate(X_train):
    curr_doc = dictionary.doc2bow(Y_train_tokenized[i])
    #gensim_corpus = [dictionary.doc2bow(song) for song in Y_train_tokenized]
    probs = lda_model[curr_doc]
    max_prob = -1
    topic_id = -1
    for j in range(len(probs)):
      idx, curr_prob = probs[j]
      if curr_prob > max_prob:
        max_prob = curr_prob
        topic_id = idx
    if line < 4:
      print(probs)
      print(Y_train[i])
      print(topic_id)
    #max_prob = max(probs)
    #topic_id = probs.index(max_prob)
    lyrics = Y_train[i]
    writer.writerow([artist, topic_id, lyrics])
    line += 1

[(0, 0.25901562), (2, 0.03261063), (3, 0.43661842), (4, 0.23224649), (5, 0.034634035)]
Bee Gees - When A Lonely Heart Breaks


I stumble in the night
Never really knew what it would've been like
You're no longer there to break my fall
The heartache over you
I'd give it everything but I couldn't live through
I never saw the signs
You're the last to know when love is blind.

All the tears and the turbulent years
When I would not wait for no-one
Didn't stop and take a look at myself
And see me losing you.

(Chorus)
When a lonely heart breaks
It's the one that forsakes
It's the dream that we stole
And I'm missing you more
Than the fire that will roar
There's a hole in my soul
For you it's good-bye
For me it's to cry
For whom the bell tolls.

Seen you in a magazine
A picture at a party where you shouldn't have been
Hanging on the arm of someone else
I'm still in love with you
Won't you come back to your little boy blue
I've come to feel inside
This precious love was never mine.

Now I know 

In [None]:
# write the test csv
print(Y_test[0])

with open('/content/drive/Shareddrives/CS260-Project/data/lda-test-6-updated.csv', 'w') as testdata:
  writer = csv.writer(testdata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  line = 0
  for i, artist in enumerate(X_test):
    curr_doc = dictionary.doc2bow(Y_test_tokenized[i])
    #gensim_corpus = [dictionary.doc2bow(song) for song in Y_train_tokenized]
    probs = lda_model[curr_doc]
    max_prob = -1
    topic_id = -1
    for j in range(len(probs)):
      idx, curr_prob = probs[j]
      if curr_prob > max_prob:
        max_prob = curr_prob
        topic_id = idx
    if line < 4:
      print(probs)
      print(Y_test[i])
      print(topic_id)
    #max_prob = max(probs)
    #topic_id = probs.index(max_prob)
    lyrics = Y_test[i]
    writer.writerow([artist, topic_id, lyrics])
    line += 1

Your love is so good for me
Your love is so good for me

Every day you're on my mind
Wanna be near you all the time
You made my poem rhyme
And my heart began to sing again
The day your eyes met mine

Your love is so good for me
Your love is so good for me

Two hearts just running free
Like a wind song through the trees
Your love does not possess
It just holds me where I wanna be
With binds of tenderness

Your love is so good for me
Your love is so good for me

Baby, you know your love is so good
It's good
You know it's good

Like a star up in the sky
Burning brightly, you and I
Time will tell if love survives
For we only have today
And today love is alive

Your love is so good for me
Your love is so good for me

It's so good
You know your love is so good,
So good for me...
It's so good

No need to say the words
When you touch me they will be heard
You gave so much to me
And you showed me how to love the way
True love was meant to be

Your love is so good for me
Your love is so good for

In [None]:
# Preprocess the validation documents with the same steps used before fitting
# the topic model: tokenize -> drop short/numeric tokens -> lemmatize -> drop stop words.
# `lyric_tokenization` is not defined in this notebook; the tokenizer defined in
# the preprocessing section is named `tokenization`, so that is called here.
# NOTE(review): Y_val is not defined in any visible cell — confirm where it comes from.
Y_val_tokenized = tokenization(Y_val)
Y_val_tokenized = token_filtering(Y_val_tokenized)
Y_val_tokenized = lemmatization(Y_val_tokenized)
Y_val_tokenized = remove_stop_words(Y_val_tokenized)

In [None]:
# write the validation CSV file
print(Y_val[0])

with open('/content/drive/Shareddrives/CS260-Project/data/lda-val-6.csv', 'w') as valdata:
  writer = csv.writer(valdata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  line = 0
  for i, artist in enumerate(X_val):
    curr_doc = dictionary.doc2bow(Y_val_tokenized[i])
    #gensim_corpus = [dictionary.doc2bow(song) for song in Y_val_tokenized]
    probs = lda_model[curr_doc]
    max_prob = -1
    topic_id = -1
    for j in range(len(probs)):
      idx, curr_prob = probs[j]
      if curr_prob > max_prob:
        max_prob = curr_prob
        topic_id = idx
    if line < 4:
      print(probs)
      print(Y_val[i])
      print(topic_id)
    #max_prob = max(probs)
    #topic_id = probs.index(max_prob)
    lyrics = Y_val[i]
    writer.writerow([artist, topic_id, lyrics])
    line += 1

hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little run leave save like chase train know late late play break string feel heart want feel tell real truth hurt lie worse anymore little know little hold time feel
[(0, 0.10326173), (2, 0.054414734), (3, 0.10834018), (4, 0.72854453)]
hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little run leave save like chase train know late l