# Mount data

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the Drive data folder the working directory so the relative file names
# used by later cells (CSV read and CSV write) resolve inside it.
from os import chdir
chdir("/content/drive/My Drive/raw_data")

# Install dependencies

In [None]:
# Packages needed by the imports below (yellowbrick.text, zeugma.embeddings).
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases.
!pip install yellowbrick
!pip install zeugma

Collecting zeugma
  Downloading zeugma-0.49.tar.gz (9.9 kB)
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 5.1 MB/s 
Building wheels for collected packages: zeugma
  Building wheel for zeugma (setup.py) ... [?25l[?25hdone
  Created wheel for zeugma: filename=zeugma-0.49-py3-none-any.whl size=8823 sha256=d4488b01bcf6df60c275222591867d46ff5f434141c33b897882c467889a83bd
  Stored in directory: /root/.cache/pip/wheels/1d/47/5b/2a59a79706cc9340c72fd6a7bfc20e7ebcab849c88c38fdfa0
Successfully built zeugma
Installing collected packages: tf-estimator-nightly, zeugma
Successfully installed tf-estimator-nightly-2.8.0.dev2021122109 zeugma-0.49


In [None]:
# spaCy provides the pipeline used by the lemmatization step.
# NOTE(review): no cell visible here imports bs4 — confirm it is still required.
!pip install bs4
!pip install spacy



In [None]:
import pandas as pd
import numpy as np
import nltk
import pickle
import matplotlib.pyplot as plt
from yellowbrick.text import FreqDistVisualizer
from zeugma.embeddings import EmbeddingTransformer
import spacy
from spacy import displacy
from sklearn.decomposition import LatentDirichletAllocation
import re
import string
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Load Data

In [None]:
# Load the headerless CSV. Column 0 is used as the label and column 1 as the text.
# Rows with missing text are dropped *before* the label/text series are taken so
# that X_train and y_train always have the same rows as `df`.
df = pd.read_csv('fulltrain.csv', header=None).dropna(subset=[1])
X_train = df[1]
y_train = df[0]

In [None]:
# Drop rows whose text column (column 1) is missing; this mutates df in place.
df.dropna(subset=[1], inplace=True)

In [None]:
# Fetch NLTK resources. 'stopwords' backs the stop-word list loaded further down.
# NOTE(review): wordnet, omw-1.4 and punkt are not referenced by any cell visible
# here (tokenising uses str.split, lemmatising uses spaCy) — confirm they are needed.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Preprocessing pipeline

# Text preprocessing

In [None]:
# Display the loaded frame (rendered because it is the cell's last expression).
df

Unnamed: 0,0,1
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...
...,...,...
48849,4,The ruling Kuomintang (KMT) has claimed owners...
48850,4,The Taipei city government has encouraged the ...
48851,4,President Ma Ying-jeou said Friday that a park...
48852,4,The families of the four people who were kille...


In [None]:
def clean_text(sentence):
    """
    Description: Normalises raw text for tokenisation.

    Steps, in order:
    1. ASCII digit runs are deleted.
    2. Every punctuation character is turned into a space, so words joined by
       punctuation ("Ying-jeou", "low-to-mid-rise") are separated instead of
       being fused into a single token.
    3. The text is lower-cased and leading/trailing whitespace is removed.

    Input:
    * String: The text to clean.

    Return:
    * String: The cleaned text. Runs of spaces are not collapsed; callers that
      split on whitespace are unaffected.
    """

    def remove_integer(text):
        return re.sub(r'[0-9]+', '', text)

    def replace_punctuation(text):
        # ASCII punctuation (including "_", which the regex below treats as a word character)
        spaced = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        # Any remaining non-word, non-space symbol (curly quotes, dashes, ...)
        return re.sub(r'[^\w\s]', ' ', spaced)

    def make_lower_and_strip(text):
        return text.lower().strip()

    # Punctuation must become whitespace BEFORE anything deletes it; otherwise
    # the words on either side are glued together.
    return make_lower_and_strip(replace_punctuation(remove_integer(sentence)))
  

In [None]:
def tokenize(sentence):
    """
    Description: Splits a sentence into words on runs of whitespace.

    Input:
    * String: The sentence to split.

    Return:
    * List: The words in order of appearance (empty list for a blank string).
    """
    return sentence.split()

In [None]:
def remove_stopwords(word_list, stopwords=None, min_length=4):
    """
    Description: Removes stopwords and words shorter than `min_length` characters.

    Input:
    * List: The list of words to be preprocessed.
    * Iterable (optional): The stopwords to discard. When None, only the
      length filter is applied.
    * Int (optional): Minimum number of characters a word needs in order to be
      kept. Defaults to 4.

    Return:
    * List: The surviving words, in their original order.
    """
    # A list of stopwords makes every membership test linear in its length;
    # converting once to a set keeps each lookup constant-time.
    blocked = frozenset(stopwords) if stopwords is not None else frozenset()
    return [word for word in word_list if len(word) >= min_length and word not in blocked]


In [None]:
# Normalise the raw text column, then split each cleaned string into tokens.
df['cleaned'] = df[1].apply(clean_text)
df['cleaned_token'] = df['cleaned'].apply(tokenize)

Using NLTK stopwords

In [None]:
# English stop-word list from the NLTK 'stopwords' corpus downloaded earlier.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [None]:
# Filter each token list; stop_words is forwarded as the second positional argument.
df['cleaned_tokenized_nostop'] = df['cleaned_token'].apply(remove_stopwords, args=(stop_words,))

In [None]:
# Preview the first rows with the cleaned / tokenised / stop-word-filtered columns.
df.head()

Unnamed: 0,0,1,cleaned,cleaned_token,cleaned_tokenized_nostop
0,1,"A little less than a decade ago, hockey fans w...",a little less than a decade ago hockey fans we...,"[a, little, less, than, a, decade, ago, hockey...","[little, less, decade, hockey, fans, blessed, ..."
1,1,The writers of the HBO series The Sopranos too...,the writers of the hbo series the sopranos too...,"[the, writers, of, the, hbo, series, the, sopr...","[writers, series, sopranos, took, another, dar..."
2,1,Despite claims from the TV news outlet to offe...,despite claims from the tv news outlet to offe...,"[despite, claims, from, the, tv, news, outlet,...","[despite, claims, news, outlet, offer, nonstop..."
3,1,After receiving 'subpar' service and experienc...,after receiving subpar service and experiencin...,"[after, receiving, subpar, service, and, exper...","[receiving, subpar, service, experiencing, unu..."
4,1,After watching his beloved Seattle Mariners pr...,after watching his beloved seattle mariners pr...,"[after, watching, his, beloved, seattle, marin...","[watching, beloved, seattle, mariners, prevail..."


Make Bigrams and Lemmatize 

In [None]:
# Fit the phrase (bigram) detector on the token lists that still contain stopwords.
data_words = df.cleaned_token.tolist()
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap the fitted detector in a Phraser; this is the object make_bigrams applies.
bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
def make_bigrams(texts):
    """Pass every tokenised document in `texts` through `bigram_mod` and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

In [None]:
# Apply the bigram model to the stop-word-filtered token lists.
data_words_nostops = df.cleaned_tokenized_nostop.tolist()
data_words_bigrams = make_bigrams(data_words_nostops)

In [None]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenised documents, keeping only tokens with an allowed part of speech.

    Each document's tokens are re-joined with single spaces and processed by
    the notebook-level spaCy pipeline `nlp`, which must be loaded before this
    function is called.

    Input:
    * texts: iterable of documents, each an iterable of string tokens.
    * allowed_postags: coarse part-of-speech tags (compared with `token.pos_`)
      whose tokens are kept. The default list is only read, never modified.

    Return:
    * List of lists: the lemmas of the kept tokens, one inner list per input
      document, in input order.

    Tag reference: https://spacy.io/api/annotation
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream, which avoids the
    # per-call overhead of invoking nlp() once per document; results come
    # back in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [None]:
!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
import spacy
# Load the small English pipeline with the dependency parser and NER disabled;
# lemmatization() only reads token.lemma_ and token.pos_.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Lemmatize the bigram-merged documents, keeping nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Attach the lemma lists to the frame (assigned positionally, one list per row).
df['data_lemmatized'] = data_lemmatized

In [None]:
# Preview the first rows including the new data_lemmatized column.
df.head()

Unnamed: 0,0,1,cleaned,cleaned_token,cleaned_tokenized_nostop,data_lemmatized
0,1,"A little less than a decade ago, hockey fans w...",a little less than a decade ago hockey fans we...,"[a, little, less, than, a, decade, ago, hockey...","[little, less, decade, hockey, fans, blessed, ...","[little, less, decade, hockey, fan, bless, sla..."
1,1,The writers of the HBO series The Sopranos too...,the writers of the hbo series the sopranos too...,"[the, writers, of, the, hbo, series, the, sopr...","[writers, series, sopranos, took, another, dar...","[take, daring, storytelling, step, kill, fan, ..."
2,1,Despite claims from the TV news outlet to offe...,despite claims from the tv news outlet to offe...,"[despite, claims, from, the, tv, news, outlet,...","[despite, claims, news, outlet, offer, nonstop...","[claim, news, outlet, offer, nonstop, news, in..."
3,1,After receiving 'subpar' service and experienc...,after receiving subpar service and experiencin...,"[after, receiving, subpar, service, and, exper...","[receiving, subpar, service, experiencing, unu...","[receive, subpar, service, experience, unusual..."
4,1,After watching his beloved Seattle Mariners pr...,after watching his beloved seattle mariners pr...,"[after, watching, his, beloved, seattle, marin...","[watching, beloved, seattle, mariners, prevail...","[watch, beloved, seattle_mariner, prevail, die..."


# LDA Topic Modelling 

In [None]:
# Array of per-document lemma lists; this is the input to dictionary/corpus construction.
X = df.data_lemmatized.values

In [None]:
# Sequential 70/30 split: the leading 70% of documents form `train`, the remainder `test`.
# NOTE(review): the frame previews suggest rows are ordered by label, so this
# unshuffled split may under-represent later labels in `train` — confirm.
train_size = int(0.7 * len(X))
train = X[:train_size]
test = X[train_size:]
print('Observations: %d' % len(X))
print('Training Observations: %d' % len(train))
print('Testing Observations: %d' % len(test))

Observations: 48854
Training Observations: 34197
Testing Observations: 14657


Data transformation

In [None]:
# Token -> integer id mapping learned from the training documents
id2word = corpora.Dictionary(train)
# The tokenised documents are kept under `texts` for the coherence computation
texts = train
# Bag-of-words form: each document becomes a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))
# Show the first document's bag-of-words
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 2), (18, 1), (19, 1), (20, 3), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]]


Building the LDA model

In [None]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [None]:
# Print each topic as (topic_id, 'weight*"term" + ...') with its highest-weighted terms.
print(lda_model.print_topics())

[(0, '0.023*"money" + 0.016*"market" + 0.015*"company" + 0.014*"bank" + 0.014*"year" + 0.013*"dollar" + 0.013*"price" + 0.013*"gold" + 0.012*"financial" + 0.012*"percent"'), (1, '0.022*"news" + 0.017*"show" + 0.016*"video" + 0.015*"medium" + 0.012*"fake" + 0.011*"film" + 0.010*"narrative" + 0.009*"event" + 0.009*"edit" + 0.008*"watch"'), (2, '0.017*"world" + 0.012*"human" + 0.012*"global" + 0.009*"create" + 0.009*"technology" + 0.009*"control" + 0.008*"system" + 0.008*"power" + 0.006*"science" + 0.006*"population"'), (3, '0.022*"article" + 0.012*"report" + 0.010*"military" + 0.010*"government" + 0.010*"information" + 0.009*"also" + 0.009*"attack" + 0.008*"use" + 0.007*"terrorist" + 0.006*"say"'), (4, '0.049*"food" + 0.038*"water" + 0.013*"plant" + 0.011*"product" + 0.009*"use" + 0.009*"natural" + 0.009*"animal" + 0.009*"chemical" + 0.007*"find" + 0.007*"farm"'), (5, '0.018*"vaccine" + 0.014*"health" + 0.012*"study" + 0.010*"child" + 0.008*"cause" + 0.008*"drug" + 0.008*"case" + 0.008*"

Coherence score 

In [None]:
# Score the baseline model with the c_v coherence measure, using the training
# texts and the dictionary built from them.
coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=texts, 
                                     dictionary=id2word, 
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nBaseline Coherence Score: ', coherence_lda)


Baseline Coherence Score:  0.43681715033516433


In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b, coherence_texts=None):
    """Train an LDA model for one hyper-parameter combination and return its c_v coherence.

    Input:
    * corpus: bag-of-words corpus used to train the model.
    * dictionary: gensim Dictionary matching `corpus`; used both as the model's
      id2word mapping and for the coherence computation.
    * k: number of topics.
    * a: value passed to LdaMulticore as `alpha`.
    * b: value passed to LdaMulticore as `eta`.
    * coherence_texts (optional): tokenised documents for the coherence
      computation. When None, the notebook-level variable `texts` is used,
      which must then correspond to `corpus` / `dictionary`.

    Return:
    * Float: the c_v coherence score of the trained model.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           workers=3, # set to number of cores - 1
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    reference_texts = texts if coherence_texts is None else coherence_texts

    # Use the dictionary that was passed in (previously the global `id2word`
    # was used here, silently ignoring the argument).
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=reference_texts,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

Hyperparameter tuning

In [None]:
# Accumulator for the tuning grid: one independent, initially empty list per column.
model_results = {column: [] for column in ('Topics', 'Alpha', 'Eta', 'Coherence')}

In [None]:
# Candidate topic counts for the grid: 2, 3, ..., 10 (the upper bound is exclusive).
min_topics, max_topics, step_size = 2, 11, 1
topics_range = range(min_topics, max_topics, step_size)

In [None]:
# Candidate alpha values: an evenly spaced numeric grid followed by the two named settings.
alpha = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']

In [None]:
# Candidate eta values: an evenly spaced numeric grid followed by the named 'symmetric' setting.
eta = [*np.arange(0.01, 1, 0.3), 'symmetric']

In [None]:
# if 1 == 1:
#     pbar = tqdm.tqdm(total=10)
    
#     for k in topics_range:
#         for a in alpha:
#             for b in eta:
#                 cv = compute_coherence_values(corpus=corpus, 
#                                               dictionary=id2word, 
#                                               k=k, 
#                                               a=a, 
#                                               b=b)
#                 # Save the model results
#                 model_results['Topics'].append(k)
#                 model_results['Alpha'].append(a)
#                 model_results['Eta'].append(b)
#                 model_results['Coherence'].append(cv)

#                 pbar.update(1)
#     pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
#     pbar.close()

In [None]:
# Display the collected tuning results (stays empty while the tuning loop is disabled).
model_results

{'Alpha': [], 'Coherence': [], 'Eta': [], 'Topics': []}

# Building the final LDA model

In [None]:
# Hyper-parameters for the final model: topic count, alpha and eta.
# NOTE(review): the tuning loop in this notebook is commented out and
# model_results is empty, so these values are not derived from results shown
# here — record where they come from.
k, a, b = 6, 0.01, 'symmetric'

In [None]:
# Rebuild the vocabulary from every document (not just the training slice)
id2word = corpora.Dictionary(X)
# Keep the tokenised documents under `texts`
texts = X
# Bag-of-words form: each document becomes a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Train the final LDA model on the corpus built from all documents, using the
# selected k (topics), a (alpha) and b (eta). This rebinds `lda_model`.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       workers=3,
                                       num_topics=k, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       alpha=a,
                                       eta=b)

In [None]:
# pyLDAvis is needed by the visualisation cell that follows.
# NOTE(review): version is unpinned, so a fresh run may resolve a different release.
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=12c4f5ea2ba4990b3ccac0d8c563b4eba093b3c22417d56d4c22f4048f0a023e
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.17 pyLDAvis-3.3.1


In [None]:
import pyLDAvis.gensim_models
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation for the final model; it is displayed as the cell's last expression.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

  from collections import Iterable
  by='saliency', ascending=False).head(R).drop('saliency', 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  token_table['Freq'] = token_table['Freq'].round()


# Engineering the features

In [None]:
def lda_make_col(df, lda_model, corpus, num_topics):
    """Add one `prob_topic_<n>` column per topic to `df` with each document's topic weight.

    Input:
    * df: DataFrame whose rows correspond positionally to the entries of
      `corpus` (row i <-> corpus[i]). It is modified in place.
    * lda_model: object exposing `get_document_topics(bow, minimum_probability=...)`
      that returns (topic_id, probability) pairs for one document.
    * corpus: indexable collection of bag-of-words documents.
    * num_topics: number of topics; the new columns are named
      prob_topic_1 ... prob_topic_<num_topics>.

    Return:
    * DataFrame: the same `df` object, now carrying the new columns.
    """
    topic_rows = []
    for row in range(len(df)):
        pairs = lda_model.get_document_topics(corpus[row], minimum_probability=0.0)
        # Look weights up by topic id instead of by list position, so a topic
        # missing from the returned pairs yields 0.0 rather than shifting the
        # remaining values or raising IndexError.
        weight_by_topic = dict(pairs)
        topic_rows.append([weight_by_topic.get(topic, 0.0) for topic in range(num_topics)])

    # No debug printing here: dumping every vector floods the notebook output.
    for topic in range(num_topics):
        df["prob_topic_" + str(topic + 1)] = [weights[topic] for weights in topic_rows]

    return df

In [None]:
# Preview the frame before the topic-probability columns are added.
df.head()

Unnamed: 0,0,1,cleaned,cleaned_token,cleaned_tokenized_nostop,data_lemmatized
0,1,"A little less than a decade ago, hockey fans w...",a little less than a decade ago hockey fans we...,"[a, little, less, than, a, decade, ago, hockey...","[little, less, decade, hockey, fans, blessed, ...","[little, less, decade, hockey, fan, bless, sla..."
1,1,The writers of the HBO series The Sopranos too...,the writers of the hbo series the sopranos too...,"[the, writers, of, the, hbo, series, the, sopr...","[writers, series, sopranos, took, another, dar...","[take, daring, storytelling, step, kill, fan, ..."
2,1,Despite claims from the TV news outlet to offe...,despite claims from the tv news outlet to offe...,"[despite, claims, from, the, tv, news, outlet,...","[despite, claims, news, outlet, offer, nonstop...","[claim, news, outlet, offer, nonstop, news, in..."
3,1,After receiving 'subpar' service and experienc...,after receiving subpar service and experiencin...,"[after, receiving, subpar, service, and, exper...","[receiving, subpar, service, experiencing, unu...","[receive, subpar, service, experience, unusual..."
4,1,After watching his beloved Seattle Mariners pr...,after watching his beloved seattle mariners pr...,"[after, watching, his, beloved, seattle, marin...","[watching, beloved, seattle, mariners, prevail...","[watch, beloved, seattle_mariner, prevail, die..."


In [None]:
# Sanity check that df is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [None]:
# Add one prob_topic_* column per topic (k of them) to df in place; the returned
# frame is displayed because the call is the cell's last expression.
lda_make_col(df, lda_model, corpus, k)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Unnamed: 0,0,1,cleaned,cleaned_token,cleaned_tokenized_nostop,data_lemmatized,prob_topic_1,prob_topic_2,prob_topic_3,prob_topic_4,prob_topic_5,prob_topic_6
0,1,"A little less than a decade ago, hockey fans w...",a little less than a decade ago hockey fans we...,"[a, little, less, than, a, decade, ago, hockey...","[little, less, decade, hockey, fans, blessed, ...","[little, less, decade, hockey, fan, bless, sla...",0.020674,0.000154,0.085877,0.892988,0.000154,0.000154
1,1,The writers of the HBO series The Sopranos too...,the writers of the hbo series the sopranos too...,"[the, writers, of, the, hbo, series, the, sopr...","[writers, series, sopranos, took, another, dar...","[take, daring, storytelling, step, kill, fan, ...",0.035763,0.000238,0.000238,0.904185,0.059339,0.000238
2,1,Despite claims from the TV news outlet to offe...,despite claims from the tv news outlet to offe...,"[despite, claims, from, the, tv, news, outlet,...","[despite, claims, news, outlet, offer, nonstop...","[claim, news, outlet, offer, nonstop, news, in...",0.176217,0.038097,0.124859,0.518177,0.124836,0.017814
3,1,After receiving 'subpar' service and experienc...,after receiving subpar service and experiencin...,"[after, receiving, subpar, service, and, exper...","[receiving, subpar, service, experiencing, unu...","[receive, subpar, service, experience, unusual...",0.256079,0.127486,0.000034,0.484533,0.000034,0.131833
4,1,After watching his beloved Seattle Mariners pr...,after watching his beloved seattle mariners pr...,"[after, watching, his, beloved, seattle, marin...","[watching, beloved, seattle, mariners, prevail...","[watch, beloved, seattle_mariner, prevail, die...",0.000169,0.000169,0.180893,0.818430,0.000169,0.000169
...,...,...,...,...,...,...,...,...,...,...,...,...
48849,4,The ruling Kuomintang (KMT) has claimed owners...,the ruling kuomintang kmt has claimed ownershi...,"[the, ruling, kuomintang, kmt, has, claimed, o...","[ruling, kuomintang, claimed, ownership, slush...","[claim, ownership, day, frozen, money, collect...",0.012660,0.000048,0.775074,0.056300,0.000048,0.155870
48850,4,The Taipei city government has encouraged the ...,the taipei city government has encouraged the ...,"[the, taipei, city, government, has, encourage...","[taipei, city, government, encouraged, rebuild...","[government, encourage, rebuild, lowtomidrise,...",0.166410,0.174087,0.220678,0.108384,0.000078,0.330363
48851,4,President Ma Ying-jeou said Friday that a park...,president ma yingjeou said friday that a park ...,"[president, ma, yingjeou, said, friday, that, ...","[president, yingjeou, said, friday, park, buil...","[say, build, commemorate, japanese, civil, eng...",0.121529,0.114347,0.243834,0.000073,0.000073,0.520144
48852,4,The families of the four people who were kille...,the families of the four people who were kille...,"[the, families, of, the, four, people, who, we...","[families, four, people, killed, landslide, na...","[family, people, kill, last, entitle, national...",0.153156,0.000105,0.480281,0.000105,0.182541,0.183812


In [None]:
# Write the frame, including the topic-probability columns, to the working directory without the index.
# NOTE(review): the token-list columns will be stored as text — confirm downstream readers expect that.
df.to_csv('data_with_topicmodellingfeatures.csv', index=False)

In [None]:
# df.to_csv('/content/drive/My Drive/raw_data/data_with_topicmodellingfeatures.csv')