# Tweets preprocessing

In [None]:
# Data preprocessing for Crisis dataset v1.0

# This notebook was created in Google Colab; please change the paths to point to your own files.
# Every preprocessing method is separate, so you can choose the ones that you need.
# Write me a note if something goes wrong or if you need additional preprocessing methods.

# Enjoy!

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import unicodedata
import re
import contractions

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Each source file holds the tweets of one crisis type. The paths point to the
# author's Google Drive, so adjust them to your own copy of the data.
path1 = '/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12/earthquakes_eyewitness_crowdflower_2000.tsv'
path2 = '/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12/floods_eyewitness_crowdflower_2000.tsv'
path3 = '/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12/forestfires_eyewitness_crowdflower_2000.tsv'
path4 = '/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12/hurricanes_eyewitness_crowdflower_2000.tsv'

# Fixed seed so that the shuffle below gives the same row order on every run.
SHUFFLE_SEED = 42


def load_topic_tweets(path, topic):
    """Load one tab-separated tweet file and label every row with its topic.

    Parameters
    ----------
    path : str
        Path to a TSV file that contains a 'text' column.
    topic : str
        Value written to the 'Topics' column of every row.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'Tweets' and 'Topics'.
    """
    raw = pd.read_csv(path, sep="\t")
    # rename/assign return new frames, so no slice of `raw` is mutated in place
    # (the in-place rename on a column slice can trigger SettingWithCopyWarning).
    return raw[['text']].rename(columns={'text': 'Tweets'}).assign(Topics=topic)


# Give the topic
tweets_df1 = load_topic_tweets(path1, 'earthquakes')
tweets_df2 = load_topic_tweets(path2, 'floods')
tweets_df3 = load_topic_tweets(path3, 'forestfires')
tweets_df4 = load_topic_tweets(path4, 'hurricanes')

# Concatenate (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0)
tweets_df = pd.concat(
    [tweets_df1, tweets_df2, tweets_df3, tweets_df4], ignore_index=True
)

# Shuffle reproducibly
tweets_df = tweets_df.sample(frac=1.0, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# Define the partition of train, dev and test set, make sure they sum up to 1
PT_TRAIN = 0.8
PT_DEV = 0.1
PT_TEST = 0.1

# Fixed seed so that the shuffle after partitioning is reproducible.
PARTITION_SEED = 42


def assign_partitions(df, pt_train, pt_dev, seed=None):
    """Label the rows of ``df`` as 'train', 'dev' or 'test' and shuffle them.

    The first ``pt_train`` share of the rows (by position) becomes 'train', the
    next ``pt_dev`` share becomes 'dev' and all remaining rows become 'test'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; it is not modified.
    pt_train, pt_dev : float
        Shares of the rows used for the train and dev partitions.
    seed : int, optional
        Seed for the final shuffle; ``None`` gives a different order each run.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with an extra 'partition' column and a fresh
        0..n-1 index.
    """
    n_rows = len(df)
    # Integer, position-based boundaries. The previous version sliced .loc with
    # float labels, whose end points are inclusive, so each boundary row was
    # assigned twice and the counts only came out right because later
    # assignments overwrote earlier ones.
    train_end = round(n_rows * pt_train)
    dev_end = round(n_rows * (pt_train + pt_dev))
    labels = (
        ['train'] * train_end
        + ['dev'] * (dev_end - train_end)
        + ['test'] * (n_rows - dev_end)
    )
    return (
        df.assign(partition=labels)
        .sample(frac=1.0, random_state=seed)
        .reset_index(drop=True)
    )


assert abs(PT_TRAIN + PT_DEV + PT_TEST - 1.0) < 1e-9, "partition shares must sum up to 1"

# Define train, dev and test set, then shuffle again
tweets_df = assign_partitions(tweets_df, PT_TRAIN, PT_DEV, seed=PARTITION_SEED)

In [None]:
# See if the partition is correct
tweets_df.partition.value_counts()

train    6400
dev       800
test      800
Name: partition, dtype: int64

In [None]:
# Change the order of the df

# Tweets first, then partition, then Topics. This is also the column order that is
# written (without a header) to corpus.tsv further down.
# NOTE(review): presumably this is the layout OCTIS expects for a custom dataset — confirm.
order = ['Tweets', 'partition', 'Topics']
tweets_df = tweets_df[order]

In [None]:
# Display the merged dataframe (comment this line out to hide it)
tweets_df

Unnamed: 0,Tweets,partition,Topics
0,Impact small and not until May -cyclone Debbie...,train,hurricanes
1,An Areal Flood WARNING is in effect from Green...,train,floods
2,Rheal_talk you're going to appreciate my late...,train,floods
3,Was there an earthquake? Lol my cousin asked m...,train,earthquakes
4,via *npr: 'Public Calamity' As California Wild...,train,forestfires
...,...,...,...
7995,BrennanKnighton Oh wow and the roads are star...,train,floods
7996,Going out to check on my host's other farm and...,test,hurricanes
7997,THIS weekend! Escape Cyclone Debbie and get o...,train,hurricanes
7998,https://t.co/AEMEPfjwWR by Norton Identity Sa...,train,forestfires


In [None]:
# Make sure you run this one before other methods!

def to_lowercase(text):
    """Return a copy of ``text`` in which every cased character is lower case."""
    lowered = text.lower()
    return lowered

#testing the function on a single sample for explanation
print(to_lowercase('IN CHINESE WE CALL CAPITALIZATION AS BIG WRITTING, IN GERMAN AS WELL.'))

#converting every row of the column into lower case 
tweets_df.Tweets=tweets_df.Tweets.apply(to_lowercase)

in chinese we call capitalization as big writting, in german as well.


In [None]:
def standardize_accented_chars(text):
    """Replace accented characters by their plain ASCII base characters.

    The text is NFKD-normalised, which splits an accented character into its
    base character plus combining marks. Encoding to ASCII with errors ignored
    then drops the marks, along with any other character that has no ASCII form.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

#testing the function on a single sample for explanation
print(standardize_accented_chars('sómě words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.'))

#standardizing accented characters for every row
tweets_df.Tweets=tweets_df.Tweets.apply(standardize_accented_chars)

some words such as resume, cafe, protest, divorce, coordinate, expose, latte.


In [None]:
# Not a method, just to check how many tweets contain urls

def get_number_of_urls(documents):
    """Print the percentage of documents that contain the substring 'http'.

    Parameters
    ----------
    documents : pandas.Series of str
        The texts to inspect.

    Returns
    -------
    None
        The result is only printed.
    """
    if len(documents) == 0:
        # Nothing to count; also avoids dividing by zero.
        share = 0.0
    else:
        # A substring test is used because str.find returns 0 for a text that
        # starts with 'http', so a `find(...) > 0` check skips those documents.
        share = documents.apply(lambda doc: 'http' in doc).mean() * 100
    print("{:.2f}% of documents contain urls".format(share))

# Passing the 'Tweets' column of the dataframe as the argument.
# The function prints its result itself and returns None, so wrapping the call
# in print() would only add a stray "None" line to the output.
get_number_of_urls(tweets_df.Tweets)

57.98% of documents contain urls
None


In [None]:
def remove_url(text):
    """Delete every ``http:``/``https:`` link from ``text``.

    A link is taken to run from the scheme up to the next whitespace character;
    the whitespace around it is left untouched.
    """
    url_pattern = re.compile(r'https?:\S*')
    return url_pattern.sub('', text)

#testing the function on a single sample for explanation
print(remove_url('using https://www.google.com/ as an example'))

#removing urls from every row
tweets_df.Tweets=tweets_df.Tweets.apply(remove_url)

using  as an example


In [None]:
def expand_contractions(text):
    """Expand contractions word by word.

    ``text`` is split on whitespace, each word is passed through
    ``contractions.fix`` and the results are joined with single spaces.
    """
    return ' '.join(contractions.fix(word) for word in text.split())

#testing the function on a single sample for explanation
print(expand_contractions("Don't is the same as do not"))

#expanding contractions for every row
tweets_df.Tweets=tweets_df.Tweets.apply(expand_contractions)

Do not is the same as do not


In [None]:
def remove_mentions_and_tags(text):
    """Strip @mentions and #hashtags from ``text``.

    Mentions are removed first, then hashtags. Each match runs from the marker
    character up to the next whitespace.
    """
    for pattern in (r'@\S*', r'#\S*'):
        text = re.sub(pattern, '', text)
    return text

#testing the function on a single sample for explanation
print(remove_mentions_and_tags('Some random @abc and #def'))

#removing mentions and tags from every row
tweets_df.Tweets=tweets_df.Tweets.apply(remove_mentions_and_tags)

Some random  and 


In [None]:
def keep_only_alphabet(text):
    """Replace every character that is not an ASCII letter with one space."""
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

#testing the function on a single sample for explanation
print(keep_only_alphabet('Just a bit more $$processing required.Just a bit!!!'))

#for all the rows
tweets_df.Tweets=tweets_df.Tweets.apply(keep_only_alphabet)

Just a bit more   processing required Just a bit   


In [None]:
def remove_stop_words(text):
    """Return ``text`` without English stop words.

    The text is tokenized with ``word_tokenize``; every token that is not in
    NLTK's English stop word list is kept and the kept tokens are joined with
    single spaces. The comparison is case-sensitive.
    """
    stop_words = _get_stop_words('english')
    kept = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(kept)


# Stop word sets that were already loaded, keyed by language.
_STOP_WORD_CACHE = {}


def _get_stop_words(language):
    """Return the NLTK stop words of ``language`` as a frozenset, loading them once."""
    # stopwords.words() was previously called for every single token and then
    # scanned linearly; caching a set makes each lookup O(1).
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]

#testing the function on a single sample for explanation
print(remove_stop_words('Test this text to see which are stop words.'))

#removing stop-words from every row
tweets_df.Tweets=tweets_df.Tweets.apply(remove_stop_words)

Test text see stop words .


In [None]:
def lemmatize(text):
    """Lemmatize every token of ``text``.

    The text is tokenized with ``word_tokenize``, each token is passed through
    a ``WordNetLemmatizer`` and the results are joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in word_tokenize(text))

#testing the function on a single sample for explanation
print(lemmatize('apples, bananas and pears are common fruits that are eaten by humans.'))

#Performing lemmatization on every row
tweets_df.Tweets=tweets_df.Tweets.apply(lemmatize)

apple , banana and pear are common fruit that are eaten by human .


In [None]:
# Delete blank rows to fit OCTIS
# reset_index(drop=True) renumbers the rows without first turning the old index
# into an 'index' column, so nothing has to be deleted afterwards and the
# filtered frame is never modified in place.
tweets_df = tweets_df[tweets_df['Tweets'].str.len() >= 1].reset_index(drop=True)

# (OPTIONAL) Delete near-empty tweets (e.g. tweets with fewer than 3 words).
# (Discuss) Is there a better way to keep only tweets with at least 3 words than
# thresholding the string length? Check some word tokenization methods.
#tweets_df = tweets_df[tweets_df['Tweets'].str.len()>=20]

In [None]:
# Display the dataframe after preprocessing (comment this line out to hide it)
tweets_df

Unnamed: 0,Tweets,partition,Topics
0,creeping wildfire smoke threat california wine...,train,forestfires
1,thought affected cyclone debbie last day corn ...,train,hurricanes
2,roshayyred know lmao totally forgot though dam...,test,floods
3,keeping aussi peep prayer woth storm still bla...,train,hurricanes
4,queenslanders beautiful twitter amp please sta...,train,hurricanes
...,...,...,...
7984,basement flood lost mine kid personal item alo...,train,floods
7985,nwsbrownsville friend sent video get home,dev,floods
7986,local agency work together wildfire training c...,train,forestfires
7987,national art centre collect donation support v...,train,forestfires


In [None]:
tweets_df.shape[0]

7989

In [None]:
# Create vocabulary.txt

def df_to_vocab(df):
    """Collect the distinct tokens of ``df.Tweets`` in order of first appearance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Tweets' column of strings.

    Returns
    -------
    list of str
        Every token produced by ``word_tokenize`` exactly once, ordered by
        where it first occurs.
    """
    # A dict keeps insertion order and gives O(1) membership tests; the former
    # `word not in word_list` check made the function quadratic.
    vocabulary = {}
    # Iterating over the column is independent of the index labels, whereas
    # df.Tweets[i] only worked for a default 0..n-1 index.
    for tweet in df.Tweets:
        for word in word_tokenize(tweet):
            vocabulary.setdefault(word, None)
    return list(vocabulary)

word_list = df_to_vocab(tweets_df)
len(word_list)

9330

In [None]:
def list_to_vocab(word_list, path="/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12/vocabulary.txt"):
    """Write ``word_list`` to a text file, one word per line.

    Parameters
    ----------
    word_list : list of str
        The words to write, in the order given.
    path : str, optional
        Target file; defaults to the vocabulary.txt of the crisis dataset
        folder. An existing file is overwritten.

    Every word is followed by a CRLF line terminator.
    """
    # newline='' switches off newline translation, so the explicit '\r\n' is
    # written unchanged on every platform instead of becoming '\r\r\n' on Windows.
    # The `with` block closes the file even if a write fails.
    with open(path, 'w', newline='') as txt:
        for word in word_list:
            txt.write(word)
            txt.write('\r\n')

list_to_vocab(word_list)

In [None]:
# Write the preprocessed frame as a tab-separated file without the index and
# without a header row. The file goes into the same folder that is later loaded
# with dataset.load_custom_dataset_from_folder.
tweets_df.to_csv('/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12/corpus.tsv',\
                 sep = '\t', index=False, header = None)

In [None]:
#stop here

# OCTIS Initiation

In [None]:
!pip install octis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import octis
from octis.models.LDA import LDA
from octis.models.NMF import NMF
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

# OCTIS -- LDA with crisis dataset

With our own data preprocessing method

In [None]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12")

In [None]:
# LDA training is randomly initialised; a fixed random_state makes the topics
# and the scores computed from them reproducible between runs.
model = LDA(num_topics=4, random_state=42)  # Create model
output = model.train_model(dataset)  # Train the model

In [None]:
# See what are in the model output

print(*list(output.keys()), sep="\n")

topic-word-matrix
topics
topic-document-matrix
test-topic-document-matrix


In [None]:
for t in output['topics'][:]:
  print(" ".join(t))

earthquake felt wildfire feel smoke like time sf magnitude twitter
flood wildfire mcmurray fort rain massive area california debbie heavy
debbie cyclone flood earthquake queensland school ex flash go today
california wildfire flood fire debbie northern cyclone least people home


In [None]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output)
print("Topic diversity: " + str(topic_diversity_score))

npmi_score = npmi.score(output)
print("Coherence: " + str(npmi_score))

Topic diversity: 0.775
Coherence: -0.091625380409654


With OCTIS' data preprocessing method (to do)

In [None]:
'''
import os
import string
from octis.preprocessing.preprocessing import Preprocessing
os.chdir(os.path.pardir)

# Initialize preprocessing
preprocessor = Preprocessing(vocabulary=None, max_features=None,
                             remove_punctuation=True, punctuation=string.punctuation,
                             lemmatize=True, stopword_list='english',
                             min_chars=1, min_words_docs=0)
# preprocess
dataset = preprocessor.preprocess_dataset(documents_path=r'..\corpus.txt', labels_path=r'..\labels.txt')

# save the preprocessed dataset
dataset.save('hello_dataset')
'''

"\nimport os\nimport string\nfrom octis.preprocessing.preprocessing import Preprocessing\nos.chdir(os.path.pardir)\n\n# Initialize preprocessing\npreprocessor = Preprocessing(vocabulary=None, max_features=None,\n                             remove_punctuation=True, punctuation=string.punctuation,\n                             lemmatize=True, stopword_list='english',\n                             min_chars=1, min_words_docs=0)\n# preprocess\ndataset = preprocessor.preprocess_dataset(documents_path=r'..\\corpus.txt', labels_path=r'..\\labels.txt')\n\n# save the preprocessed dataset\ndataset.save('hello_dataset')\n"

# OCTIS -- LDA with 20News dataset

In [None]:
# Define dataset
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

In [None]:
# Create Model
# A fixed random_state makes the training, and therefore the scores below,
# reproducible between runs.
model = LDA(num_topics=20, alpha=0.1, random_state=42)

# Train the model
output = model.train_model(dataset)

In [None]:
for t in output['topics'][:5]:
  print(" ".join(t))

work window system problem drive run disk modem cable set
list widget user include server send mail information application window
book post read write font find mail article question text
car launch space power sell make good chip satellite sale
year time day doctor water drug start disease good health


In [None]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output)
print("Topic diversity: " + str(topic_diversity_score))

npmi_score = npmi.score(output)
print("Coherence: " + str(npmi_score))

# Note: these scores differ from the ones in the OCTIS example notebook (LDA training is stochastic, so some run-to-run variation is expected).

Topic diversity: 0.71
Coherence: 0.06612433724770478


# OCTIS -- NMF with crisis dataset

In [None]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("/content/drive/MyDrive/SS_2022_Praktikum/Crisis Dataset/Dataset_12")

In [None]:
# A fixed random_state makes the factorisation, and the scores computed from
# it, reproducible between runs.
model = NMF(num_topics=4, random_state=42)  # Create model
output = model.train_model(dataset)  # Train the model

In [None]:
for t in output['topics'][:]:
  print(" ".join(t))

wildfire california fort mcmurray northern canada canadian alberta smoke via
debbie cyclone ex queensland school tropical australia closed south flooding
earthquake felt flood feel california fire today one like km


In [None]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output)
print("Topic diversity: " + str(topic_diversity_score))

npmi_score = npmi.score(output)
print("Coherence: " + str(npmi_score))

Topic diversity: 0.95
Coherence: 0.07544957013802908


# OCTIS -- NMF with 20News dataset

In [None]:
# Define dataset
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

In [None]:
# Create Model
# A fixed random_state makes the training, and therefore the scores below,
# reproducible between runs.
model = NMF(num_topics=20, random_state=42)

# Train the model
output = model.train_model(dataset)

In [None]:
for t in output['topics'][:5]:
  print(" ".join(t))

file system version software program display application set user read
drive disk problem system hard administration question program government official
key privacy internet encryption make information message post computer user
window widget application server subject resource motif set include run
graphic image package include send mail server object support datum


In [None]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output)
print("Topic diversity: " + str(topic_diversity_score))

npmi_score = npmi.score(output)
print("Coherence: " + str(npmi_score))

Topic diversity: 0.705
Coherence: 0.08307011633933507


# NMF with sklearn (Very possibly give up)

In [None]:
# NOTE(review): this always raises AssertionError. It looks like a deliberate
# stop so that "Run All" does not continue into the experimental cells below —
# confirm, and consider moving those cells to a separate notebook.
assert False

AssertionError: ignored

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
#download the datasets
groups = fetch_20newsgroups()

In [None]:
#type:sklearn.utils.Bunch
groups.keys()

In [None]:
#20 topics
groups['target_names']

In [None]:
#To which group the news belongs
groups.target

In [None]:
#It can be seen that the target ranges from 0 to 19
import numpy as np
np.unique(groups.target)

In [None]:
#The first news
groups.data[0]

In [None]:
#The first news belongs to the 8th topic
groups.target[0]

In [None]:
#The 8th topic is ...
groups.target_names[groups.target[0]]

In [None]:
#How long is the first news
len(groups.data[0])

In [None]:
#And the second
len(groups.data[1])

Visualization

In [None]:
import seaborn as sns
# distplot is deprecated in seaborn; histplot with a KDE overlay and density
# scaling is its replacement.
sns.histplot(groups.target, kde=True, stat='density')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#delete stop words and limit the num of features to 500, CountVectorizer is used to extract features
cv = CountVectorizer(stop_words='english',max_features=500)
transformed = cv.fit_transform(groups.data)
#see what features are in the dataset
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an array, which .tolist() turns back into a plain list for printing.
print(cv.get_feature_names_out().tolist())

In [None]:
#shape = Documents * Features
print(np.shape(transformed))
print('\n')

#see if the feature appears in sentence
transformed.toarray()

In [None]:
#How many times does a feature appear
# Summing the sparse matrix directly avoids building the dense
# documents x features array just to add up its columns.
feature_counts = np.asarray(transformed.sum(axis=0)).ravel()
# distplot is deprecated in seaborn; histplot with a KDE overlay replaces it.
sns.histplot(np.log(feature_counts), kde=True, stat='density')
plt.xlabel('Log Count')
plt.ylabel('Frequency')
plt.title('Distribution plot of 500 word counts')
plt.show()

Data Preprocessing

In [None]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

In [None]:
def letters_only(astr):
    """Tell whether ``astr`` is non-empty and made up of alphabetic characters only."""
    only_letters = astr.isalpha()
    return only_letters

In [None]:
import nltk
#name dict
nltk.download('names')
cleaned=[]
all_names = set(names.words())
#Do lemmatization
lemmatizer = WordNetLemmatizer()

In [None]:
nltk.download('wordnet')
# Keep the purely alphabetic words that are not in the names list, lower-cased
# and lemmatized.
# NOTE(review): every kept word is stored as its own entry, so `cleaned` is a
# flat list of single words rather than one cleaned text per post — confirm that
# this is what the vectorizer fitted on `cleaned` is meant to receive.
cleaned.extend(
    lemmatizer.lemmatize(word.lower())
    for post in groups.data
    for word in post.split()
    if letters_only(word) and word not in all_names
)

In [None]:
#all the words in the dataset
len(cleaned)

In [None]:
transformed = cv.fit_transform(cleaned)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement, and .tolist() keeps the printed output a plain list.
print(cv.get_feature_names_out().tolist())

clustering

In [None]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
number_of_clusters = 20
kmmodel = KMeans(n_clusters=number_of_clusters)
kmmodel.fit(transformed)

In [None]:
# For every cluster, the feature indices sorted by descending centroid weight.
order_centroids = kmmodel.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2.
terms = cv.get_feature_names_out()

In [None]:
for i in range(number_of_clusters):
    # The ten terms with the largest centroid weight, printed on the header line.
    # The former trailing comma after print(...) was a Python 2 idiom with no
    # effect in Python 3, and plt.show() had nothing to display.
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster %d:" % i, " ".join(top_terms))

NMF

In [None]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF

In [None]:
d=5  # num topics
clf = NMF(n_components=d, random_state=1).fit(transformed)

In [None]:
N_TOP_WORDS = 10  # number of highest-weighted terms shown per topic
# Looked up once; get_feature_names() (removed in scikit-learn 1.2) used to be
# called again for every single printed term.
feature_names = cv.get_feature_names_out()
for topic_idex, topic in enumerate(clf.components_):
  label = '{}: '.format(topic_idex)
  # argsort is ascending, so reverse it and keep only the strongest terms;
  # the former `[::]` slice listed every feature, weakest first.
  top_indices = topic.argsort()[::-1][:N_TOP_WORDS]
  print(label, " ".join(feature_names[i] for i in top_indices))

In [None]:
!pip install octis

In [None]:
import octis
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [None]:
# Define dataset
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

In [None]:
# Call the method (without parentheses only the bound method object is shown)
# and display just the first few labels instead of the whole list.
dataset.get_labels()[:10]

In [None]:
# Create Model
# Alpha is the parameter in Dirichlet distribution
model = LDA(num_topics=20, alpha=0.1)

In [None]:
# Train the model using default partitioning choice 
output = model.train_model(dataset)

print(*list(output.keys()), sep="\n") # Print the output identifiers

In [None]:
import numpy as np
#20 topics, each with 10 words
np.shape(output['topics'])

In [None]:
np.shape(output['topic-document-matrix'])

In [None]:
for t in output['topics'][:5]:
  print(" ".join(t))

In [None]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

In [None]:
# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(output)
print("Coherence: "+str(npmi_score))