In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# Load the tweet table from CSV; later cells read its "text" column.
df = pd.read_csv("data/processed_tweet.csv")

# Use TF-IDF to Find Most Important Words

## Calculate TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Keep only the rows that actually have tweet text.
has_text = df["text"].notna()
df = df.loc[has_text]

# Remember the surviving original row labels before renumbering.
idx = df.index

# Renumber rows 0..n-1 so positional and label indexing agree from here on.
df = df.reset_index(drop=True)

In [5]:
# Corpus: one tweet string per row of the frame.
text = df["text"]

# Vocabulary filters (see the scikit-learn TfidfVectorizer documentation):
#   stop_words="english" - drop common English stop words
#   min_df=10            - drop terms that occur in fewer than 10 tweets
#   max_df=0.9           - drop terms that occur in more than 90% of tweets
vectoriser = TfidfVectorizer(
    stop_words="english",
    min_df=10,
    max_df=0.9,
)

# Learn the vocabulary and build the tweet-by-term TF-IDF matrix in one pass.
matrix = vectoriser.fit_transform(text)

In [6]:
# Size of the TF-IDF matrix: (number of tweets, number of vocabulary terms).
matrix.shape

(6436, 1046)

## Most Important Word

In [7]:
# Vocabulary terms, positioned to match the columns of `matrix`.
feature_names = vectoriser.get_feature_names_out()

# For every tweet, pick the vocabulary term with the highest TF-IDF score.
#
# A tweet whose tokens were all removed by the vectoriser's filters has an
# all-zero row. np.argmax returns 0 for such a row, which would label the tweet
# with the first vocabulary term even though that term never occurs in it.
# Those tweets get an empty string instead.
# NOTE(review): the empty string presumably yields no WordNet synsets in the
# similarity step (i.e. similarity 0) - confirm this is the desired handling.
important_words = []
for sentence_idx in range(matrix.shape[0]):
    # Densify a single row only; the full matrix stays sparse.
    tfidf_scores = matrix[sentence_idx].toarray().ravel()
    max_idx = np.argmax(tfidf_scores)
    if tfidf_scores[max_idx] > 0:
        important_words.append(feature_names[max_idx])
    else:
        important_words.append("")

df["most_important_word"] = important_words

In [8]:
df.head()

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,place_country_code,place_country,place_contained_within,place_attributes,place_bounding_box,source_url,truncated,entities,extended_entities,most_important_word
0,780925634159796224,HillaryClinton,question election put plans action make life b...,False,,2016-09-28T00:22:34,,,,False,...,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Xr...,{'media': [{'display_url': 'pic.twitter.com/Xr...,question
1,780916180899037184,HillaryClinton,last night donald trump said paying taxes smar...,True,timkaine,2016-09-27T23:45:00,,,,False,...,,,,,,http://twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/t0...,{'media': [{'display_url': 'pic.twitter.com/t0...,paying
2,780911564857761793,HillaryClinton,couldnt proud hillaryclinton vision command la...,True,POTUS,2016-09-27T23:26:40,,,,False,...,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [{'id_str': '1536791610', 'n...",,showed
3,780907038650068994,HillaryClinton,stand together theres nothing cant make sure y...,False,,2016-09-27T23:08:41,,,,False,...,,,,,,https://studio.twitter.com,False,{'media': [{'display_url': 'pic.twitter.com/Q3...,{'media': [{'display_url': 'pic.twitter.com/Q3...,ready
4,780897419462602752,HillaryClinton,candidates asked theyd confront racial injusti...,False,,2016-09-27T22:30:27,,,,False,...,,,,,,https://about.twitter.com/products/tweetdeck,False,"{'user_mentions': [], 'symbols': [], 'urls': [...",,answer


# Use Wordnet to get a Similarity Matrix

In [9]:
import nltk
from nltk.corpus import wordnet as wn

In [33]:
def similarity(word_pair):
    """Score one pair of words with WordNet Wu-Palmer similarity.

    Parameters
    ----------
    word_pair : tuple
        ``(word1, word2, i, j)`` - the two words to compare and the matrix
        coordinates the score belongs to. ``i`` and ``j`` are passed through
        untouched so results can be placed after parallel execution.

    Returns
    -------
    tuple
        ``(i, j, score)`` where ``score`` is the largest ``wup_similarity``
        over every synset of ``word1`` paired with every synset of ``word2``,
        or ``0`` when either word has no synsets.
    """
    first_word, second_word, row, col = word_pair
    first_senses = wn.synsets(first_word)
    second_senses = wn.synsets(second_word)

    # Nothing to compare if either word has no synsets.
    if not (first_senses and second_senses):
        return row, col, 0

    # Track the best score over all sense pairs; wup_similarity may return
    # None, which is counted as 0.
    best_score = None
    for sense_a in first_senses:
        for sense_b in second_senses:
            score = sense_a.wup_similarity(sense_b) or 0
            if best_score is None or score > best_score:
                best_score = score

    return row, col, best_score

## Use Parallelisation because of the High Computational Time

In [32]:
import os
from multiprocessing import Pool

In [27]:
def prepare_input(important_words):
    """Build one task per unordered pair of positions in ``important_words``.

    Parameters
    ----------
    important_words : sequence of str
        Words to be compared pairwise.

    Returns
    -------
    list of tuple
        ``(word_at_i, word_at_j, i, j)`` for every ``i < j``, ordered by ``i``
        and then by ``j``. Empty when fewer than two words are given.
    """
    count = len(important_words)
    return [
        (important_words[first], important_words[second], first, second)
        for first in range(count)
        for second in range(first + 1, count)
    ]

In [None]:
n = len(important_words)
start_time = time.perf_counter()

# Many tweets share the same most-important word, so the number of distinct
# words is far smaller than the number of tweets. Score every distinct pair of
# words once, then expand the word-level scores to the tweet-level matrix by
# indexing, instead of recomputing the same WordNet comparison per tweet pair.
unique_words = list(dict.fromkeys(important_words))  # first-occurrence order
word_position = {word: pos for pos, word in enumerate(unique_words)}

# The similarity matrix is treated as symmetric, so only the upper triangle of
# word pairs is computed (upper triangle = lower triangle). Each word is also
# paired with itself: that is the score two different tweets receive when they
# share the same word.
tasks = list(prepare_input(unique_words))
tasks.extend((word, word, pos, pos) for pos, word in enumerate(unique_words))

# NOTE(review): worker processes must be able to import `similarity`. With the
# "spawn" start method (default on Windows and macOS) a function defined in a
# notebook cell may not be importable by the workers - if this cell fails or
# hangs there, move `similarity` into a .py module and import it.
with Pool() as pool:
    results = pool.map(similarity, tasks)

# Word-level scores, mirrored across the diagonal.
word_similarity = np.zeros((len(unique_words), len(unique_words)))
for i, j, sim_score in results:
    word_similarity[i, j] = sim_score
    word_similarity[j, i] = sim_score

# Expand to tweet level: entry (a, b) is the score of tweet a's word against
# tweet b's word.
tweet_to_word = np.fromiter(
    (word_position[word] for word in important_words),
    dtype=np.intp,
    count=n,
)
similarity_matrix = word_similarity[np.ix_(tweet_to_word, tweet_to_word)]

# Every tweet is maximally similar to itself.
np.fill_diagonal(similarity_matrix, 1)

end_time = time.perf_counter()
print(f"Time taken to compare most important words:{end_time-start_time}")

In [None]:
# Persist the similarity matrix to disk in NumPy's .npy format.
np.save("data/wordnet_similarity_matrix.npy", similarity_matrix)