In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/squispeb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/squispeb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [4]:
# Returns clean tokens from a tweet given
def stopwords_stemmer(tweet_text):
    tokens = nltk.word_tokenize(tweet_text)
    stoplist = stopwords.words("spanish")
    stoplist += ['/…','--','RT','`','@','|','¿','?', '¡', '!', '.', ',', ';', '«', '»', ':', '(', ')', '"','#', '$', '^', '&', '*', '%','IndianArmyPeoplesArmy']
    tokens_clean = tokens.copy()

    for token in tokens:
        if token in stoplist:
            tokens_clean.remove(token)
            
    stemmer = SnowballStemmer("spanish")
    for i in range(len(tokens_clean)):
        tokens_clean[i] = stemmer.stem(tokens_clean[i])
        
    return tokens_clean

In [5]:
import os
import json
from collections import Counter

# json_path = './data/json_files/data_elecciones/'
json_path = './data/json_files/ucl/'
json_folder = os.listdir(json_path)

# Dictionaries
inverted_index = {}
terms_dict = {}
files_dict = {}
tweets_dict = {}
terms_df_dict = {}

# Variables
n_tweets = 0
n_terms = 0

# Building inverted index with every JSON file
for json_file in json_folder:
    file_id = len(files_dict)
    files_dict[file_id] = json_file
    with open(json_path + json_file,encoding="utf8") as file:
        json_content = json.load(file)
        for tweet in json_content:
            n_tweets += 1
            clean_tokens = stopwords_stemmer(tweet["text"])
            term_freq = Counter(clean_tokens)
            tweet_id = tweet["id"]
            norm = 0
            tweets_dict[tweet_id] = [file_id,{},norm] #file_id,tf_idf,norm
            for term in term_freq:
                freq = term_freq[term]
                
                if not (term in terms_dict):
                    terms_dict[term] = len(terms_dict)
                
                term_id = terms_dict[term]
                tweets_dict[tweet_id][1][term_id]  = 0.0

                if not (term_id in inverted_index):
                    inverted_index[term_id] = {tweet_id : freq}
                else:
                    inverted_index[term_id][tweet_id] = freq

for term_id in inverted_index:
    tweet_list = inverted_index[term_id]
    terms_df_dict[term_id] = len(tweet_list)

n_terms = len(terms_dict)

In [6]:
import numpy as np
import math

def getTF_IDF(tweet_list, tweet_id):
    tf = tweet_list[tweet_id]
    df = float(len(tweet_list))
    return math.log(1+tf, 10) * math.log(n_tweets / df, 10)

def setTF_IDF():
    for term_id in inverted_index:
        tweet_list = inverted_index[term_id]
        for tweet_id in tweet_list:
            tweets_dict[tweet_id][1][term_id] = getTF_IDF(tweet_list, tweet_id)

def getNorm(tweet_list):
    val = np.array(list(tweet_list.values()))
    return np.linalg.norm(val)

def setNorm():
    for tweet_id in tweets_dict:
        tweet_list = tweets_dict[tweet_id][1]
        tweets_dict[tweet_id][2] = getNorm(tweet_list)

# Setting TF_IDF and Norm for all terms
setTF_IDF()
setNorm()

In [8]:
import pickle
import os
bin_path = './data/bin/'

#Files
data = [inverted_index,terms_dict,files_dict,tweets_dict,terms_df_dict,n_tweets,n_terms]
filenames = ['inverted_index', 'terms_dict', 'files_dict','tweets_dict','terms_df_dict','n_tweets','n_terms']
i = 0

# Data Serialization
for d in data:
    file = open (bin_path + filenames[i] + '.dat','wb+')
    pickle.dump(d,file)
    file.close()
    i += 1

print("Number of tweets: ",n_tweets,n_terms)

Number of tweets:  25217 27090
