# Information Retrieval and Web Analytics



## PROJECT PART 1: Text Processing

Group Members:

*   Berta Al√≤s (228709)
*   Maria Cerezo (183213)
*   Paula Vil√† (231630)












Load packages

In [1]:
from collections import defaultdict
from array import array
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
import json 
from numpy import linalg as la
import pandas as pd 
import re
from operator import itemgetter
import unicodedata
import time

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data preparation and Text processing
The dataset is stored in the JSON file. It contains 4000 Hurricane Ian tweets. 

In [2]:
#reading json file and transforming it into pandas dataframe
tw_data = pd.read_json('tw_hurricane_data.json',lines=True)

#reading the csv file and transforming it into pandas dataframe
map_data = pd.read_csv('tweet_document_ids_map.csv',
                    sep='::', 
                    encoding='latin-1',
                    engine='python',
                    names=['docs_ids'])

tw_data = pd.DataFrame(tw_data)

In [3]:
#creating a new dataframe with the desired columns extracted from tw_data dataframe 
tw_fields = pd.DataFrame()
tw_fields['hashtags']=None
tw_fields['name']=None
tw_fields['full_text']=None
tw_fields['created_at']=None
tw_fields['favorite_count']=None
tw_fields['retweet_count']=None
tw_fields['url']=None

#creating a column with hashtag information obtained from entities column
tw_fields['hashtags'] = tw_data['entities'].apply(lambda x: x.get('hashtags'))

#creating a column with username information obtained from user column
tw_fields['name'] = tw_data['user'].apply(lambda x: x.get('name'))

#creating a column with url information obtained from entities column
for i in tw_data.index:
  tw_fields['url'][i]=tw_data['entities'][i]['media'][0]['url'] if(tw_data['entities'][i].get('media') is not None) else {}

#creating a column with the full_text information of the tw_data dataframe
tw_fields['full_text']=tw_data['full_text']

#creating a column with the created_at information of the tw_data dataframe
tw_fields['created_at']=tw_data['created_at']

#creating a column with the favorite_count information of the tw_data dataframe
tw_fields['favorite_count']=tw_data['favorite_count']

#creating a column with the retweet_count information of the tw_data dataframe
tw_fields['retweet_count']=tw_data['retweet_count']

In [4]:
#creating a new dataset containing the required columns, with it's specified column names
tw_fields = pd.DataFrame({'Tweet' : tw_fields['full_text'],'Username' : tw_fields['name'],  'Date' : tw_fields['created_at'],'Hashtags' : tw_fields['hashtags'], 'Likes' : tw_fields['favorite_count'], 'Retweets' : tw_fields['retweet_count'], 'Url' : tw_fields['url']})

In [5]:
#Function to preprocess data
def build_terms(line):
    """
    Preprocess the text removing stop words, stemming, transforming in 
    lowercase, removing URLs and emojis, removing everything it is not a digit 
    nor number and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    line=re.sub(r'[\W_]+', ' ', line) #BONUS: Removing anything is not a letter or digit
    line = line.lower() 
    line = line.split()  # Tokenize the text to get a list of terms
    line = [x for x in line if x not in stop_words]  # eliminate the stopwords
    line = [stemmer.stem(word) for word in line] # perform stemming (HINT: use List Comprehension)
    line = [l for l in line if "http" not in l ] #BONUS: Removing the URLs from the line
    line = [emojis_out(l) for l in line ] ##BONUS: removing the emojis from the line
    return line

In [6]:
#Function to get rid of emojis
def emojis_out (s):
    """Preprocess the text removing stop emojis from of the string
    
    Arguments:
    s -- string (word) to be processed
    
    Returns: 
    word-- string equal to "" if it's  an emoji and "s" otherwise. 
    """
    emoji_pattern = re.compile("["  
        u"\U0001F300-\U0001F5FF"  # removing symbols & pictographs  
                           "]+", flags=re.UNICODE)
       
    word = emoji_pattern.sub(r'', s)
    return word

In [7]:
#preprocessing the Tweet and Username information
tw_fields['preprocessed_tweet']=None
for i in tw_fields.index: 
    tw_fields['preprocessed_tweet'][i] = build_terms(tw_fields['Tweet'][i]) 
    tw_fields['Username'][i]=emojis_out(tw_fields['Username'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [8]:
#merging the tw_fields & map_data dataframes into a single dataframe named final_dataset
final_dataset=pd.merge(map_data, tw_fields, left_index=True, right_index=True)  
final_dataset['doc_id']=None
final_dataset['doc_num']=None
s = final_dataset["docs_ids"].str.split(pat = '\t',expand = True)
final_dataset['doc_id']=s[0]
final_dataset['doc_num']=s[1]
final_dataset.head(40) #displaying the first 3 rows

Unnamed: 0,docs_ids,Tweet,Username,Date,Hashtags,Likes,Retweets,Url,preprocessed_tweet,doc_id,doc_num
0,doc_1\t1575918182698979328,So this will keep spinning over us until 7 pm‚Ä¶...,Suz,2022-09-30 18:39:08+00:00,"[{'text': 'HurricaneIan', 'indices': [63, 76]}]",0,0,https://t.co/VROTxNS9rz,"[keep, spin, us, 7, pm, go, away, alreadi, hur...",doc_1,1575918182698979328
1,doc_2\t1575918151862304768,Our hearts go out to all those affected by #Hu...,Lytx,2022-09-30 18:39:01+00:00,"[{'text': 'HurricaneIan', 'indices': [43, 56]}]",0,0,{},"[heart, go, affect, hurricaneian, wish, everyo...",doc_2,1575918151862304768
2,doc_3\t1575918140839673873,Kissimmee neighborhood off of Michigan Ave. \n...,Christopher Heath,2022-09-30 18:38:58+00:00,"[{'text': 'HurricaneIan', 'indices': [45, 58]}]",0,0,https://t.co/jf7zseg0Fe,"[kissimme, neighborhood, michigan, ave, hurric...",doc_3,1575918140839673873
3,doc_4\t1575918135009738752,I have this one tree in my backyard that scare...,alex ‚ú®,2022-09-30 18:38:57+00:00,"[{'text': 'scwx', 'indices': [122, 127]}, {'te...",0,0,{},"[one, tree, backyard, scare, poltergeist, tree...",doc_4,1575918135009738752
4,doc_5\t1575918119251419136,@AshleyRuizWx @Stephan89441722 @lilmizzheidi @...,Tess,2022-09-30 18:38:53+00:00,"[{'text': 'HurricaneIan', 'indices': [159, 172]}]",0,0,{},"[ashleyruizwx, stephan89441722, lilmizzheidi, ...",doc_5,1575918119251419136
5,doc_6\t1575918105854984192,Ace Handyman Services hopes everyone was safe ...,Ace Handyman Services Brandon,2022-09-30 18:38:50+00:00,"[{'text': 'HurricaneIan', 'indices': [221, 234...",0,0,https://t.co/BfpOq7tJE0,"[ace, handyman, servic, hope, everyon, safe, h...",doc_6,1575918105854984192
6,doc_7\t1575918095008681986,"Storm surge issues in Georgetown, SC #Hurrican...",Erik Fox WX,2022-09-30 18:38:47+00:00,"[{'text': 'HurricaneIan', 'indices': [37, 50]}]",0,0,https://t.co/qWs0XJzGMx,"[storm, surg, issu, georgetown, sc, hurricanei...",doc_7,1575918095008681986
7,doc_8\t1575918088473788429,"Our thoughts are with the students, teachers, ...",Close Up Washington,2022-09-30 18:38:46+00:00,"[{'text': 'CloseUpDC', 'indices': [114, 124]},...",0,0,https://t.co/eHZ9NKhCgA,"[thought, student, teacher, parent, commun, su...",doc_8,1575918088473788429
8,doc_9\t1575918083075555329,#SouthCarolina braces for #HurricaneIan to mak...,Mamazita,2022-09-30 18:38:44+00:00,"[{'text': 'SouthCarolina', 'indices': [0, 14]}...",0,0,{},"[southcarolina, brace, hurricaneian, make, lan...",doc_9,1575918083075555329
9,doc_10\t1575918057037303808,How pissed is GOD to send #HurricaneIan to Flo...,Qanon is a Death Cult,2022-09-30 18:38:38+00:00,"[{'text': 'HurricaneIan', 'indices': [26, 39]}...",0,0,{},"[piss, god, send, hurricaneian, florida, south...",doc_10,1575918057037303808


##PROJECT PART 2: Indexing and Evaluation

###Indexing

**Inverted Index tfidf function**

In [9]:
def create_index_tfidf(dataset, num_documents):
    """
    Implement the inverted index and compute tf, df and idf
    
    Argument:
    lines -- collection of Wikipedia articles
    num_documents -- total number of documents
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """
    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    idf = defaultdict(float)
    title_index = defaultdict(str)
    likes = defaultdict(str)
    retweets = defaultdict(str)

    for pos, line in enumerate(dataset['preprocessed_tweet']):  # Remember, lines contain all documents from file
 
        page_id = dataset['doc_id'][pos]
        title_index[page_id] = dataset['Tweet'][pos] #each of the tweets (full text before being preprocessed)
        likes[page_id] = dataset['Likes'][pos]
        retweets[page_id] = dataset['Retweets'][pos] 
        current_page_index = {}

        for position, term in enumerate(line):  ## terms contains page_title + page_text
            try:
                # if the term is already in the dict append the position to the corresponding list
                current_page_index[term][1].append(position) 
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term] = [page_id, array('I', [position])]  #'I' indicates unsigned int (int in Python)

        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_page_index.items():
            # posting will contain the list of positions for current term in current document. 
            # posting ==> [current_doc, [list of positions]] 
            # you can use it to infer the frequency of current term.
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_page_index.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm,4)) ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] = df[term]+1 # increment DF for current term

        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        # Compute IDF following the formula (3) above. HINT: use np.log
        for term in df:
            idf[term] = np.round(np.log(float(num_documents/df[term])), 4)

    return index, tf, df, idf,title_index, likes, retweets


index, tf, df, idf, title_index, likes, retweets = create_index_tfidf(final_dataset, len(final_dataset))

In [10]:
def rank_documents(terms, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    docs -- list of documents/tweets, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title
    
    Returns:
    Print the list of ranked documents and documents scores
    """

    # We are interested only on the element of the docVector corresponding to the query terms 
    # The remaining elements would became 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # We call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # computing the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):
            # Example of [doc_index, (doc, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in doc 26 in positions 1,4, .....
            # term is in doc 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the doc 26            
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]  # TODO: check if multiply for idf

    # Calculating the score of each doc 
    # Computing the cosine similarity between queyVector and each docVector:

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores = [x[0] for x in doc_scores]
    if len(doc_scores) == 0:
        print("No results found, try again")
        #query1=input()
        #docs = search_tf_idf(query,index)
    return result_docs, result_scores

In [11]:
def search_tf_idf(query, index):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs = [posting[0] for posting in index[term]]
            
            # docs = docs Union term_docs
            docs |= set(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs,ranked_scores = rank_documents(query, docs, index, idf, tf)
    return ranked_docs, ranked_scores

##PROJECT PART 3: Ranking

###Ranking score

####Part1

#####TF-IDF + cosine similarity:


######QUERIES TF-IDF + COSINE SIMILARITY

In [12]:
#QUERY 1
print("hurricane ian:\n")
q1 = input()
ranked_docs1, ranked_scores1 = search_tf_idf(q1, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs1)))
i=0
for d_id in ranked_docs1[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, ranked_scores1[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1   

hurricane ian:

hurricane ian

Top 20 results out of 1095 for the searched query:

page_id= doc_634 - doc_score: 1.9399946808643298 - original_tweet: Hurricane IAN #Ian #HurricaneIan #HurricanIan #Huracan #HuracanIan #Hurricane https://t.co/HbllO4Q3vB - likes: 1 - retweets: 0

page_id= doc_495 - doc_score: 1.9303480851923807 - original_tweet: Hurricane Ian be like 

‚ÄúYou all haven‚Äôt had any hurricanes around here lately so I‚Äôm just gonna take my time on the East Coast.‚Äù

#HurricaneIan #Ian #Hurricane - likes: 0 - retweets: 0

page_id= doc_1025 - doc_score: 1.8953512111269468 - original_tweet: @colbertlateshow @DirectRelief @ConvoyofHope @WCKitchen @GlobalGiving @StephenAtHome #hope7cc How to help people with the aftermath of Hurricane Ian. #ian #HurricaneIan #Hurricane_Ian #hurricane #charity #PeopleHelpingPeople https://t.co/AKcI4PlhEY - likes: 0 - retweets: 0

page_id= doc_2140 - doc_score: 1.814775192576548 - original_tweet: #hurricaneian
The perpetual hurricane Ian - likes:

In [13]:
#QUERY 2
print("south carolina:\n")
q2 = input()
ranked_docs2, ranked_scores2 = search_tf_idf(q2, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs2)))
i=0
for d_id in ranked_docs2[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, ranked_scores2[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 

south carolina:

south carolina

Top 20 results out of 358 for the searched query:

page_id= doc_174 - doc_score: 4.945604303400691 - original_tweet: South Carolina #HurricaneIan here we go - likes: 1 - retweets: 0

page_id= doc_254 - doc_score: 4.423348488961577 - original_tweet: South Carolina #HurricaneIan https://t.co/yTA4dFUC2V - likes: 0 - retweets: 0

page_id= doc_493 - doc_score: 4.355493129784444 - original_tweet: Just south of Myrtle Beach in South Carolina. #HurricaneIan #Ian #ScWx https://t.co/ErHr5X5cQQ - likes: 1 - retweets: 0

page_id= doc_480 - doc_score: 4.037591353296323 - original_tweet: South Carolina friends, you OK so far?? #HurricaneIan - likes: 0 - retweets: 0

page_id= doc_48 - doc_score: 4.037591353296323 - original_tweet: Sending love and prayers to South Carolina #HurricaneIan üôè ‚ù§Ô∏è - likes: 0 - retweets: 0

page_id= doc_1401 - doc_score: 4.037591353296323 - original_tweet: #hurricaneian won't go away prayers for South Carolina - likes: 1 - retweets: 0

In [14]:
#QUERY 3
print("help people:\n")
q3 = input()
ranked_docs3, ranked_scores3 = search_tf_idf(q3, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs3)))
i=0
for d_id in ranked_docs3[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, ranked_scores3[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 

help people:

help people

Top 20 results out of 556 for the searched query:

page_id= doc_3682 - doc_score: 4.729731010467683 - original_tweet: Good morning lovelies. Yesterday my employer sent an email about donations to help Florida #HurricaneIan victims. But I wanna hear from YOU, how can people help? The theme parks will be ok, I‚Äôm concerned about the PEOPLE! What can people do to help? https://t.co/ERiJukIPVN - likes: 3 - retweets: 0

page_id= doc_3454 - doc_score: 4.038435214957499 - original_tweet: Never thought I'd see the day where people say a state should receive no aid funding because of a letter next to a governor's name. Disgusting that people would rather make this tragedy political than see people get help. Some people have truly lost it.
#HurricaneIan #Florida - likes: 0 - retweets: 0

page_id= doc_812 - doc_score: 3.9145908322646146 - original_tweet: Looking for organizations that are providing help to #HurricaneIan victims in Central Florida. We're trying to conne

In [15]:
#QUERY 4
print("flood damage:\n")
q4 = input()
ranked_docs4, ranked_scores4 = search_tf_idf(q4, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs4)))
i=0
for d_id in ranked_docs4[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, ranked_scores4[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 

flood damage:

flood damage

Top 20 results out of 495 for the searched query:

page_id= doc_3288 - doc_score: 2.841599791861431 - original_tweet: We're still without power &amp; no estimate for restoration so we went to a hotel. There is minor damage to our roof &amp; some flood damage but nothing major. Everything in our fridge &amp; freezer is gone. But it could've been much worse!! #HurricaneIan - likes: 17 - retweets: 0

page_id= doc_3909 - doc_score: 2.758932621188974 - original_tweet: #Map: Where #Hurricane #Ian Hit #Florida Hardest
Officials are still working to assess extent of #damage caused by #HurricaneIan &amp; its subsequent #floods, as reports emerge of destroyed #homes, damaged #power lines &amp; disrupted #water supplies. #stormsurge 
https://t.co/UFlmjVYDLk - likes: 0 - retweets: 0

page_id= doc_2687 - doc_score: 2.702346514124129 - original_tweet: Just over Blind Pass Bridge on #Sanibel #Captiva Rd. Significant damage here. Structures missing and damages. #Ian #Hurri

In [16]:
#QUERY 5
print("storm in florida:\n")
q5 = input()
ranked_docs5, ranked_scores5 = search_tf_idf(q5, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs5)))
i=0
for d_id in ranked_docs5[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, ranked_scores5[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 

storm in florida:

storm in florida

Top 20 results out of 1163 for the searched query:

page_id= doc_3189 - doc_score: 2.29995780442169 - original_tweet: Storm Surge as #Ian moves towards South Carolina

#SouthCarolina #HurricaneIan #Hurricane #Storm #Climate #Viral #Rain #Tornado #Tropicswx #SCwx 

VC: Palmetto Storm Chasers https://t.co/Vh1YQuwDET - likes: 1 - retweets: 1

page_id= doc_2226 - doc_score: 2.1240859636958525 - original_tweet: #HurricaneIan is no anomaly. The #ClimateCrisis is making #storms more powerful. These kind of storms will be the #NewNormal

Hurricane Ian is the fifth most powerful storm to ever hit USA. Only four other hurricanes have made landfall with sustained winds of more than 155mph - likes: 5 - retweets: 2

page_id= doc_1714 - doc_score: 2.002709622913232 - original_tweet: VDOT completing storm preparations in advance of Tropical Storm Ian's arrival https://t.co/7WU3pFba3s #ian #hurricaneian - likes: 0 - retweets: 0

page_id= doc_1822 - doc_score: 1.946

#####Your-score + cosine similarity:

In [17]:
ourN = len(final_dataset['doc_id'])
likest = dict(zip(final_dataset['doc_id'],final_dataset['Likes']))
retweetst = dict(zip(final_dataset['doc_id'],final_dataset['Retweets']))

In [18]:
def our_rank_documents(terms, docs, index, idf, tf, mg, rt):
    """
    Perform the ranking of the results of a search based on the tf-idf weights and a popularity score
    
    Argument:
    terms -- list of query terms
    docs -- list of documents/tweets, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    mg -- total number of likes the tweet has
    rt -- total number of retweets the tweet has
    

    
    Returns:
    Print the list of ranked documents and documents scores
    """

    # We are interested only on the element of the docVector corresponding to the query terms 
    # The remaining elements would became 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # We call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # computing the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):
            # Example of [doc_index, (doc, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in doc 26 in positions 1,4, .....
            # term is in doc 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the doc 26            
            if doc in docs:
                likes = mg[doc] 
                retweets = rt[doc]
                popularity_score = 0.5*math.log2(1+likes)+ 1*math.log2(1+retweets)
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term] + 0.5*(popularity_score)

    # Calculating the score of each doc 
    # Computing the cosine similarity between queyVector and each docVector:

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores = [x[0] for x in doc_scores]
    if len(doc_scores) == 0:
        print("No results found, try again")
        #query1=input()
        #docs = search_tf_idf(query,index)
    return result_docs, result_scores

In [19]:
def our_search(query, index, mg, rt):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs = [posting[0] for posting in index[term]]
            
            # docs = docs Union term_docs
            docs |= set(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs,ranked_scores = our_rank_documents(query, docs, index, idf, tf, mg, rt)
    return ranked_docs, ranked_scores

######QUERIES OUR SCORE + COSINE SIMILARITY

In [20]:
 #QUERY 1:
print("hurricane ian:\n")
our_query = input()
our_ranking_docs1,our_ranking_scores1 = our_search(our_query, index, likest, retweetst)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(our_ranking_docs1)))
i=0
for d_id in our_ranking_docs1[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, our_ranking_scores1[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

hurricane ian:

hurricane ian

Top 20 results out of 1095 for the searched query:

page_id= doc_3636 - doc_score: 11.17114744109913 - original_tweet: Hurricane Ian heads towards South Carolina after striking Florida.

The death toll in Florida has risen to 21. We are praying from Israel for all those affected. Our hearts are with you. #HurricaneIan https://t.co/ArU7CFOZwh - likes: 151 - retweets: 46

page_id= doc_1950 - doc_score: 10.349467236370877 - original_tweet: Wow! You can start to see #flooding from #HurricaneIan on one of the Charleston, #SouthCarolina livecams right now. Lets hope it doesn't get much worse. #Hurricane #Ian https://t.co/2pXRcxaBjg - likes: 67 - retweets: 40

page_id= doc_1613 - doc_score: 9.725796725837302 - original_tweet: Flooding in downtown Charleston South Carolina happening right now due to Hurricane Ian. Many of the roads around the city are impassible, and more rain expected. I will continue covering the flood levels and showing the conditions. #hurric

In [21]:
#QUERY 2:
print("south carolina:\n")
our_query2 = input()
our_ranking_docs2,our_ranking_scores2 = our_search(our_query2, index, likes, retweets)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(our_ranking_docs2)))
i=0
for d_id in our_ranking_docs2[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, our_ranking_scores2[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

south carolina:

south carolina

Top 20 results out of 358 for the searched query:

page_id= doc_3636 - doc_score: 19.2676408266899 - original_tweet: Hurricane Ian heads towards South Carolina after striking Florida.

The death toll in Florida has risen to 21. We are praying from Israel for all those affected. Our hearts are with you. #HurricaneIan https://t.co/ArU7CFOZwh - likes: 151 - retweets: 46

page_id= doc_2684 - doc_score: 18.5337572514716 - original_tweet: 1140a: Eye of #HurricaneIan just 40-50 miles offshore South Carolina. Landfall between Charleston and Myrtle Beach next few hours. https://t.co/XyOP10MTHE - likes: 84 - retweets: 42

page_id= doc_1613 - doc_score: 16.729819518069114 - original_tweet: Flooding in downtown Charleston South Carolina happening right now due to Hurricane Ian. Many of the roads around the city are impassible, and more rain expected. I will continue covering the flood levels and showing the conditions. #hurricaneian #charleston #southcarolina #scwx

In [22]:
#QUERY 3:
print("help people:\n")
our_query3 = input()
our_ranking_docs3,our_ranking_scores3 = our_search(our_query3, index, likes, retweets)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(our_ranking_docs3)))
i=0
for d_id in our_ranking_docs3[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, our_ranking_scores3[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

help people:

help people

Top 20 results out of 556 for the searched query:

page_id= doc_2534 - doc_score: 16.961465696074306 - original_tweet: Saw Trump saying, ‚ÄúI hope you can help and help out" and actually thought he was asking for donations for the people of Florida--where he lives--who have been devastated by #HurricaneIan. 

Nope--he's asking for supporters to pay his many legal bills.   
https://t.co/2iu0GQVB3W - likes: 74 - retweets: 26

page_id= doc_1202 - doc_score: 16.107045927844027 - original_tweet: Thank you to everyone across the nation and state who want to help the people devastated by #HurricaneIan, just make sure you aren‚Äôt being scammed. #FloridaStrong https://t.co/bQ2jJUzPrI - likes: 65 - retweets: 21

page_id= doc_3938 - doc_score: 15.827667031015098 - original_tweet: I can go home, but for so many people, this is home. If you want to help, please donate money to your assistance charity of choice. It‚Äôs really the best way to give a hand. @weathernetwork @

In [23]:
#QUERY 4:
print("flood damage:\n")
our_query4 = input()
our_ranking_docs4,our_ranking_scores4 = our_search(our_query4, index, likes, retweets)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(our_ranking_docs4)))
i=0
for d_id in our_ranking_docs4[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, our_ranking_scores4[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

flood damage:

flood damage

Top 20 results out of 495 for the searched query:

page_id= doc_3743 - doc_score: 23.060783581984037 - original_tweet: . @GovRonDeSantis is requesting quick insurance claim settlements for Floridians. 

We don‚Äôt need the National Flood Insurance Program (NFIP) fighting w/ private insurance carriers over wind vs. flood damage. I will be watching this closely. 

Pray for our state.  #HurricaneIan - likes: 315 - retweets: 85

page_id= doc_1561 - doc_score: 13.321172102797032 - original_tweet: In January of this year, an EF2 tornado caused major damage in Iona (southern Fort Myers). Flooding likely just totaled everything left. Below is a before, after tornado, and after Ian view of the Tropicana 55+ Mobile Home Community. #hurricaneian @SRHelicity @spann https://t.co/Xz0IKOzjmz - likes: 24 - retweets: 11

page_id= doc_1562 - doc_score: 12.535099903039299 - original_tweet: #HurricaneIan, with intense storm surges and rapid flooding, has devasted communities a

In [24]:
#QUERY 5:
print("storm in florida:\n")
our_query5 = input()
our_ranking_docs5,our_ranking_scores5 = our_search(our_query5, index, likes, retweets)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(our_ranking_docs5)))
i=0
for d_id in our_ranking_docs5[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, our_ranking_scores5[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

storm in florida:

storm in florida

Top 20 results out of 1163 for the searched query:

page_id= doc_2794 - doc_score: 11.83210091605128 - original_tweet: Storm chaser Reed Timmer (@ReedTimmerAccu) recorded a chaotic scene with flooding storm surge and howling winds in Pine Island, Florida, as he entered the eye of #HurricaneIan on Sept. 28. https://t.co/GGFQFnP47q https://t.co/bY8LFDl5Py - likes: 75 - retweets: 16

page_id= doc_2041 - doc_score: 10.300183160198436 - original_tweet: The damage to southwest and central #Florida is overwhelming after #HurricaneIan made landfall as a Category 4 storm earlier this week. In the wake of the devastation, our U.S. Disaster Relief team will be caring for hurting homeowners in Jesus' Name. https://t.co/G2MBTStteR https://t.co/d9cEOZMc7f - likes: 35 - retweets: 16

page_id= doc_2304 - doc_score: 10.169224507226573 - original_tweet: #HurricaneIan has devastated Florida and our thoughts continue to be with all those affected. 

We transported at-r

#####BM25

In [25]:
#Counting the words in each row of preprocessed_tweets column
final_dataset['tw_length']=None
for i in final_dataset.index: 
    final_dataset['tw_length'][i] = len(final_dataset['preprocessed_tweet'][i])

#Computing the Average Length
Total_length = final_dataset['tw_length'].sum() 
N = len(final_dataset['doc_id'])
Lave = round(Total_length/ N)
ld = dict(zip(final_dataset['doc_id'],final_dataset['tw_length']))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [26]:
def BM25_rank_documents(terms, docs, index, idf, tf, N, k1, b, Ld, Lave):
    """
    Perform the ranking of the results of a search 
    
    Argument:
    terms -- list of query terms
    docs -- list of documents/tweets, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    N -- Total number of documents
    k1 -- tunning parameter controlling the document term frequency scaling
    b -- tunning parameter controlling the scaling by document term frequency scaling 
    Ld -- lenght of document d
    Lave -- average document lenght in the whole collection

    
    Returns:
    Print the list of ranked documents and documents scores
    """

    # We are interested only on the element of the docVector corresponding to the query terms 
    # The remaining elements would became 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # We call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    #query_vector = [0] * len(terms)

    # computing the norm for the query tf
    #query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 

    #query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        #query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]
 
        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):      
            if doc in docs:
                df = len(index[term])
                ld = Ld[doc] 
                idf = np.log(N/df)
                nominator = (k1+1)*tf[term][doc_index]
                denominator = k1*((1-b)+(ld/Lave))+tf[term][doc_index]
                doc_vectors[doc][termIndex] = idf * ((nominator)/(denominator))

    # Calculating the score of each doc 
    # Computing the cosine similarity between queyVector and each docVector:

    doc_scores = [[np.sum(curDocVec), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores = [x[0] for x in doc_scores]
    if len(doc_scores) == 0:
        print("No results found, try again")
        #query1=input()
        #docs = search_tf_idf(query,index)
    return result_docs, result_scores

In [27]:
def BM25_search(query, index, N, k1, b, Ld, Lave ):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs = [posting[0] for posting in index[term]]
            
            # docs = docs Union term_docs
            docs |= set(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs,ranked_scores = BM25_rank_documents(query, docs, index, idf, tf, N, k1, b, Ld, Lave)
    return ranked_docs, ranked_scores

######QUERIES BM25

In [28]:
#QUERY 1:
print("hurricane ian:\n")
BM25_query1 = input()
BM25_ranking_docs,BM25_ranking_scores = BM25_search(BM25_query1, index, N, 1.5, 1, ld ,Lave)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(BM25_ranking_docs)))
i=0
for d_id in BM25_ranking_docs[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, BM25_ranking_scores[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 


hurricane ian:

hurricane ian

Top 20 results out of 1095 for the searched query:

page_id= doc_2140 - doc_score: 4.806010837334956 - original_tweet: #hurricaneian
The perpetual hurricane Ian - likes: 0 - retweets: 0

page_id= doc_1217 - doc_score: 4.806010837334956 - original_tweet: Hurricane Ian on tourüò≠
#HurricaneIan - likes: 0 - retweets: 0

page_id= doc_640 - doc_score: 4.146565917688284 - original_tweet: Hurricane Ian before and after #HurricaneIan https://t.co/XZstkI2pN2 - likes: 2 - retweets: 0

page_id= doc_2109 - doc_score: 3.600186493063438 - original_tweet: Praying for everyone impacted in hurricane Ian.#HurricaneIan - likes: 3 - retweets: 1

page_id= doc_634 - doc_score: 3.1300265536195115 - original_tweet: Hurricane IAN #Ian #HurricaneIan #HurricanIan #Huracan #HuracanIan #Hurricane https://t.co/HbllO4Q3vB - likes: 1 - retweets: 0

page_id= doc_2744 - doc_score: 2.776080488894473 - original_tweet: Hope everyone affected by Hurricane Ian is staying safeüíñ #HurricaneIa

In [29]:
#QUERY 2:
print("south carolina:\n")
BM25_query2 = input()
BM25_ranking_docs2,BM25_ranking_scores2 = BM25_search(BM25_query2, index, N, 1.5, 1, ld ,Lave)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(BM25_ranking_docs2)))
i=0
for d_id in BM25_ranking_docs2[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, BM25_ranking_scores2[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 


south carolina:

south carolina

Top 20 results out of 358 for the searched query:

page_id= doc_174 - doc_score: 7.931362905024203 - original_tweet: South Carolina #HurricaneIan here we go - likes: 1 - retweets: 0

page_id= doc_254 - doc_score: 6.843080512283574 - original_tweet: South Carolina #HurricaneIan https://t.co/yTA4dFUC2V - likes: 0 - retweets: 0

page_id= doc_480 - doc_score: 5.941390181734711 - original_tweet: South Carolina friends, you OK so far?? #HurricaneIan - likes: 0 - retweets: 0

page_id= doc_48 - doc_score: 5.941390181734711 - original_tweet: Sending love and prayers to South Carolina #HurricaneIan üôè ‚ù§Ô∏è - likes: 0 - retweets: 0

page_id= doc_1401 - doc_score: 5.941390181734711 - original_tweet: #hurricaneian won't go away prayers for South Carolina - likes: 1 - retweets: 0

page_id= doc_2874 - doc_score: 5.197737826108093 - original_tweet: From .@ABCNews4 in Charleston, South Carolina - #HurricaneIan https://t.co/TLCRoQ3VvD - likes: 0 - retweets: 0

page_i

In [30]:
#QUERY 3:
print("help people:\n")
BM25_query3 = input()
BM25_ranking_docs3,BM25_ranking_scores3 = BM25_search(BM25_query3, index, N, 1.5, 1, ld ,Lave)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(BM25_ranking_docs3)))
i=0
for d_id in BM25_ranking_docs3[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, BM25_ranking_scores3[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 

help people:

help people

Top 20 results out of 556 for the searched query:

page_id= doc_191 - doc_score: 3.9865987582590328 - original_tweet: @SenGaryPeters @SenStabenow
Just wondering why you are quiet about the helping the people in Florida and SC after the #HurricaneIan - likes: 0 - retweets: 0

page_id= doc_3428 - doc_score: 3.79650493860049 - original_tweet: Why people hurt people‚Äôs feelings? #PonniyinSelvan #HurricaneIan #CovidIsNotOver - likes: 0 - retweets: 0

page_id= doc_2688 - doc_score: 3.6849380106240917 - original_tweet: Thinking about the people who have lost anything to #HurricaneIan - likes: 0 - retweets: 0

page_id= doc_898 - doc_score: 3.5641522955915836 - original_tweet: @DonaldJTrumpJr Don't even TRY to open a charity to "help the people of Florida recover from #HurricaneIan..." üòÇüòÇüòÇ - likes: 1 - retweets: 0

page_id= doc_2027 - doc_score: 3.5352673151211267 - original_tweet: @KEYCNewsNow Thanks for sharing how the Red Cross is helping those affected b

In [31]:
#QUERY 4:
print("flood damage:\n")
BM25_query4 = input()
BM25_ranking_docs4,BM25_ranking_scores4 = BM25_search(BM25_query4, index, N, 1.5, 1, ld ,Lave)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(BM25_ranking_docs4)))
i=0
for d_id in BM25_ranking_docs4[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, BM25_ranking_scores4[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 

flood damage:

flood damage

Top 20 results out of 495 for the searched query:

page_id= doc_857 - doc_score: 2.9447822765642053 - original_tweet: Flooding outside Iona. #HurricaneIan https://t.co/ZIMzpJE2ge - likes: 0 - retweets: 0

page_id= doc_3488 - doc_score: 2.9447822765642053 - original_tweet: Power is staring to flicker on and off and the pool is about to flood over #HurricaneIan - likes: 0 - retweets: 0

page_id= doc_1672 - doc_score: 2.9447822765642053 - original_tweet: Edgewater Florida Flooding. #HurricaneIan https://t.co/eK3lFQ0yLq - likes: 4 - retweets: 0

page_id= doc_1389 - doc_score: 2.9447822765642053 - original_tweet: Intense flooding on so many streets in my town. #HurricaneIan - likes: 0 - retweets: 0

page_id= doc_3735 - doc_score: 2.798935603680742 - original_tweet: Satellite imagery of the damage https://t.co/NggqAbpfnT

#hurricaneian #sanibel - likes: 1 - retweets: 0

page_id= doc_3658 - doc_score: 2.798935603680742 - original_tweet: We can‚Äôt get into plantat

In [32]:
#QUERY 5:
print("storm in florida:\n")
BM25_query5 = input()
BM25_ranking_docs5,BM25_ranking_scores5 = BM25_search(BM25_query5, index, N, 1.5, 1, ld ,Lave)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(BM25_ranking_docs5)))
i=0
for d_id in BM25_ranking_docs5[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} - likes: {} - retweets: {}\n".format(d_id, BM25_ranking_scores5[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1 

storm in florida:

storm in florida

Top 20 results out of 1163 for the searched query:

page_id= doc_3475 - doc_score: 3.127147970679615 - original_tweet: Yesterday after the storm #HurricaneIan https://t.co/igI6Z0mvGp - likes: 0 - retweets: 0

page_id= doc_3311 - doc_score: 3.127147970679615 - original_tweet: Doing good here but the storm is picking up for sure. #HurricaneIan - likes: 3 - retweets: 0

page_id= doc_132 - doc_score: 2.7150939136952186 - original_tweet: #HurricaneIan is here, a beautiful calm before the storm https://t.co/axB9kjuLax - likes: 1 - retweets: 0

page_id= doc_1172 - doc_score: 2.7150939136952186 - original_tweet: Storm cleanup! #jchsramfam #hurricaneian https://t.co/X9tpgc1kKR - likes: 1 - retweets: 0

page_id= doc_470 - doc_score: 2.2593069866886726 - original_tweet: Thank God. üò¢üôèüèæ #HurricaneIan 
#Florida - likes: 1 - retweets: 0

page_id= doc_1676 - doc_score: 2.1968477838177103 - original_tweet: Florida‚Äôs ‚Äòdeadliest storm‚Äô challenges some #

####Part 2

##### Word2_vec + cosine similarity

In [34]:
from gensim.models import Word2Vec
tweets = []
for i, tweet in final_dataset.iterrows():
    tweets.append((tweet['preprocessed_tweet']))
model = Word2Vec(tweets, min_count = 1, vector_size = 100, window = 5) #tota la coleccio de paraules de tots els tweets


tweet2vec = {}
for i, tweet in final_dataset.iterrows():
    doc_vec = [model.wv[term] for term in (tweet['preprocessed_tweet'])]
    tweet2vec[tweet['doc_id']] = np.mean(doc_vec, axis=0) #per cada tweet fem una llista amb tots les vectors, fem la mitja de totes les paraules que te aquell tweett

In [35]:
def word2vec_rank(query, docs, tweet2vec, model):
    """
    Rank documents based on the cosine similarity with the query.
    
    Argument:
    query -- list of terms in the query
    docs -- documents having the query terms
    tweet2vec -- dictionary having each document tweet2vec representation
    model -- word2vec model
    
    Returns:
    list of ranked documents and scores
    """

    #Computing the query_vector
    query_vector = np.array([model.wv[term] for term in query if term in model.wv.key_to_index])
    #Computing the query_vector average
    query_vector = np.mean(query_vector, axis=0)

    # Calculating the score of each doc 
    # Computing the cosine similarity between queyVector and each docVector:
    doc_scores=[[np.dot(doc_vec, query_vector), doc] for doc, doc_vec in tweet2vec.items() if doc in docs]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores = [x[0] for x in doc_scores]
    
    if len(doc_scores) == 0:
        print("No results found, try again")
        #query1=input()
        #docs = search_tf_idf(query,index)
    return result_docs, result_scores

In [36]:
def word2vec_search(query, tweet2vec, model, index):

    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    
    query = build_terms(query)
    docs = set()

    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs=[posting[0] for posting in index[term]]
            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            # Term is not in index
            pass
    docs = list(docs)
    ranked_docs,ranked_scores = word2vec_rank(query, docs, tweet2vec, model)
    return ranked_docs, ranked_scores
    

######QUERIES WORD2_VEC + COSINE SIMILARITY

In [37]:
#QUERY 1:
print("hurricane ian:\n")
word2vec_query1 = input()
word2vec_ranking_docs,word2vec_ranking_scores = word2vec_search(word2vec_query1,tweet2vec, model, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(word2vec_ranking_docs)))
i=0
for d_id in word2vec_ranking_docs[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} -  likes: {} - retweets: {}\n".format(d_id, word2vec_ranking_scores[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

hurricane ian:

hurricane ian

Top 20 results out of 1095 for the searched query:

page_id= doc_640 - doc_score: 47.054290771484375 - original_tweet: Hurricane Ian before and after #HurricaneIan https://t.co/XZstkI2pN2 -  likes: 2 - retweets: 0

page_id= doc_1163 - doc_score: 43.35344314575195 - original_tweet: Ian is making landfall again. #HurricaneIan https://t.co/NCxy8gqWLD -  likes: 2 - retweets: 1

page_id= doc_2877 - doc_score: 42.77740478515625 - original_tweet: Hurricane Ian From Cuba through Florida to South Carolina.
#HurricaneIan https://t.co/7TypUjrtlW -  likes: 0 - retweets: 0

page_id= doc_902 - doc_score: 41.802738189697266 - original_tweet: #SouthCarolina right now be like...

#HurricaneIan
#Ian https://t.co/VLxhRLiWRe -  likes: 1 - retweets: 0

page_id= doc_2109 - doc_score: 41.258575439453125 - original_tweet: Praying for everyone impacted in hurricane Ian.#HurricaneIan -  likes: 3 - retweets: 1

page_id= doc_2744 - doc_score: 41.23740005493164 - original_tweet: Hope

In [39]:
#QUERY 2:
print("south carolina:\n")
word2vec_query2 = input()
word2vec_ranking_docs2,word2vec_ranking_scores2 = word2vec_search(word2vec_query2,tweet2vec, model, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(word2vec_ranking_docs2)))
i=0
for d_id in word2vec_ranking_docs2[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} -  likes: {} - retweets: {}\n".format(d_id, word2vec_ranking_scores2[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

south carolina:

south carolina

Top 20 results out of 358 for the searched query:

page_id= doc_174 - doc_score: 35.081485748291016 - original_tweet: South Carolina #HurricaneIan here we go -  likes: 1 - retweets: 0

page_id= doc_254 - doc_score: 33.366905212402344 - original_tweet: South Carolina #HurricaneIan https://t.co/yTA4dFUC2V -  likes: 0 - retweets: 0

page_id= doc_2877 - doc_score: 33.206207275390625 - original_tweet: Hurricane Ian From Cuba through Florida to South Carolina.
#HurricaneIan https://t.co/7TypUjrtlW -  likes: 0 - retweets: 0

page_id= doc_2505 - doc_score: 31.92981719970703 - original_tweet: ‚ö†Ô∏è #HurricaneIan about to make landfall on South Carolina! https://t.co/X61NAwpsx6 -  likes: 2 - retweets: 1

page_id= doc_249 - doc_score: 31.907285690307617 - original_tweet: #HurricaneIan makes landfall in South Carolina... https://t.co/vKik1OyOgQ -  likes: 1 - retweets: 0

page_id= doc_2545 - doc_score: 31.055496215820312 - original_tweet: Businesses that are open i

In [40]:
#QUERY 3:
print("help people:\n")
word2vec_query3 = input()
word2vec_ranking_docs3,word2vec_ranking_scores3 = word2vec_search(word2vec_query3,tweet2vec, model, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(word2vec_ranking_docs3)))
i=0
for d_id in word2vec_ranking_docs3[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} -  likes: {} - retweets: {}\n".format(d_id, word2vec_ranking_scores3[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

help people:

help people

Top 20 results out of 556 for the searched query:

page_id= doc_477 - doc_score: 39.82735824584961 - original_tweet: Help is on the way #HurricaneIan https://t.co/MnGoOB2Gzg -  likes: 0 - retweets: 0

page_id= doc_1458 - doc_score: 37.33622360229492 - original_tweet: Want to help those affected by #HurricaneIan? Here's how to help!
https://t.co/vmA6kBgGER -  likes: 0 - retweets: 0

page_id= doc_2183 - doc_score: 36.44008255004883 - original_tweet: Here's how you can help those affected by Hurricane Ian in Florida via/ @USATODAY #HurricaneIan https://t.co/jRNPo8JHFr https://t.co/M1iBwcpNWi -  likes: 0 - retweets: 0

page_id= doc_2108 - doc_score: 36.43523406982422 - original_tweet: Here's how you can help those affected by Hurricane Ian in Florida via/ @USATODAY #HurricaneIan https://t.co/nwZj6hUSQU https://t.co/5SUmNrY8TM -  likes: 0 - retweets: 0

page_id= doc_1993 - doc_score: 36.42539978027344 - original_tweet: Here's how you can help those affected by Hur

In [41]:
#QUERY 4:
print("flood damage:\n")
word2vec_query4 = input()
word2vec_ranking_docs4,word2vec_ranking_scores4 = word2vec_search(word2vec_query4,tweet2vec, model, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(word2vec_ranking_docs4)))
i=0
for d_id in word2vec_ranking_docs4[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} -  likes: {} - retweets: {}\n".format(d_id, word2vec_ranking_scores4[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

flood damage:

flood damage

Top 20 results out of 495 for the searched query:

page_id= doc_1672 - doc_score: 35.91739273071289 - original_tweet: Edgewater Florida Flooding. #HurricaneIan https://t.co/eK3lFQ0yLq -  likes: 4 - retweets: 0

page_id= doc_823 - doc_score: 35.866973876953125 - original_tweet: Storm surge flooding in Myrtle Beach. #HurricaneIan https://t.co/8Pp9qUl8Vz -  likes: 5 - retweets: 2

page_id= doc_3503 - doc_score: 34.873817443847656 - original_tweet: . #HurricaneIan is making its way into the Carolinas as Florida works to survey the damage. 

https://t.co/m3ABKeOxBG -  likes: 0 - retweets: 0

page_id= doc_2782 - doc_score: 34.27949905395508 - original_tweet: Streets are starting to flood. #HurricaneIan #Charleston https://t.co/GgIvRbdlpQ -  likes: 6 - retweets: 0

page_id= doc_2901 - doc_score: 33.96710968017578 - original_tweet: 9/30/2022  #HurricaneIan Florida Rain, Flood Reports
https://t.co/fGv2hNZv1p https://t.co/vn0EcsA4oy -  likes: 0 - retweets: 0

page_id

In [42]:
#QUERY 5:
print("storm in florida:\n")
word2vec_query5 = input()
word2vec_ranking_docs5,word2vec_ranking_scores5 = word2vec_search(word2vec_query5,tweet2vec, model, index)
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(word2vec_ranking_docs5)))
i=0
for d_id in word2vec_ranking_docs5[:top]:
    print("page_id= {} - doc_score: {} - original_tweet: {} -  likes: {} - retweets: {}\n".format(d_id, word2vec_ranking_scores5[i] ,title_index[d_id], likes[d_id], retweets[d_id]))
    i = i+1

storm in florida:

storm in florida

Top 20 results out of 1163 for the searched query:

page_id= doc_470 - doc_score: 46.780784606933594 - original_tweet: Thank God. üò¢üôèüèæ #HurricaneIan 
#Florida -  likes: 1 - retweets: 0

page_id= doc_3848 - doc_score: 45.888851165771484 - original_tweet: Be careful #Florida! 
#hurricaneian https://t.co/ghPKhAcjbN -  likes: 0 - retweets: 0

page_id= doc_3665 - doc_score: 45.72542953491211 - original_tweet: #HurricaneIan #Florida 

We see. 
We vote. https://t.co/lt5PFrmNkb -  likes: 2 - retweets: 1

page_id= doc_2877 - doc_score: 45.32656478881836 - original_tweet: Hurricane Ian From Cuba through Florida to South Carolina.
#HurricaneIan https://t.co/7TypUjrtlW -  likes: 0 - retweets: 0

page_id= doc_70 - doc_score: 44.28199005126953 - original_tweet: LOVE ‚ù§Ô∏è our #Florida @GovRonDeSantis ...

#HurricaneIan https://t.co/mmzGE6IZnY -  likes: 1 - retweets: 0

page_id= doc_111 - doc_score: 42.995845794677734 - original_tweet: Florida‚Äôs Long Ro