# Information Retrieval and Web Analytics

# PROJECT PART 1


Group Members:

*   Berta Alòs (228709)
*   Maria Cerezo (183213)
*   Paula Vilà (231630)












Load packages

In [1]:
from collections import defaultdict
from array import array
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
import json 
from numpy import linalg as la
import pandas as pd 
import re
from operator import itemgetter
import unicodedata

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data preparation and Text processing
The dataset is stored in a JSON file (`tw_hurricane_data.json`) with one tweet per line. It contains 4000 tweets about Hurricane Ian.

In [2]:
# Load the tweets into a pandas dataframe; lines=True makes pandas parse the
# file as one JSON object per line.
tw_data = pd.read_json('tw_hurricane_data.json', lines=True)

# Load the tweet/document id map into a dataframe with a single column.
# NOTE(review): the rows displayed at the end of this section look like
# "doc_1\t<tweet id>", which suggests the file is tab-separated and that
# sep='::' never splits anything -- confirm before relying on this column.
map_data = pd.read_csv(
    'tweet_document_ids_map.csv',
    names=['documents IDs'],
    sep='::',
    engine='python',
    encoding='latin-1',
)

In [3]:
#creating a new dataframe with the desired columns extracted from tw_data dataframe
def _first_media_url(entities):
    """Return the url of the first media item of a tweet, or {} if it has none.

    Argument:
    entities -- the 'entities' dictionary of one tweet

    Returns:
    the 'url' value of entities['media'][0]; {} when 'media' is missing or empty
    """
    media = entities.get('media')
    # {} stays as the "no media" placeholder so the column holds the same
    # values as before; the truthiness check also covers an empty media list.
    return media[0]['url'] if media else {}


# Every column is computed for the whole dataframe at once. The previous
# row-by-row `tw_fields['url'][i] = ...` was a chained assignment: it raises
# SettingWithCopyWarning and writes nothing when pandas Copy-on-Write is on.
tw_fields = pd.DataFrame({
    # hashtag information obtained from the entities column
    'hashtags': tw_data['entities'].apply(lambda x: x.get('hashtags')),
    # username information obtained from the user column
    'name': tw_data['user'].apply(lambda x: x.get('name')),
    # columns copied as they are from tw_data
    'full_text': tw_data['full_text'],
    'created_at': tw_data['created_at'],
    'favorite_count': tw_data['favorite_count'],
    'retweet_count': tw_data['retweet_count'],
    # url information obtained from the entities column
    'url': tw_data['entities'].apply(_first_media_url),
})

In [4]:
#renaming the extracted columns to their final names and putting them in display order
tw_fields = (
    tw_fields
    .rename(columns={
        'full_text': 'Tweet',
        'name': 'Username',
        'created_at': 'Date',
        'hashtags': 'Hashtags',
        'favorite_count': 'Likes',
        'retweet_count': 'Retweets',
        'url': 'Url',
    })
    [['Tweet', 'Username', 'Date', 'Hashtags', 'Likes', 'Retweets', 'Url']]
)

In [5]:
#Function to preprocess data
def build_terms(line):
    """
    Preprocess the text and return its tokens.

    Steps, in this order: remove URLs, remove emojis, romanize the text to
    ASCII, replace everything that is not a letter or a digit with a space,
    transform to lowercase, tokenize on whitespace, remove English stop words
    and stem each remaining token.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    # BONUS: Removing the URLs from the line. This has to happen before the
    # punctuation is stripped: afterwards "https://t.co/abc" is already split
    # into "https", "t", "co", "abc" and can no longer be recognised as a URL.
    line = re.sub(r'https?://\S+', ' ', line)
    # BONUS: removing the emojis from the line (before the ASCII folding, which
    # would otherwise already have dropped them).
    line = emojis_out(line)
    # BONUS: Romanizing the text. decode() is used instead of str(bytes)[2:-1]
    # because the repr of a bytes object spells a newline or tab as the two
    # characters "\n" / "\t", which left a stray "n" or "t" glued to the next word.
    line = unicodedata.normalize('NFKD', line).encode('ASCII', 'ignore').decode('ASCII')
    line = re.sub(r'[\W_]+', ' ', line)  # BONUS: Removing anything that is not a letter or digit
    line = line.lower()
    line = line.split()  # Tokenize the text to get a list of terms
    line = [x for x in line if x not in stop_words]  # eliminate the stopwords
    line = [stemmer.stem(word) for word in line]  # perform stemming
    return line

In [6]:
#Function to get rid of emojis
def emojis_out(s):
    """Remove the emoji characters from a string.
    
    Arguments:
    s -- string (word or text) to be processed
    
    Returns: 
    word -- copy of "s" without its emoji characters; it is "" when "s" is
            made only of emojis and equal to "s" when it contains none.
    """
    # The original pattern only covered U+1F300-U+1F5FF, so emoticons such as
    # U+1F600, vehicles, flags, etc. were left untouched.
    emoji_pattern = re.compile(
        "["
        "\U0001F1E6-\U0001F1FF"  # regional indicators (flags)
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "\u2600-\u26FF"          # miscellaneous symbols
        "\u2700-\u27BF"          # dingbats
        "\uFE0F"                 # variation selector that follows some emojis
        "]+", flags=re.UNICODE)

    word = emoji_pattern.sub(r'', s)
    return word

In [None]:
#preprocessing the Tweet and Username information
# Whole-column .apply replaces the row-by-row `tw_fields[col][i] = ...`
# chained assignment, which raises SettingWithCopyWarning and writes nothing
# when pandas Copy-on-Write is enabled.
# Run this cell once per kernel: 'Tweet' holds lists of tokens afterwards, and
# build_terms expects a string.
tw_fields['Tweet'] = tw_fields['Tweet'].apply(build_terms)
tw_fields['Username'] = tw_fields['Username'].apply(emojis_out)

In [8]:
#joining map_data and tw_fields row by row (inner join on the index of both
#dataframes) into a single dataframe named final_dataset
# NOTE(review): rows are paired by index position, not by tweet id -- this
# presumably relies on both files listing the tweets in the same order; verify.
final_dataset = map_data.merge(tw_fields, how='inner', left_index=True, right_index=True)
final_dataset.head(3)  # preview of the first 3 rows

Unnamed: 0,documents IDs,Tweet,Username,Date,Hashtags,Likes,Retweets,Url
0,doc_1\t1575918182698979328,"[keep, spin, us, 7, pm, go, away, alreadi, hur...",Suz,2022-09-30 18:39:08+00:00,"[{'text': 'HurricaneIan', 'indices': [63, 76]}]",0,0,https://t.co/VROTxNS9rz
1,doc_2\t1575918151862304768,"[heart, go, affect, hurricaneian, wish, everyo...",Lytx,2022-09-30 18:39:01+00:00,"[{'text': 'HurricaneIan', 'indices': [43, 56]}]",0,0,{}
2,doc_3\t1575918140839673873,"[kissimme, neighborhood, michigan, ave, n, hur...",Christopher Heath,2022-09-30 18:38:58+00:00,"[{'text': 'HurricaneIan', 'indices': [45, 58]}]",0,0,https://t.co/jf7zseg0Fe
