# **INFORMATION RETRIEVAL AND WEB ANALYTICS**

### PART 1: TEXT PROCESSING

1. Reading and Loading the dataset

In [1]:
# Mount Google Drive at /content/drive (Google Colab only); the data paths
# used further down point inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Download the NLTK resources needed by the pre-processing step:
# - 'stopwords': the stop-word lists read via stopwords.words("english")
# - 'wordnet': the data behind WordNetLemmatizer
# - 'omw-1.4': presumably required by the WordNet reader -- TODO confirm
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
from nltk.stem import WordNetLemmatizer
import numpy as np
import collections
import pandas as pd
import json
from numpy import linalg as la
import re

In [4]:
# Location of the project data inside the mounted Drive; change to match your setup.
data_path = "/content/drive/MyDrive/4to/IRWA/IRWA - Project-20221021/data"
docs_path = data_path + '/tw_hurricane_data.json'

# One document (a JSON-encoded tweet) per line. The encoding is given explicitly
# so the result does not depend on the platform default.
with open(docs_path, encoding='utf-8') as fp:
  lines = fp.readlines()

# Trim each line and collapse runs of spaces into a single space.
# str.replace(' +', ' ') treats ' +' as a literal string, not a regex: it never
# collapsed spaces and instead deleted every '+' that followed a space
# (e.g. the '+0000' offset in 'created_at'). re.sub applies the intended pattern.
lines = [re.sub(r' +', ' ', l.strip()) for l in lines]

In [5]:
# Report how many raw documents (one per line) were read from the corpus file.
corpus_size = len(lines)
print("Total number of docs in the corpus: {}".format(corpus_size))

Total number of docs in the corpus: 4000


2. Extract the requested information from each tweet

Id | Tweet | Username | Date | Hashtags | Likes | Retweets | Url

In [6]:
def getId(tweet):
  """Return the value stored under the ``id`` key of a parsed tweet."""
  tweet_id = tweet['id']
  return tweet_id

In [7]:
def text(tweet):
  """Return the value stored under the ``full_text`` key of a parsed tweet."""
  body = tweet['full_text']
  return body

In [8]:
def username(tweet):
  """Return the ``name`` field of the tweet's ``user`` object."""
  author = tweet['user']
  return author['name']

In [9]:
def date(tweet):
  """Return the value stored under the ``created_at`` key of a parsed tweet."""
  created = tweet['created_at']
  return created

In [10]:
def hashtags(tweet):
  """Return the ``text`` of every entry in ``tweet['entities']['hashtags']``, in order."""
  tags = []
  for entry in tweet['entities']['hashtags']:
    tags.append(entry['text'])
  return tags

In [11]:
def likes(tweet):
  """Return the tweet's ``favorite_count``, or 0 when that value is falsy (e.g. None)."""
  # `or` yields the count when it is truthy and falls back to 0 otherwise.
  return tweet['favorite_count'] or 0

In [12]:
def retweets(tweet):
  """Return the tweet's ``retweet_count``, or 0 when that value is falsy (e.g. None)."""
  # `or` yields the count when it is truthy and falls back to 0 otherwise.
  return tweet['retweet_count'] or 0

In [13]:
def url(tweet):
  """Build the tweet's status URL by appending its ``id`` to the Twitter status prefix."""
  prefix = 'https://twitter.com/_/status/'
  return prefix + str(tweet['id'])

In [14]:
def final_output(tweet):
  """
  Parse one JSON-encoded tweet and collect the fields of interest.

  Argument:
  tweet -- JSON string describing a single tweet

  Returns:
  a tuple (output, tweet_text) where output is a dict with the keys
  'Tweet', 'Id', 'Username', 'Date', 'Hashtags', 'Likes', 'Retweets' and 'Url',
  and tweet_text is str(output['Tweet'])
  """
  parsed = json.loads(tweet)
  output = {
      'Tweet': text(parsed),  # raw text, not yet tokenized/stemmed with build_terms()
      'Id': getId(parsed),
      'Username': username(parsed),
      'Date': date(parsed),
      'Hashtags': hashtags(parsed),
      'Likes': likes(parsed),
      'Retweets': retweets(parsed),
      'Url': url(parsed),
  }
  return output, str(output['Tweet'])

3. Pre-Processing the text

In [15]:
def _preprocessing_tools():
    """
    Return the (stemmer, lemmatizer, stop-word set) triple used by build_terms.

    The objects are created on the first call and cached on the function
    object, so the stop-word list is not reloaded for every tweet.
    """
    tools = getattr(_preprocessing_tools, "_cache", None)
    if tools is None:
        tools = (
            PorterStemmer(),
            WordNetLemmatizer(),
            frozenset(stopwords.words("english")),
        )
        _preprocessing_tools._cache = tools
    return tools


def build_terms(line):
    """
    Pre-process the text of a tweet and return its list of terms.

    Steps, in order: lowercase the text, remove URLs, remove @mentions,
    replace punctuation with spaces, split on whitespace, remove English
    stop words, stem each token (Porter) and lemmatize the result (WordNet).

    Argument:
    line -- string with the text of one tweet

    Returns:
    list of processed tokens, in the order they appear in the text
    """
    stemmer, lemmatizer, stop_words = _preprocessing_tools()

    line = line.lower()
    line = re.sub(r'http\S+', ' ', line)  # delete urls
    line = re.sub(r'@\S+', ' ', line)  # delete mentions (@user)
    # Replace every character that is neither a word character nor whitespace
    # with a space. This already removes '#' and backslashes, so no separate
    # hashtag-symbol or '\n' clean-up pass is needed afterwards.
    line = re.sub(r'[^\w\s]', ' ', line)

    # split() with no argument never yields empty strings, so the tokens need
    # no extra filtering for '' or ' '.
    tokens = [token for token in line.split() if token not in stop_words]

    # Stem first, then lemmatize the stem (same order as before, done per token).
    return [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]

In [16]:
# Parse every raw JSON line, pre-process its text and keep the resulting record.
# Note: the 'Tweet' field of each record ends up holding the list of processed
# terms instead of the original text.
processed_documents = []
for raw_line in lines:
  record, raw_text = final_output(raw_line)
  record['Tweet'] = build_terms(raw_text)
  processed_documents.append(record)

In [17]:
# Example of the final output after pre-processing: print the fields of one sample tweet
sample_doc = processed_documents[7]
for field in ('Tweet', 'Username', 'Date', 'Hashtags', 'Likes', 'Retweets', 'Url'):
  print(sample_doc[field])

['thought', 'student', 'teacher', 'parent', 'commun', 'suffer', 'wake', 'hurrican', 'ian', 'closeupdc', 'hurricaneian', 'florida', 'georgia', 'northcarolina', 'southcarolina']
Close Up Washington
Fri Sep 30 18:38:46 0000 2022
['CloseUpDC', 'HurricaneIan', 'Florida', 'Georgia', 'NorthCarolina', 'SouthCarolina']
0
0
https://twitter.com/_/status/1575918088473788429


In [18]:
import pandas as pd
# The id-mapping file sits in the same data folder as the tweets, so its path is
# derived from data_path: only data_path has to change when the data moves.
docs_path = data_path + '/tweet_document_ids_map.csv'
# Tab-separated file without a header row; the displayed output shows document
# ids (doc_N) in column 0 and tweet ids in column 1.
df = pd.read_csv(docs_path, sep='\t', header = None)
df

Unnamed: 0,0,1
0,doc_1,1575918182698979328
1,doc_2,1575918151862304768
2,doc_3,1575918140839673873
3,doc_4,1575918135009738752
4,doc_5,1575918119251419136
...,...,...
3995,doc_3996,1575856268022992896
3996,doc_3997,1575856245650919424
3997,doc_3998,1575856228886089728
3998,doc_3999,1575856226139017216


In [19]:
#checking tweet id with the document ids
print(processed_documents[0]['Id'])
print(processed_documents[1]['Id'])
print(processed_documents[2]['Id'])
print(processed_documents[4]['Id'])

1575918182698979328
1575918151862304768
1575918140839673873
1575918119251419136
