# Load data

In [5]:
import pandas as pd

In [6]:
from pathlib import Path

def load_data(path: str) -> pd.DataFrame:
    data = pd.read_csv(Path(path))
    return data

In [7]:
data = load_data("../data/data.csv")

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


# Preprocessing

In [9]:
from warnings import filterwarnings
filterwarnings("ignore")

In [10]:
from sentence_transformers import SentenceTransformer

# intfloat/e5-small-v2 or intfloat/e5-base-v2 or intfloat/e5-large-v2
model = SentenceTransformer("intfloat/e5-small-v2")

In [11]:
embeddings = model.encode(data['text'], normalize_embeddings=True, show_progress_bar=True)

Batches:   0%|          | 0/238 [00:00<?, ?it/s]

In [12]:
nb_comp = len(embeddings[0])
nb_data = len(embeddings)
print(f"embedding size : {nb_comp}")

embedding size : 384


In [13]:
for num_comp in range(nb_comp):
    data['x_' + str(num_comp)] = [embeddings[k][num_comp] for k in range(nb_data)]

Created a column for every data of embedding created, 384 in this case

In [24]:
data.head(5)

Unnamed: 0,id,keyword,location,text,target,x_0,x_1,x_2,x_3,x_4,...,x_374,x_375,x_376,x_377,x_378,x_379,x_380,x_381,x_382,x_383
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,-0.085014,0.037619,0.043141,-0.013582,-0.039502,...,-0.015111,-0.091739,-0.006207,-0.06588,0.001196,0.028524,0.04149,0.022493,-0.052284,0.005658
1,4,,,Forest fire near La Ronge Sask. Canada,1,-0.007675,0.052327,0.060509,0.003969,0.033272,...,-0.004133,-0.068609,-0.015161,-0.026652,-0.062071,-0.040363,-0.057321,0.003138,-0.054532,0.047691
2,5,,,All residents asked to 'shelter in place' are ...,1,-0.032239,0.059961,0.045192,-0.004393,-0.003297,...,-0.055649,-0.042831,0.021676,-0.023298,-0.012054,0.014243,-0.049301,0.052759,-0.047772,0.034149
3,6,,,"13,000 people receive #wildfires evacuation or...",1,-0.045641,0.0171,0.038876,-0.006642,-0.008925,...,0.007952,-0.084324,0.016554,-0.052065,-0.057295,0.017911,-0.006158,-0.018141,-0.032954,0.009808
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,-0.03552,0.02388,0.044553,0.024645,-5.9e-05,...,-0.024434,-0.097317,0.00667,-0.035037,-0.04308,-0.027862,0.004018,-0.050536,-0.073364,0.069326


In [15]:
data_keywords = data['keyword']

In [16]:
data_keywords.size

7613

In [25]:
data_keywords.value_counts(normalize=True)


keyword
fatalities               0.005959
deluge                   0.005561
armageddon               0.005561
sinking                  0.005429
damage                   0.005429
                           ...   
forest%20fire            0.002516
epicentre                0.001589
threat                   0.001457
inundation               0.001324
radiation%20emergency    0.001192
Name: proportion, Length: 221, dtype: float64

In [18]:
columns = ['keyword']
data_dummy = pd.get_dummies(data, columns=columns, drop_first=True)
data_dummy.replace({False: 0, True: 1}, inplace=True)

In [26]:
data_dummy.head(5)

Unnamed: 0,id,location,text,target,x_0,x_1,x_2,x_3,x_4,x_5,...,keyword_weapons,keyword_whirlwind,keyword_wild%20fires,keyword_wildfire,keyword_windstorm,keyword_wounded,keyword_wounds,keyword_wreck,keyword_wreckage,keyword_wrecked
0,1,,Our Deeds are the Reason of this #earthquake M...,1,-0.085014,0.037619,0.043141,-0.013582,-0.039502,0.03018,...,0,0,0,0,0,0,0,0,0,0
1,4,,Forest fire near La Ronge Sask. Canada,1,-0.007675,0.052327,0.060509,0.003969,0.033272,0.049459,...,0,0,0,0,0,0,0,0,0,0
2,5,,All residents asked to 'shelter in place' are ...,1,-0.032239,0.059961,0.045192,-0.004393,-0.003297,0.037312,...,0,0,0,0,0,0,0,0,0,0
3,6,,"13,000 people receive #wildfires evacuation or...",1,-0.045641,0.0171,0.038876,-0.006642,-0.008925,0.028679,...,0,0,0,0,0,0,0,0,0,0
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,-0.03552,0.02388,0.044553,0.024645,-5.9e-05,0.047627,...,0,0,0,0,0,0,0,0,0,0


In [20]:
data_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 608 entries, id to keyword_wrecked
dtypes: float32(384), int64(222), object(2)
memory usage: 24.2+ MB


In [21]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="my_app")
data_dummy_wNan = data_dummy.dropna()

def get_coordinates(location):
    try:
        location = geolocator.geocode(location)
        return location.latitude, location.longitude
    except:
        return None

data_tmp = data_dummy_wNan.head(10)
for location in data_tmp['location']:
    print(location, get_coordinates(location))
# data_tmp['coordinates'] = data1['location'].apply(get_coordinates)


Birmingham (52.4796992, -1.9026911)
Est. September 2012 - Bristol None
AFRICA (11.5024338, 17.7578122)
Philadelphia, PA (39.9527237, -75.1635262)
London, UK (51.5074456, -0.1277653)
Pretoria (-25.7459277, 28.1879101)
World Wide!! (28.3376729, -81.55752250357412)
Paranaque City (14.4573274, 121.03323686843908)
Live On Webcam None
milky way (-7.1280388, 52.7448393)


In [22]:
# data_dummy_wNan.apply(lambda x: get_coordinates(x['location']), axis=1)

In [23]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# data_dummy_wNan = data_dummy.dropna()
# corr = data_dummy_wNan.corr()
# sns.heatmap(corr, annot=False, cmap='coolwarm', linewidths=0.5)
