### Import libraries

In [17]:
import pandas as pd
import numpy as np
import openai
import ast


pd.set_option('display.max_rows', None)

### Read dataset_option

In [68]:
df = pd.read_json('data/meteo_news_updated.json')

### Data inspection

Check dataframe for missing values

In [70]:
nan_df = df[df.isna().any(axis=1)]
nan_df

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date
887,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 5 iulie. Țara se împarte între ...",,"[Vremea în țară, În Dobrogea şi Bărăgan, la am...",05-07-2022 07:10
942,https://stirileprotv.ro/stiri/vremea/vremea-as...,"Vremea astăzi, 11 iunie. Prognoza meteo pentru...",,"[\nSursa: , \n, \nEtichete:\n, ,\n, ,\n, ,\n, ...",11-06-2022 07:19
1291,https://stirileprotv.ro/stiri/vremea/vremea-12...,"Vremea, 12 Octombrie. Vremea se menține deoseb...",,"[ , Vreme ceva mai bună găsim doar în vest, î...",12-10-2021 07:24
2400,https://vacantalamunte.stirileprotv.ro/stiri/v...,,,[],
2435,https://vacantalamunte.stirileprotv.ro/stiri/n...,,,[],
3367,https://vacantalamare.stirileprotv.ro/stiri/ma...,,,[],
4149,https://vacantalamunte.stirileprotv.ro/stiri/s...,,,[],
6009,https://stirileprotv.ro/stiri/meteo/afla-cum-e...,Afla cum e vremea in Romania din ora in ora,,"[Sursa: ANM, \nSursa: , \n, \nEtichete:\n, ,\n...",26-10-2011 20:02
6906,https://stirileprotv.ro/stiri/meteo/vom-avea-t...,Vom avea temperaturi de peste 33 de grade in s...,,"[\nMasa de aer cald, tropical, va domina jumat...",23-05-2009 16:33


In [71]:
# Check shape
nan_df.shape

(9, 5)

In [72]:
# After reviewing those articles, we have determined that the URL is broken, the article has been deleted, and so on. 
# Therefore, we have decided to remove them, as they constitute only a small percentage of our dataframe.

Drop the NaN's

In [73]:
df.dropna(inplace=True)

Convert "article_date" to datetime datatype with pandas

In [74]:
df['article_date'] = pd.to_datetime(df['article_date'], format='%d-%m-%Y %H:%M')

# Sort values by article_date
df = df.sort_values('article_date').reset_index(drop=True)

Check dataframe shape

In [75]:
df.shape

(7088, 5)

In [76]:
# Our dataset is pretty consistent in order to build a POC or to prove our hypothesis

Check dtypes for each column in our dataframe

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7088 entries, 0 to 7087
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   article_url    7088 non-null   object        
 1   article_title  7088 non-null   object        
 2   article_lead   7088 non-null   object        
 3   article_text   7088 non-null   object        
 4   article_date   7088 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 277.0+ KB


Remove useless elements from article_text

In [78]:
def remove_sursa(text_list):
    for i in range(len(text_list)):
        if text_list[i].startswith('\nSursa:'):
            return text_list[:i]
    return text_list

df['article_text'] = df['article_text'].apply(remove_sursa)

Clean text

In [79]:
def clean_text_list(text_list):
    cleaned_list = [text.replace('\xa0', '').strip() for text in text_list if text.strip() != '']
    return cleaned_list

df['article_text'] = df['article_text'].apply(clean_text_list)

# Join sentences
df['article_text'] = df['article_text'].apply(lambda sentences: ' '.join(sentences))

Remove diacritics from our dataframe

In [80]:
def remove_diacritics(text):
    return unidecode(text)

df = df.applymap(lambda x: remove_diacritics(str(x)))

Read the dataframe containing geo locations

In [81]:
# Read the dataframe containing the counties and the regions
df_romania = pd.read_excel('data/romania_geo.xlsx')
counties = df_romania['Judet'].to_list()
regions = df_romania['Regiune'].dropna().to_list()

Lemmatize article_text

In [82]:
# Function to lemmatize a text
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Lemmatize the 'article_text' column
df['article_text_lemmatized'] = df['article_text'].apply(lemmatize_text)

Stem article_text

In [83]:
# Function to stem text
def stem_text(text):
    return " ".join(stemmer.stem(word) for word in text.split())

# Stem the 'article_text' column
df['article_text_stemmed'] = df['article_text'].apply(stem_text)

Lowercase the words again

In [84]:
# df_preprocessed = df_preprocessed.applymap(lambda x: remove_diacritics(str(x)))

Create a copy of the original dataframe

In [85]:
df_preprocessed = df.copy()

Extract from lemmatized text

In [86]:
lemmatized_regions = [lemmatize_text(region) for region in regions]
lemmatized_counties = [lemmatize_text(county) for county in counties]

location_mapping_counties_lemma = dict(zip([lemma.lower() for lemma in lemmatized_counties], counties))
location_mapping_regions_lemma = dict(zip([lemma.lower() for lemma in lemmatized_regions], regions))

def extract_lemmatized_location(text, lemmatized_locations, location_mapping):
    words = text.lower().split()
    detected = [location_mapping[word] for word in words if word in lemmatized_locations]
    return list(set(detected))

df_preprocessed['article_text_county_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_lemmatized_location, lemmatized_locations=[lemma.lower() for lemma in lemmatized_counties], location_mapping=location_mapping_counties_lemma)
df_preprocessed['article_text_region_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_lemmatized_location, lemmatized_locations=[lemma.lower() for lemma in lemmatized_regions], location_mapping=location_mapping_regions_lemma)

Extract from stemmed text

In [87]:
stemmed_regions = [stem_text(region) for region in regions]
stemmed_counties = [stem_text(county) for county in counties]

location_mapping_counties = dict(zip([stem.lower() for stem in stemmed_counties], counties))
location_mapping_regions = dict(zip([stem.lower() for stem in stemmed_regions], regions))

def extract_stemmed_locations(text, stemmed_locations, location_mapping):
    words = text.lower().split()
    detected = [location_mapping[word] for word in words if word in stemmed_locations]
    return list(set(detected))

df_preprocessed['article_text_county_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=[stem.lower() for stem in stemmed_counties], location_mapping=location_mapping_counties)
df_preprocessed['article_text_region_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=[stem.lower() for stem in stemmed_regions], location_mapping=location_mapping_regions)

Read the dataframe containing extreme phenomena

In [88]:
df_phenomena = pd.read_excel('data/extreme_phenomena.xlsx')
extreme_phenomena = df_phenomena['Fenomene'].to_list()

Extract from lemma and stem extreme phenomena

In [89]:
lemmatized_phenomena = [lemmatize_text(phenomena) for phenomena in extreme_phenomena]
phenomena_mapping_lemma = dict(zip([lemma.lower() for lemma in lemmatized_phenomena], extreme_phenomena))

stemmed_phenomena = [stem_text(phenomena) for phenomena in extreme_phenomena]
phenomena_mapping_stem = dict(zip([stem.lower() for stem in stemmed_phenomena], extreme_phenomena))

df_preprocessed['article_text_phenomen_lemma'] = df_preprocessed['article_text_lemmatized'].apply(extract_lemmatized_location, lemmatized_locations=[lemma.lower() for lemma in lemmatized_phenomena], location_mapping=phenomena_mapping_lemma)
df_preprocessed['article_text_phenomen_stem'] = df_preprocessed['article_text_stemmed'].apply(extract_stemmed_locations, stemmed_locations=[stem.lower() for stem in stemmed_phenomena], location_mapping=phenomena_mapping_stem)

In [90]:
df_preprocessed.tail(5)

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date,article_text_lemmatized,article_text_stemmed,article_text_county_lemma,article_text_region_lemma,article_text_county_stem,article_text_region_stem,article_text_phenomen_lemma,article_text_phenomen_stem
7083,https://stirileprotv.ro/stiri/vremea/furtuni-u...,"Ploi torentiale, vijelii si grindina in vest, ...","Meteorologii anunta, ca, pana joi dimineata, e...",In anumite zone din tara vantul va sufla cu pu...,2023-08-16 10:47:00,in anumit zonă din tară vant vrea sufla cu put...,in anum zon din tar vant va sufl cu putere. de...,[],[Dobrogea],[],[Dobrogea],"[Grindină, Caniculă]","[Vijelie, Caniculă]"
7084,https://stirileprotv.ro/stiri/vremea/alerta-de...,Avertisment de canicula in mai multe regiuni a...,"Sudul si sud-estul tarii se va afla, de joi pa...","Astfel, in perioada 17 - 20 august 2023, va fi...",2023-08-17 08:22:00,"astfel , in perioadă 17 - 20 august 2023 , vre...","astfel, in perioad 17 - 20 august 2023, va fi ...",[],[],[],[],[Caniculă],[Caniculă]
7085,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 17 august. Canicula in sud si in s...","Vremea se incalzeste, iar amiaza aduce canicul...","Aversele apar, local, la deal, la munte, de as...",2023-08-17 08:48:00,"avers apărea , local , la deal , la munte , de...","aver apar, local, la deal, la munte, de asemen...",[Vaslui],"[Moldova, Bucovina, Transilvania, Oltenia, Dob...",[],"[Moldova, Bucovina, Transilvania, Oltenia, Dob...","[Fulger, Caniculă, Grindină]",[Caniculă]
7086,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 18 august. Disconfortul termic est...","Ziua de vineri ne aduce vreme foarte calda, ca...",Maximele pornesc de la 28 de grade pe litoral ...,2023-08-18 10:44:00,Maximele porni de la 28 de grad pe litoral si ...,maxim porn de la 28 de grad pe litoral si ajun...,[],"[Muntenia, Moldova, Bucovina, Transilvania, Ol...",[],"[Muntenia, Moldova, Bucovina, Transilvania, Ol...","[Fulger, Grindină, Caniculă]","[Grindină, Caniculă]"
7087,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 19 august. Disconfort termic ridic...","Ziua de astazi ne-aduce vreme calda, disconfor...",Maximele pleaca de la 28 de grade in sudul lit...,2023-08-19 11:09:00,Maximele pleacae de la 28 de grad in sud litor...,maxim pleac de la 28 de grad in sud litoral si...,[],"[Muntenia, Moldova, Bucovina, Transilvania, Do...",[],"[Moldova, Dobrogea, Transilvania, Muntenia]","[Fulger, Grindină, Caniculă]",[Grindină]


### Export dataframe

In [91]:
df_preprocessed.to_excel('data/data_preprocessed.xlsx', index=False)

In [146]:
df_viteza = pd.read_excel('data/vant.xlsx')

In [147]:
df_viteza


Unnamed: 0,Data,Viteza1,Viteza2,Viteza3,Viteza4,Viteza5,Viteza6,Viteza7,Viteza8
0,2009-01-01 01:00:00,0.5,*,-,2.9,1.8,1.7,2,*
1,2009-01-01 02:00:00,0.6,*,-,1.9,1.5,1.3,1.9,*
2,2009-01-01 03:00:00,0.4,*,-,2,1.2,1.7,1.9,*
3,2009-01-01 04:00:00,0.3,*,-,1.8,1.1,2,1.7,*
4,2009-01-01 05:00:00,0.3,*,-,2,0.6,1.8,1.3,*
...,...,...,...,...,...,...,...,...,...
8730,2009-12-30 20:00:00,0.1,*,-,0,0.4,0.7,0,*
8731,2009-12-30 21:00:00,0.3,*,-,0,0.6,0.7,0,*
8732,2009-12-30 22:00:00,0.6,*,-,0,0.7,0.5,0,*
8733,2009-12-30 23:00:00,0.7,*,-,0,0.7,0.4,0,*


In [148]:
# Convert 'Data' column to datetime format
df_viteza['Data'] = pd.to_datetime(df_viteza['Data'])

# Replace '-' with NaN
df_viteza.replace('-', np.nan, inplace=True)

# Convert 'Viteza' columns to numeric
viteza_columns = ['Viteza1', 'Viteza2', 'Viteza3', 'Viteza4', 'Viteza5', 'Viteza6', 'Viteza7', 'Viteza8']
df_viteza[viteza_columns] = df_viteza[viteza_columns].apply(pd.to_numeric, errors='coerce')

# Group by date and calculate daily average wind speed
daily_average = df_viteza.groupby(df_viteza['Data'].dt.date)[viteza_columns].mean()


In [149]:
daily_average.to_excel('data/ani_vant/2009.xlsx')

#### Use OpenAI API in order to map the region with the associated event

In [2]:
df_news = pd.read_excel('data/data_preprocessed.xlsx')

In [3]:
df_news

Unnamed: 0,article_url,article_title,article_lead,article_text,article_date
0,https://stirileprotv.ro/stiri/social/vezi-cum-...,Vezi cum va fi vremea pe continent si in tara ...,Temperaturi scazute in tara! - 10 grade Celsiu...,In Europa precipitatiile vor fi consistente in...,2009-02-19 17:24:00
1,https://stirileprotv.ro/stiri/social/vezi-cum-...,Vezi cum va fi vremea in urmatoarele trei zile!,Ziua de astazi va aduce inca un pic de ninsoar...,"Maine se va insenina in nord-vest, in schimb a...",2009-02-19 17:24:00
2,https://stirileprotv.ro/stiri/social/vezi-aici...,Vezi aici cum va fi vremea in primele trei zil...,"Incepem saptamana cu vreme rece, chiar geroasa...","In Europa, presiunea atmosferica se va mentine...",2009-02-19 17:25:00
3,https://stirileprotv.ro/stiri/social/vezi-aici...,Vezi aici cum va fi vremea pe continent si in ...,Pe continent precipitatiile vor fi indeosebi s...,"Masa de aer rece, de origine polara se va depl...",2009-02-19 17:25:00
4,https://stirileprotv.ro/stiri/social/meteo-afl...,Meteo: afla cum va fi vremea in urmatoarele zi...,Cine a pariat pe primavara s-a cam grabit. Vre...,In Europa precipitatiile se concentreaza pe zo...,2009-02-19 17:27:00
...,...,...,...,...,...
7101,https://stirileprotv.ro/stiri/vremea/furtuni-u...,"Ploi torentiale, vijelii si grindina in vest, ...","Meteorologii anunta, ca, pana joi dimineata, e...",In anumite zone din tara vantul va sufla cu pu...,2023-08-16 10:47:00
7102,https://stirileprotv.ro/stiri/vremea/alerta-de...,Avertisment de canicula in mai multe regiuni a...,"Sudul si sud-estul tarii se va afla, de joi pa...","Astfel, in perioada 17 - 20 august 2023, va fi...",2023-08-17 08:22:00
7103,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 17 august. Canicula in sud si in s...","Vremea se incalzeste, iar amiaza aduce canicul...","Aversele apar, local, la deal, la munte, de as...",2023-08-17 08:48:00
7104,https://stirileprotv.ro/stiri/vremea/vremea-az...,"Vremea azi, 18 august. Disconfortul termic est...","Ziua de vineri ne aduce vreme foarte calda, ca...",Maximele pornesc de la 28 de grade pe litoral ...,2023-08-18 10:44:00


In [48]:
for day, group in grouped_by_day:
    filename = f'group_{day}.csv'
    group.to_csv(filename, index=False)

In [49]:
combined_events_per_day = {}
for day, group in grouped_by_day:
    combined_events = {}  # To store the combined events for this day
    for index, row in group.iterrows():
        weather_events_str = row['weather_event']
        weather_events = ast.literal_eval(weather_events_str)
        for region, events in weather_events.items():
            if region not in combined_events:
                combined_events[region] = []
            combined_events[region].extend(events)
    
    combined_events_per_day[day] = combined_events

In [50]:
combined_events_per_day

{datetime.date(2009, 2, 22): {'Transilvania': ['Ninsori', 'Fulguieli'],
  'Oltenia': ['Ninsori slabe'],
  'Bucuresti': ['Vreme rece', 'Innorari temporare'],
  'Muntenia': ['Innorari temporare', 'Fulguieli izolate']},
 datetime.date(2009, 2, 23): {'Dobrogea': ['Vreme insorita',
   'Incalzire',
   'Ploi',
   'Lapovita'],
  'Moldova': ['Moment insorit', 'Incalzire', 'Fulguieli'],
  'Transilvania': ['Vreme rece',
   'Fulgui',
   'Ninsori',
   'Fulguieli',
   'Ninsori',
   'Fulguieli'],
  'Maramures': ['Vreme rece', 'Fulgui', 'Ninsori', 'Fulguieli'],
  'Vest': ['Fulgui'],
  'Centru': ['Ceata', 'Frig'],
  'Sud-vest': ['Ceata', 'Soare', 'Fulgui'],
  'Sud': ['Vreme rece', 'Fulgui'],
  'Bucuresti': ['Vreme rece',
   'Innorari temporare',
   'Ceata',
   'Innorari sporadice'],
  'Munte': ['Ninsori rare', 'Soare', 'Ceata'],
  'Muntenia': ['Fulguieli', 'Ceata'],
  'Bucovina': ['Fulguieli', 'Fulguieli'],
  'Banat': ['Fulguieli', 'Ninsori', 'Vant mai nervos'],
  'Delta': ['Vant mai nervos'],
  'Olten