# Main Notebook
This notebook aims to analyze and visualize combined data as well as implement additional features from the project requirements.  
`usstations` comes from the US Linear Relative Sea Level Trends dataset.  
`storms` comes from the Natural Disasters dataset and contains the US storm records.  
`beach` contains the beach project data.  

In [None]:
import pandas as pd 
import numpy as np 
import plotly.express as px
import seaborn as sns
import os

import string
import nltk

# Directory that receives the NLTK resources used by the text-summary feature.
# NOTE(review): this is an absolute path at the filesystem root — presumably
# meant for a container image; confirm the kernel has write access to it.
NLTK_DATA_DIR = '/nltk_data'

# nltk.download() only writes the files; lookups by stopwords / word_tokenize
# go through nltk.data.path, so register the custom directory there as well.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# NOTE(review): newer NLTK releases are understood to load word_tokenize's
# tables from 'punkt_tab' instead of 'punkt' — confirm against the installed
# version; fetching both keeps either release working.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, download_dir=NLTK_DATA_DIR)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter

# Read the three source workbooks (paths are relative to the notebook's
# working directory) and bind them in order: stations, beach projects, storms.
usstations, beach, storms = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        'data/usstations.xlsx',
        'data/beach_data.xlsx',
        'data/cleanusstorms.xlsx',
    )
)


Group each dataset by its `Region` column, count the rows per region, and show the ten regions with the most rows.

In [None]:
# Number of usstations rows per Region, largest first, limited to the top ten.
stationsgroup = (
    usstations
    .groupby('Region')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
    .head(10)
)
print(stationsgroup)


In [None]:
# Number of beach rows per Region, largest first, limited to the top ten.
beachgroup = (
    beach
    .groupby('Region')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
    .head(10)
)
print(beachgroup)

# No projects are listed for Hawaii.

In [None]:
# Number of storms rows per Region, largest first, limited to the top ten.
stormsgroup = (
    storms
    .groupby('Region')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
    .head(10)
)
print(stormsgroup)

In [None]:
# Outer-join the three datasets on 'State' so that a state present in any one
# of them is kept: stations + beach first, then storms on top of that result.
# NOTE(review): 'State' appears to repeat within the source tables (they are
# grouped by State elsewhere in this notebook), so each join can multiply
# rows — confirm that the resulting row count is what is intended.
merge_df = usstations.merge(beach, how='outer', on='State')
df = merge_df.merge(storms, how='outer', on='State')

# Quick look at the result: first five rows and the full column list.
print(df.head(5))
print(df.columns)

# Persist the merged table (without the index) for the text-summary feature.
df.to_csv('data/merged_data.csv', index=False)

In [None]:
# Print a structural summary of the merged frame `df` (columns, non-null
# counts, dtypes) to check what the outer merges produced.
df.info()

In [None]:
# Per-state mean of the 'MSL Trends mm per yr' column from the stations data.
avg_msl_trends = usstations.groupby('State')['MSL Trends mm per yr'].agg('mean')

# Per-state total of the 'Volume (CY)' column from the beach data.
total_nourishment_volume = beach.groupby('State')['Volume (CY)'].agg('sum')

# Align both series on their shared State index; a state missing from one
# source gets NaN in that column. reset_index() turns State into a column.
state_summary_columns = {
    'Avg MSL Trend (mm/yr)': avg_msl_trends,
    'Total Nourishment Volume (CY)': total_nourishment_volume,
}
sealevel_Beachvolume = pd.DataFrame(state_summary_columns).reset_index()
print(sealevel_Beachvolume)


In [None]:
# Keep only the states that have both a sea-level trend and a nourishment
# volume; rows missing either value cannot be placed on the scatter plot.
# (Alternative that was considered: keep every state and treat a missing
# volume as 0 by calling fillna(0) on 'Total Nourishment Volume (CY)'.)
sealevel_Beachvolume = sealevel_Beachvolume.dropna(
    subset=['Avg MSL Trend (mm/yr)', 'Total Nourishment Volume (CY)']
)

# Bubble chart: x = average trend, y and bubble size = total volume,
# with the state name shown on hover.
# NOTE(review): no `text` column is passed to px.scatter, so the
# textposition setting presumably has no visible effect — confirm.
fig = (
    px.scatter(
        sealevel_Beachvolume,
        x='Avg MSL Trend (mm/yr)',
        y='Total Nourishment Volume (CY)',
        size='Total Nourishment Volume (CY)',
        hover_name='State',
        title='Relationship between MSL Trends and Beach Nourishment Volume by State',
    )
    .update_traces(textposition='top center')
    .update_layout(
        xaxis_title='Avg MSL Trend (mm per year)',
        yaxis_title='Total Nourishment Volume (Cubic Yards)',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
    )
)

fig.show()


## Feature: Summarize text from separate documents.

In [None]:
# Re-load the merged dataset from data/merged_data.csv.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids the mixed-dtype warning on columns with mixed values.
merged_df = pd.read_csv('data/merged_data.csv', low_memory=False)

# The NLTK 'punkt' and 'stopwords' resources are expected to have been
# downloaded already by the imports cell at the top of the notebook.

# Cache for the stopword set so the corpus is read only once per kernel
# session instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise a piece of text for word-frequency analysis.

    Steps, in order: lowercase, strip every character in
    ``string.punctuation``, tokenize with ``word_tokenize``, and drop
    English stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean. Must be a string; callers convert other values
        with ``str()`` first.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (empty string when
        nothing is left).
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    # NOTE(review): because punctuation is stripped before the stopword
    # filter, contractions such as "don't" become "dont" and may no longer
    # match the stopword list — confirm whether that matters here.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Set membership is O(1); the original rebuilt and scanned the stopword
    # list for every single token.
    stop_words = _english_stopwords()
    # Tokenize text and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)  # Returning a string of processed tokens

# Decide once which columns hold text, using the frame that is actually being
# processed. (The earlier version inspected `df`, which only exists if the
# merge cell was run in the same kernel session.)
# NOTE(review): depending on the pandas version, is_string_dtype may reject
# object columns that contain missing values — confirm that the selected
# columns are the ones intended.
text_columns = [
    col for col in merged_df.columns
    if pd.api.types.is_string_dtype(merged_df[col])
]

# Build one cleaned text string per row from all text columns. Missing values
# are skipped; converting them with str() would inject the literal token
# 'nan' into the word counts.
merged_df['CombinedText'] = merged_df.apply(
    lambda row: ' '.join(
        preprocess_text(str(row[col]))
        for col in text_columns
        if pd.notna(row[col])
    ),
    axis=1,
)

# Tokenize the combined text for frequency analysis
all_tokens = word_tokenize(' '.join(merged_df['CombinedText'].tolist()))

# Calculate word frequencies
word_freq = Counter(all_tokens)

# Convert the ten most common words to a DataFrame
df_freq = pd.DataFrame(word_freq.most_common(10), columns=['Word', 'Frequency'])

print(df_freq)

# Exporting the frequencies to CSV
df_freq.to_csv('data/word_frequencies.csv', index=False)
