In [4]:
import pandas as pd
import numpy as np 
import seaborn as sns
# for stats tests
import scipy.stats as st
# for regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
# to find influential data points
from statsmodels.stats.outliers_influence import OLSInfluence
# for diagnostic tests
import statsmodels.stats.diagnostic as di
import statsmodels.stats.stattools as stt
# for general plotting
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# for the linear regression model and splitting data
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn. discriminant_analysis import \
( LinearDiscriminantAnalysis as LDA ,
QuadraticDiscriminantAnalysis as QDA)
from sklearn. naive_bayes import GaussianNB
from sklearn. neighbors import KNeighborsClassifier
from sklearn. preprocessing import StandardScaler
from matplotlib .pyplot import subplots
import statsmodels .api as sm
from ISLP import load_data
from ISLP.models import ( ModelSpec as MS ,
summarize )
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LogisticRegression

In [None]:
# Load the raw sea-level records from the CSV file in the working directory.
df_sea_level = pd.read_csv(filepath_or_buffer='dataset2.csv')

In [None]:
# Preview the first five rows to get a feel for the columns and values.
df_sea_level.head(n=5)

In [None]:
# Check the size of the dataset: .shape is a (rows, columns) tuple.
df_sea_level.shape

In [None]:
# Inspect the distinct values of the descriptive columns.
# A notebook cell only renders its last expression, so each frequency table
# is printed explicitly; otherwise the 'Indicator' and 'Source' counts would
# be computed and silently discarded.
for column_name in ('Indicator', 'Source', 'CTS Name'):
    print(df_sea_level[column_name].value_counts(), end='\n\n')

In [None]:
# Keep only the columns that matter most for this study.
columns_of_interest = ['Indicator', 'Source', 'Measure', 'Date', 'Value']
sea_levels = df_sea_level.loc[:, columns_of_interest]

In [None]:
# Order the records by 'Date' and count the missing values per column.
# NOTE(review): 'Date' is still the raw text column at this point (it is only
# parsed to datetime in a later cell), so this ordering is lexicographic,
# not chronological.
df_sea_level = df_sea_level.sort_values(by=['Date'])
# A per-column count shows directly where the NaNs are; the boolean frame
# returned by isnull() alone is as large as the dataset and unreadable.
df_sea_level.isnull().sum()

In [None]:
# The NaN values are not in a column that matters for the statistical or
# machine-learning models, so those columns are deleted.
# axis='columns' targets columns; how='all' drops a column only when every
# one of its records is NaN.
# The result is assigned back so the change is kept in the same DataFrame.
df_sea_level = df_sea_level.dropna(axis='columns', how='all')


In [None]:
# Check how many NaN values remain in each column after the clean-up.
# dropna(how='all') only removes columns that are entirely empty, so a
# per-column count is needed to see whether any partial gaps are left;
# the raw boolean frame from isnull() cannot be read at a glance.
df_sea_level.isnull().sum()

In [None]:
# 'Measure' has too many categories, so the axes are inverted: the values run
# along the x-axis and the categories along the y-axis.
df_sea_level.plot(kind="scatter", x="Value", y="Measure", alpha=0.5)

In [None]:
# The Baltic Sea shows a big difference in its values, so list how often each
# distinct 'Value' occurs as a first check for outliers.
outliers = df_sea_level.loc[:, 'Value'].value_counts()
print(outliers)

In [None]:
# Box plot of 'Value' to spot extreme observations visually.
sns.boxplot(data=df_sea_level, y='Value')

In [None]:
# Applying a statistical test (three-sigma rule): the previous box plot shows
# that values above roughly 200 are outliers.
value_mean = df_sea_level['Value'].mean()
value_std = df_sea_level['Value'].std()

upper_limit = value_mean + 3 * value_std  # three standard deviations to the right of the mean
lower_limit = value_mean - 3 * value_std  # three standard deviations to the left of the mean
print(upper_limit, lower_limit, sep='\n')

In [None]:
# List the outliers in the 'Value' column with the help of .loc.
# The three-sigma rule is two-sided, so observations beyond either limit are
# selected; testing only against upper_limit would miss unusually low values
# and leave the computed lower_limit unused.
is_outlier = (df_sea_level['Value'] >= upper_limit) | (df_sea_level['Value'] <= lower_limit)
df_sea_level.loc[is_outlier, 'Value']

In [None]:
# Interquartile range of 'Value' (third quartile minus first quartile), the
# basis for an IQR-based outlier check.
first_quartile, third_quartile = df_sea_level['Value'].quantile([0.25, 0.75])
IQR = third_quartile - first_quartile
IQR

In [None]:
# Distribution of the sea-level values with a kernel density estimate.
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot
# with kde=True, stat="density" and kde_kws=dict(cut=3) is the documented
# replacement that draws the same histogram-plus-density figure.
sns.histplot(df_sea_level['Value'], kde=True, stat="density", kde_kws=dict(cut=3), color="maroon")
plt.xlabel("Value", labelpad=14)
plt.ylabel("probability of occurence", labelpad=14)
plt.title("Distribution of Changing in the Sea Level in Millimeters", y=1.015, fontsize=20);

In [None]:
# distplot is deprecated, so the distribution is drawn with histplot instead.
sns.histplot(data=df_sea_level, x='Value', kde=True)

In [None]:
# Mean of the sea-level values.
sea_level_mean = df_sea_level['Value'].agg('mean')
sea_level_mean

In [None]:
# Sample standard deviation of the sea-level values (ddof=1 is the pandas default).
sea_level_std = df_sea_level['Value'].std(ddof=1)
sea_level_std

In [None]:
# Add a 'Z-Score' column: each value's distance from the mean, measured in
# standard deviations.
df_sea_level['Z-Score'] = df_sea_level['Value'].sub(sea_level_mean).div(sea_level_std)

In [None]:
# Preview the first five rows, now including the 'Z-Score' column.
df_sea_level.head(n=5)

In [None]:
# Histogram of the z-scores. Standardising only recentres and rescales the
# values; it does not make them normally distributed, so the title names the
# plotted quantity instead of claiming a standard normal distribution.
df_sea_level['Z-Score'].hist(color='blue')
plt.title("Distribution of Sea Level Z-Scores", y=1.015, fontsize=22)
plt.xlabel("z-score", labelpad=14)
plt.ylabel("frequency", labelpad=14);

In [None]:
# Standard deviation of the z-scores, rounded to two decimals.
z_score_distribution_std_dev = np.round(df_sea_level['Z-Score'].std(), decimals=2)
z_score_distribution_std_dev

In [None]:
# Line plot of every 'Value' against the DataFrame index.
df_sea_level['Value'].plot(kind='line')

In [None]:
# Number of records available for each 'Measure' category.
df_sea_level.loc[:, 'Measure'].value_counts()

In [None]:
# 'Date' is not in datetime format yet: it is text made of a one-character
# prefix (the 'D') followed by a month/day/year date. Strip the prefix and
# convert the column to datetime so time-based analysis is possible.
# The dtype guard keeps this cell re-runnable: once the column is datetime,
# the .str accessor would raise AttributeError on a second execution.
if not pd.api.types.is_datetime64_any_dtype(df_sea_level['Date']):
    df_sea_level['Date'] = pd.to_datetime(df_sea_level['Date'].str[1:], format='%m/%d/%Y')

In [None]:
# Keep only the records whose 'Measure' is the Baltic Sea.
is_baltic = df_sea_level['Measure'].eq('Baltic Sea')
baltic_sea = df_sea_level.loc[is_baltic]
baltic_sea

In [None]:
# Put the Baltic Sea records in ascending date order.
baltic_sea = baltic_sea.sort_values('Date', ascending=True)
baltic_sea

In [None]:
# Plot the Baltic Sea values over time; the Baltic Sea had the majority of
# the outliers.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(baltic_sea['Date'], baltic_sea['Value'], label='Value', color='blue')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Sea Level Measure Over Time for the Baltic Sea')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Look at the North Sea next: according to the referenced article, the
# majority of the changes in the Baltic Sea are influenced by the North Sea.
is_north = df_sea_level['Measure'].eq('North Sea')
north_sea = df_sea_level.loc[is_north]
north_sea

In [None]:
# Put the North Sea records in ascending date order.
north_sea = north_sea.sort_values('Date', ascending=True)

In [None]:
# Plot the North Sea values over time.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(north_sea['Date'], north_sea['Value'], label='Value', color='blue')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Sea Level Measure Over Time for the North Sea')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Draw both seas on the same axes for easier visual comparison:
# Baltic Sea in blue, North Sea in red.
fig, ax = plt.subplots(figsize=(12, 6))
sea_series = (
    (baltic_sea, 'Baltics Sea Values', 'blue'),
    (north_sea, 'North Sea Values', 'red'),
)
for sea_frame, series_label, line_color in sea_series:
    ax.plot(sea_frame['Date'], sea_frame['Value'], label=series_label, color=line_color)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Sea Level Measure Over Time for the North Sea and Baltic Sea')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Check the correlation between the Baltic Sea and North Sea value changes.

# Change in 'Value' between consecutive records of each sea
baltic_sea = baltic_sea.assign(Value_Change=baltic_sea['Value'].diff())
north_sea = north_sea.assign(Value_Change=north_sea['Value'].diff())

# Align the two series on the dates they have in common
change_columns = ['Date', 'Value_Change']
merged_data = baltic_sea[change_columns].merge(
    north_sea[change_columns],
    on='Date',
    suffixes=('_Baltic', '_North'),
)

# Correlation between the aligned changes (Series.corr defaults to Pearson)
correlation = merged_data['Value_Change_Baltic'].corr(merged_data['Value_Change_North'])

print(f"Correlation between changes in the Baltic Sea and North Sea measures: {correlation}")


In [None]:
# Scatter plot of the Baltic Sea changes against the North Sea changes, with
# dashed reference lines through zero on both axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=merged_data, x='Value_Change_Baltic', y='Value_Change_North', color='blue', ax=ax)
ax.set_title(f'Scatter Plot of Sea Level Changes\nBaltic Sea vs North Sea\nCorrelation: {correlation:.2f}')
ax.set_xlabel('Baltic Sea Measure Change')
ax.set_ylabel('North Sea Measure Change')
ax.axhline(0, color='red', linestyle='--', lw=1)  # horizontal reference at y=0
ax.axvline(0, color='red', linestyle='--', lw=1)  # vertical reference at x=0
ax.grid()
plt.show()

In [None]:
# Value changes of both seas over time, with a dashed zero line for reference.
fig, ax = plt.subplots(figsize=(12, 6))
change_series = (
    (baltic_sea, 'Baltic Sea Change', 'blue'),
    (north_sea, 'North Sea Change', 'orange'),
)
for sea_frame, series_label, line_color in change_series:
    ax.plot(sea_frame['Date'], sea_frame['Value_Change'], label=series_label, color=line_color)
ax.set_title('Sea Level Changes Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Measure Change')
ax.axhline(0, color='gray', linestyle='--', lw=1)
ax.legend()
ax.grid()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Heatmap of the correlation between the Baltic Sea and North Sea changes.

# Give the two change columns readable labels for the plot
change_labels = {
    'Value_Change_Baltic': 'Baltic Sea Change',
    'Value_Change_North': 'North Sea Change',
}
correlation_df = merged_data[list(change_labels)].rename(columns=change_labels)

# Pairwise correlation matrix of the two series
correlation_matrix = correlation_df.corr()

# Draw the annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, ax=ax)
ax.set_title('Correlation Heatmap: Baltic Sea vs North Sea')
plt.show()


In [None]:
# Reshape to wide format: one row per date, one column per 'Measure'
# (pivot_table averages duplicate date/measure pairs by default).
pivot_df = df_sea_level.pivot_table(values='Value', index='Date', columns='Measure')

# Pairwise correlation between the columns of the wide table
correlation_matrix = pivot_df.corr()
sea_names = correlation_matrix.columns

# Annotated heatmap with every column name shown on both axes
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
    xticklabels=sea_names,
    yticklabels=sea_names,
    ax=ax,
)
ax.set_title('Multivariate Correlation Heatmap of Sea Values')
plt.show()


In [None]:
# Build the wide-format table (dates as rows, one column per 'Measure') and
# preview its first five rows.
pivot_df = pd.pivot_table(df_sea_level, values='Value', index='Date', columns='Measure')
pivot_df.head(n=5)

In [None]:
# Keep only the dates for which every column has a value.
pivot_df = pivot_df.dropna(axis='index', how='any')

In [None]:
# Preview the first five complete rows of the wide table.
pivot_df.head(n=5)

In [None]:
# Multiple linear regression: explain the Baltic Sea column with all the
# other columns of the wide table.

# Dependent variable
y = pivot_df['Baltic Sea']

# Independent variables: every remaining column, plus a constant column so
# the model estimates an intercept
X = sm.add_constant(pivot_df.drop(columns=['Baltic Sea']))

# Fit ordinary least squares and print the full summary
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

In [None]:
# Simple linear regression of the Baltic Sea values on the North Sea values.

# Filter for Baltic Sea and other relevant seas
seas_of_interest = ['Baltic Sea', 'North Sea']  # Add any other seas you want to analyze
filtered_df = df_sea_level[df_sea_level['Measure'].isin(seas_of_interest)]

# Report the NaN count per column before cleaning
print("Checking for NaNs in the filtered DataFrame:")
print(filtered_df.isna().sum())

# Drop rows with NaN values in the 'Value' column
filtered_df = filtered_df.dropna(subset=['Value'])

# Report the NaN count per column after cleaning
print("Remaining NaNs after dropping:")
print(filtered_df.isna().sum())

# Split the filtered records into one (Date, Value) table per sea
baltic_values = filtered_df[filtered_df['Measure'] == 'Baltic Sea'][['Date', 'Value']]
north_values = filtered_df[filtered_df['Measure'] == 'North Sea'][['Date', 'Value']]

# Inner-join on 'Date' so only dates present for both seas are kept
merged_df = pd.merge(baltic_values, north_values, on='Date', suffixes=('_Baltic', '_North'))

# Drop any rows with NaN values after merging
merged_df = merged_df.dropna()

# The model estimates two parameters (intercept and slope), so at least three
# observations are required to leave residual degrees of freedom; with only
# two rows the fit is exact and the standard errors are undefined.
if merged_df.shape[0] < 3:
    print("Not enough data for regression analysis.")
else:
    # Dependent variable (Baltic Sea values)
    y = merged_df['Value_Baltic']

    # Independent variable (North Sea values). It is kept as a labelled
    # DataFrame rather than a NumPy array so the summary reports the
    # coefficient as 'Value_North' instead of the anonymous 'x1'.
    X = sm.add_constant(merged_df[['Value_North']])  # Add constant for the intercept

    # Fit the OLS model
    model = sm.OLS(y, X).fit()

    # Display the model summary
    print(model.summary())

In [None]:
# Seas and oceans to keep for the sampled subset
seas_of_interest = [
    'Southern Ocean', 'Indian Ocean', 'Nino', 'Atlantic Ocean',
    'Tropics', 'North Pacific', 'Bering Sea', 'Pacific Ocean',
    'Indonesian', 'Arabian Sea', 'South China', 'North Atlantic',
    'Caribbean Sea', 'Baltic Sea', 'Gulf Mexico', 'Mediterranean',
    'North Sea', 'Sea Okhotsk', 'Sea Japan', 'Bay Bengal',
    'Yellow Sea', 'Andaman Sea', 'Adriatic Sea', 'Persian Gulf',
]

# Keep only the records whose 'Measure' is one of the listed names
is_selected = df_sea_level['Measure'].isin(seas_of_interest)
filtered_df = df_sea_level.loc[is_selected]

# Draw a 10% random sample (adjust frac as needed); the fixed random_state
# makes the sample reproducible
sampled_df = filtered_df.sample(frac=0.1, random_state=1)

# Report the size of the sample
print("Sampled DataFrame shape:", sampled_df.shape)
