In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [15]:
data = pd.read_csv("/content/drive/MyDrive/UALR/GA/GA/Repository/paraview/input_data/input_beach_full_data.csv")

In [16]:
data.head()

Unnamed: 0,Beach Name,Measurement Timestamp,Water Temperature,Turbidity,Wave Height,Wave Period,Battery Life,UMAP-0,UMAP-1
0,0,1379498400,0,0,0,0,0,3.562704,12.782455
1,0,1402574400,2,0,-2,3,2,-1.486517,10.547984
2,0,1402578000,0,0,2,4,2,-1.706453,10.520443
3,0,1402581600,-1,0,0,0,2,-1.777682,10.513829
4,0,1402585200,-1,0,0,0,1,-10.421374,4.833833


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40056 entries, 0 to 40055
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Beach Name             40056 non-null  int64  
 1   Measurement Timestamp  40056 non-null  int64  
 2   Water Temperature      40056 non-null  int64  
 3   Turbidity              40056 non-null  int64  
 4   Wave Height            40056 non-null  int64  
 5   Wave Period            40056 non-null  int64  
 6   Battery Life           40056 non-null  int64  
 7   UMAP-0                 40056 non-null  float64
 8   UMAP-1                 40056 non-null  float64
dtypes: float64(2), int64(7)
memory usage: 2.8 MB


In [18]:
data.describe()

Unnamed: 0,Beach Name,Measurement Timestamp,Water Temperature,Turbidity,Wave Height,Wave Period,Battery Life,UMAP-0,UMAP-1
count,40056.0,40056.0,40056.0,40056.0,40056.0,40056.0,40056.0,40056.0,40056.0
mean,2.430772,1457432000.0,-0.021295,0.084507,-0.026213,-0.167291,0.027761,2.889112,3.221502
std,1.325607,52313440.0,1.039224,0.993663,1.0432,1.092478,1.019656,7.61304,7.5935
min,0.0,1377850000.0,-6.0,0.0,-2.0,-2.0,-15.0,-14.580515,-14.027887
25%,1.0,1409162000.0,-1.0,0.0,-1.0,-1.0,-1.0,-2.725035,-2.533139
50%,3.0,1439683000.0,0.0,0.0,0.0,-1.0,0.0,2.723743,3.074875
75%,3.0,1472256000.0,1.0,0.0,0.0,0.0,1.0,8.636177,9.001334
max,5.0,1630080000.0,3.0,55.0,15.0,4.0,3.0,20.268482,20.712067


In [19]:
# Count missing values in each column
print(data.isna().sum())

Beach Name               0
Measurement Timestamp    0
Water Temperature        0
Turbidity                0
Wave Height              0
Wave Period              0
Battery Life             0
UMAP-0                   0
UMAP-1                   0
dtype: int64


Checked that the Transducer Depth column has many null values, hence dropped it.

In [20]:
data.drop(columns='Transducer Depth',inplace=True)

KeyError: ignored

In [None]:
data.duplicated()

In [None]:
data['Measurement Timestamp'] = pd.to_datetime(data['Measurement Timestamp'])

In [None]:
type(data['Measurement Timestamp'])

In [None]:
data.head()

In [None]:
data.set_index('Measurement Timestamp')

In [None]:
# Checking whether Measurement Timestamp column is datetime / timeseries column 
pd.api.types.is_datetime64_any_dtype(data['Measurement Timestamp'])

In [None]:
data.columns

Beach Name, Measurement Timestamp Label, and Measurement ID columns looks doubtful to me. rest all of the columns are useful

Useful columns:
'Measurement Timestamp', 'Water Temperature', 'Turbidity','Transducer Depth', 'Wave Height', 'Wave Period', 'Battery Life'

In [None]:
# Ouliers Detection

# Create box plots for each column in the DataFrame
plt.figure(figsize=(10, 6))  # Adjust the figure size if needed

data[['Battery Life','Water Temperature','Wave Height','Wave Period']].boxplot()

# Set the title and labels
plt.title('Box Plot of Column Values')
plt.xlabel('Columns')
plt.ylabel('Values')

# Show the plot
plt.show()

Few Values were in the range of -999 for Wave Height column. They does not look appropriate, so removing them.

In [None]:
data = data.drop(data.loc[data['Wave Height']<-999,:].index)

In [None]:
data

Dropping the columns **Beach Name, Measurement Timestamp Label,Measurement ID** because they are not numericals and are not helpful in dimensionality reduction.

In [None]:
data1 = data.drop(columns=['Beach Name', 'Measurement Timestamp Label','Measurement ID'])

In [None]:
data1.head()

In [None]:
data.head()

In [None]:
data[data['Beach Name'] == 'Ohio Street Beach']

In [None]:
data[data['Beach Name'] == 'Calumet Beach']

### Sorting

In [None]:
data_sorted = data.sort_values(by=['Beach Name','Measurement Timestamp'])
# data_sorted = data.sort_values(by=['Measurement Timestamp','Beach Name'])

In [None]:
data_sorted.head()

In [None]:
data_sorted['Beach Name'].unique()

In [None]:
data_sorted['Beach Name'].value_counts().plot(kind='bar')
plt.xlabel('Beach Name')
plt.ylabel('Count')
plt.title('Data Distribution w.r.t. Beaches')
plt.show()

In [None]:
data_sorted_ohio_b = data_sorted[data_sorted['Beach Name']=='Ohio Street Beach']

In [None]:
data_sorted_ohio_b

In [None]:
data_sorted.to_csv("/content/drive/MyDrive/UALR/GA/GA/Data1/Cleaned/Cleaned and Sorted/data_sorted.csv")

In [None]:
data_sorted_ohio_b.to_csv("/content/drive/MyDrive/UALR/GA/GA/Data1/Cleaned/Cleaned and Sorted/data_sorted_ohio_b.csv")

In [None]:
pip install umap-learn

### Encoding:

Doing the encoding using the get_dummies() method in pandas because UMAP does not allow categorical data.

In [None]:
encoded_data = pd.get_dummies(data_sorted, columns=['Beach Name'])
encoded_data

In [None]:
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [None]:
num_cols = ['Water Temperature','Turbidity','Wave Height','Wave Period','Battery Life']

In [None]:
encoded_data.columns

Dropping the columns **Measurement Timestamp Label, Measurement ID**


In [None]:
encoded_data_pre = encoded_data.drop(columns=['Measurement Timestamp Label','Measurement ID'])  

In [None]:
encoded_data_pre

###Standardizing the dataset

In [None]:
scaler = StandardScaler()
encoded_data_pre[num_cols] = scaler.fit_transform(encoded_data_pre[num_cols])

##Converting the measurement timestamp to a numeric value:

In [None]:
# timeseries_data = encoded_data_pre['Measurement Timestamp']

# reference_time = timeseries_data.min()

# encoded_data_pre['Measurement Timestamp'] = (timeseries_data - reference_time).dt.total_seconds() / 60

In [None]:
# Convert datetime to numeric representation
# encoded_data_pre['Measurement Timestamp'] = pd.to_datetime(encoded_data_pre['Measurement Timestamp'])
encoded_data_pre['Measurement Timestamp'] = encoded_data_pre['Measurement Timestamp'].apply(lambda x: x.timestamp())

In [None]:
encoded_data_pre.isna().sum()

There are few null values present in Wave Height and Wave Period Column, we need to fill them or else eliminate them. I am filling them using KNNImputer

In [None]:
# Using KNNImputer to impute the missing values with the nearest 3 neighbors and creating a new dataframe to store 
# updated data

imputer=KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)

new_array=imputer.fit_transform(encoded_data_pre) # impute the missing values
    # convert the nd-array returned in the step above to a Dataframe

encoded_data_pre=pd.DataFrame(data=np.round(new_array), columns=encoded_data_pre.columns)

In [None]:
encoded_data_pre.head()

random_state --> This parameter is used to set the random seed for reproducibility

n_neighbors --> This parameter determines the number of nearest neighbors used to construct the local neighborhood for each data point. In UMAP, the algorithm builds a graph representation of the data, and n_neighbors specifies the number of nearest neighbors to consider when constructing the graph. Increasing this value may result in a more global view of the data, while decreasing it may provide a more local perspective.


n_components --> This parameter sets the number of dimensions in the lower-dimensional space to which the data will be projected

min_dist: This parameter controls the minimum distance between points in the lower-dimensional embedding. A higher value of min_dist enforces greater spacing between points, potentially resulting in a clearer visualization

In [None]:
# Apply UMAP to the entire DataFrame
SEED = 43
reducer = umap.UMAP(random_state=SEED, 
                    n_neighbors = 12, 
                    n_components = 2,
                    min_dist = 0.5)

# reducer = umap.UMAP(random_state=43)
umap_result = reducer.fit_transform(encoded_data_pre)

In [None]:
umap_result

## Finding the patterens with respect to the Measurement Timestamp:

In [None]:
fig, ax = plt.subplots(figsize=(9, 7))

scatter = plt.scatter(umap_result[:, 0], umap_result[:, 1], c=encoded_data_pre['Measurement Timestamp'], s=4, cmap='Spectral');

# Add a legend
plt.legend(*scatter.legend_elements(), title='Measurement Timestamp',bbox_to_anchor=(1.05, 1), loc='upper left')

# plt.show()

## Finding the patterens with respect to the beaches:

In [None]:
fig, ax = plt.subplots(figsize=(9, 7))

beach_names = ['Beach Name_63rd Street Beach', 'Beach Name_Calumet Beach', 'Beach Name_Montrose Beach',
               'Beach Name_Ohio Street Beach', 'Beach Name_Osterman Beach', 'Beach Name_Rainbow Beach']
colors = ['red', 'blue', 'green', 'orange', 'purple', 'cyan']

# Create a scatter plot of the UMAP embedding
for i, beach in enumerate(beach_names):
    indices = encoded_data_pre[beach].astype(bool)
    plt.scatter(umap_result[indices, 0], umap_result[indices, 1], s=4,color=colors[i], label=beach)

# Add a legend
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Show the plot
plt.show()

In [None]:
encoded_data_pre

In [None]:
encoded_data = pd.get_dummies(data_sorted, columns=['Beach Name'])
encoded_data

#Using Label Encoder to convert categorical values into numeric

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
data_sorted

In [None]:
data_sorted_label_enc = data_sorted
data_sorted_label_enc.head()

In [None]:
lblEn = LabelEncoder()

data_sorted_label_enc['Beach Name'] =lblEn.fit_transform(data_sorted_label_enc['Beach Name'])

In [None]:
lblEn.inverse_transform(np.array([0, 1, 2, 3, 4, 5]))
# data_sorted_label_enc['Beach Name'].unique()

In [None]:
num_cols = ['Water Temperature','Turbidity','Wave Height','Wave Period','Battery Life']

In [None]:
data_sorted_label_enc = data_sorted_label_enc.drop(columns=['Measurement Timestamp Label','Measurement ID']) 

In [None]:
scaler = StandardScaler()
data_sorted_label_enc[num_cols] = scaler.fit_transform(data_sorted_label_enc[num_cols])

In [None]:
timeseries_data = data_sorted_label_enc['Measurement Timestamp']

In [None]:
# reference_time = timeseries_data.min()
# data_sorted_label_enc['Measurement Timestamp'] = (timeseries_data - reference_time).dt.total_seconds() / 60
# data_sorted_label_enc.isna().sum()

In [None]:
# Converting the timestamp to numerical values

data_sorted_label_enc['Measurement Timestamp'] = data_sorted_label_enc['Measurement Timestamp'].apply(lambda x: x.timestamp())

In [None]:
data_sorted_label_enc['Measurement Timestamp']

In [None]:
# Using KNNImputer to impute the missing values with the nearest 3 neighbors and creating a new dataframe to store 
# updated data

imputer=KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)

new_array=imputer.fit_transform(data_sorted_label_enc) # impute the missing values
    # convert the nd-array returned in the step above to a Dataframe

data_sorted_label_enc=pd.DataFrame(data=np.round(new_array), columns=data_sorted_label_enc.columns)

In [None]:
# Apply UMAP to the entire DataFrame
SEED = 43
# reducer = umap.UMAP(random_state=SEED, 
#                     n_neighbors = 5, 
#                     n_components = 2,
#                     min_dist = 0.7)
reducer = umap.UMAP(random_state=SEED)

umap_result = reducer.fit_transform(data_sorted_label_enc)

umap_result

In [None]:
fig, ax = plt.subplots(figsize=(9, 7))

scatter = plt.scatter(umap_result[:, 0], umap_result[:, 1], c=data_sorted_label_enc['Measurement Timestamp'], s=4, cmap='Spectral');

# Add a legend
plt.legend(*scatter.legend_elements(), title='Measurement Timestamp',bbox_to_anchor=(1.05, 1), loc='upper left')

# plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(9, 7))


scatter = plt.scatter(umap_result[:, 0], umap_result[:, 1], c=data_sorted_label_enc['Beach Name'] , s=5, cmap='Spectral')

# Create a legend using the unique beach names
legend_labels = ['63rd Street Beach', 'Calumet Beach', 'Montrose Beach', 'Ohio Street Beach', 'Osterman Beach', 'Rainbow Beach']
plt.legend(handles=scatter.legend_elements()[0], labels=legend_labels,bbox_to_anchor=(1.05, 1), loc='upper left')

# Show the plot
plt.show()

### Exporting data for the paraview.

In [None]:
data_sorted_label_enc

In [None]:
# Creating the dataframe from the array umap_result
umap_result_df = pd.DataFrame(umap_result,columns=['UMAP-0','UMAP-1'])

In [None]:
# Saving the two pandas dataframes data_sorted_label_enc and umap_result_df into data_sorted_label_enc_out using concat function of pandas

data_sorted_label_enc_out = pd.concat([data_sorted_label_enc,umap_result_df], axis=1)

In [None]:
# Exporting the csv file
data_sorted_label_enc_out.to_csv("/content/drive/MyDrive/UALR/GA/GA/Repository/paraview_data/data_sorted_label_enc_out.csv")