In [31]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Read the cleaned sensor readings into a DataFrame
data = pd.read_csv('cleaned_sensor_data.csv')

# Feature matrix: Temperature reshaped to a single column, shape (n_samples, 1),
# the 2-D layout the estimator's fit/predict calls are given below.
X = data['Temperature'].to_numpy().reshape(-1, 1)
# Target vector: Humidity
y = data['Humidity'].to_numpy()

# Fit a linear regression of Humidity on Temperature (fit() returns the estimator)
model = LinearRegression().fit(X, y)

# 100 evenly spaced temperatures from the smallest to the largest observed
# value, arranged as a column vector so they can be passed to predict()
min_temp, max_temp = X.min(), X.max()
test_temps = np.linspace(min_temp, max_temp, num=100)[:, np.newaxis]

# Humidity predicted by the fitted line at each of those temperatures
predicted_humidity = model.predict(test_temps)

# Scatter of every observation, with the fitted line drawn on top
fig = px.scatter(data, x='Temperature', y='Humidity', opacity=0.65)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=predicted_humidity,
        mode='lines',
        name='Trend Line',
    )
)

# Render the figure
fig.show()


In [32]:
# Temperature band: 5th to 95th percentile of the full dataset
min_temp_threshold = data['Temperature'].quantile(0.05)
max_temp_threshold = data['Temperature'].quantile(0.95)

# Keep rows strictly inside the band (rows equal to either bound are dropped)
filtered_data = data[(data['Temperature'] > min_temp_threshold) & (data['Temperature'] < max_temp_threshold)]

# Feature column of shape (n, 1) and target vector taken from the filtered rows
X_filtered = filtered_data['Temperature'].values.reshape(-1, 1)
y_filtered = filtered_data['Humidity'].values

# Fit a separate regression using only the filtered rows
model_filtered = LinearRegression()
model_filtered.fit(X_filtered, y_filtered)

# Predict on test_temps from the earlier cell. That grid was built from the
# min/max of the unfiltered data, so the line can extend past the filtered points.
predicted_humidity_filtered = model_filtered.predict(test_temps)

# Plot the filtered observations together with their trend line
fig_filtered = px.scatter(filtered_data, x='Temperature', y='Humidity', opacity=0.65)
fig_filtered.add_trace(go.Scatter(x=test_temps.flatten(), y=predicted_humidity_filtered, name='Filtered Trend Line', mode='lines'))
fig_filtered.show()


In [33]:
# Step 8: Further Filtering of Outliers
# Re-run the percentile filter with different cut-offs: drop roughly the lowest
# and highest 2.5% of temperatures and keep the band in between.
# NOTE(review): a 2.5%-97.5% band keeps more rows than a 5%-95% band would, so
# this is a looser filter, not a more aggressive one - confirm the intended cut-offs.

# Band limits taken from the full dataset
min_temp_threshold = data['Temperature'].quantile(0.025)
max_temp_threshold = data['Temperature'].quantile(0.975)

# Rows strictly inside the band (values equal to a limit are excluded)
filtered_data_more = data.loc[
    (data['Temperature'] > min_temp_threshold)
    & (data['Temperature'] < max_temp_threshold)
]

# Training inputs: Temperature as an (n, 1) column, Humidity as the target
X_filtered_more = filtered_data_more['Temperature'].to_numpy().reshape(-1, 1)
y_filtered_more = filtered_data_more['Humidity'].to_numpy()

# Fit a fresh regression on this subset (fit() returns the estimator)
model_filtered_more = LinearRegression().fit(X_filtered_more, y_filtered_more)

# Evaluate the new line on the shared test_temps grid from the first cell
predicted_humidity_filtered_more = model_filtered_more.predict(test_temps)

# Scatter of the retained rows with the new trend line overlaid
fig_filtered_more = px.scatter(filtered_data_more, x='Temperature', y='Humidity', opacity=0.65)
fig_filtered_more.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=predicted_humidity_filtered_more,
        mode='lines',
        name='Filtered Trend Line',
    )
)
fig_filtered_more.show()
