In [5]:
%matplotlib inline
import pandas as pd

# Location of the raw sensor export, relative to the notebook's working directory.
DATA_PATH = 'IOT-temp.csv'

# Load the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

In [None]:
# Find missing values in data

In [6]:
# Count the null entries in each column and print the per-column totals.
missing_counts = df.isnull().sum()
print(missing_counts)

id            0
room_id/id    0
noted_date    0
temp          0
out/in        0
dtype: int64


In [8]:
# Fill missing temperature values with the column mean (the null check above reports 0 missing values, so this is a safeguard only)

In [7]:
# Mean imputation: any null in 'temp' is replaced by the column's mean.
temp_mean = df['temp'].mean()
df['temp'] = df['temp'].fillna(temp_mean)

In [9]:
# Z-score method: Remove rows where temperature is more than 3 standard deviations from the mean.

In [14]:
from scipy import stats
import numpy as np

# Standardise 'temp' and take magnitudes so both tails are treated alike.
temp_z = stats.zscore(df['temp'])
z_scores = np.abs(temp_z)

# Keep only the rows whose absolute z-score is at most 3.
within_three_sigma = z_scores <= 3
df_z = df[within_three_sigma]

In [11]:
# IQR method: Remove rows whose temperature is below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.

In [16]:
# First and third quartiles of 'temp', computed in a single quantile call.
Q1, Q3 = df['temp'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below Q1 and above Q3.
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Series.between is inclusive on both ends by default, matching >= lower and <= upper.
df_iqr = df[df['temp'].between(lower, upper)]

In [17]:
# Report the (rows, columns) shape left by each outlier-removal method.
for label, frame in (
    ("After Z-score outlier removal:", df_z),
    ("After IQR outlier removal:", df_iqr),
):
    print(label, frame.shape)

After Z-score outlier removal: (97606, 5)
After IQR outlier removal: (97606, 5)


In [18]:
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Scatter plot: temp vs noted_date


In [None]:
# Scatter of temperature against the reading timestamp, using the z-score-filtered frame.
# Points are coloured by the 'out/in' column. The frame has no 'out_of_range' column
# (its columns are id, room_id/id, noted_date, temp, out/in), so the previous
# hue='out_of_range' made seaborn raise a ValueError.
# NOTE(review): noted_date is not converted with pd.to_datetime anywhere in view, so the
# x axis is presumably treated as plain strings - confirm and parse if a time axis is wanted.
plt.figure(figsize=(10,5))
sns.scatterplot(x='noted_date', y='temp', hue='out/in', data=df_z)
plt.xticks(rotation=45)  # tilt the x tick labels so they are less likely to overlap
plt.title('Temperature vs Date')
plt.tight_layout()
plt.show()

In [None]:
# Histogram: distribution of temperature


In [None]:
# Histogram (15 bins, with a KDE overlay) of temperature after z-score outlier removal.
# The cell previously read from `df_clean`, which is never defined in this notebook and
# raised a NameError on a fresh kernel; `df_z` is the cleaned frame the scatter plot uses.
plt.figure(figsize=(6,4))
sns.histplot(df_z['temp'], bins=15, kde=True)
plt.title('Temperature Distribution')
plt.xlabel('Temperature')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()