In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import holoviews as hv

In [None]:
# Load the raw IoT temperature log from the Kaggle input directory and
# report its (rows, columns) shape before previewing the first records.
df = pd.read_csv(
    "/kaggle/input/temperature-readings-iot-devices/IOT-temp.csv"
)
print(
    f'IOT-temp.csv : {df.shape}'
)
df.head()

In [None]:
# Frequency of each distinct value in the room/registration id column.
df.loc[:, 'id_room/reg_id'].value_counts()

In [None]:
# Remove the room/registration id column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns='id_room/reg_id', errors='ignore')
df.head()

In [None]:
# Give the timestamp and location columns shorter names.
df = df.rename(
    columns={
        'date_def': 'date',
        'in_out': 'place',
    }
)
df.head()

In [None]:
# Parse the day-month-year "dd-mm-YYYY HH:MM" strings into datetimes.
df['date'] = pd.to_datetime(
    arg=df['date'],
    format='%d-%m-%Y %H:%M',
)

In [None]:
# Derive calendar features from the parsed timestamp in a single pass.
# Each lambda receives the frame being built, so all features read the
# same 'date' column.
df = df.assign(
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    day=lambda frame: frame['date'].dt.day,
    weekday=lambda frame: frame['date'].dt.day_name(),
    weekofyear=lambda frame: frame['date'].dt.isocalendar().week,
    hour=lambda frame: frame['date'].dt.hour,
    minute=lambda frame: frame['date'].dt.minute,
)

df.head()

In [None]:
def month2seasons(x):
    """Map a calendar month number to a season label.

    Returns 'Winter' for 12, 1, 2; 'Pre-Summer' for 3-5; 'Rainy Season'
    for 6-9; 'Post-Rainy Season' for 10-11; and None for any other value.
    """
    season_table = (
        ('Winter', (12, 1, 2)),
        ('Pre-Summer', (3, 4, 5)),
        ('Rainy Season', (6, 7, 8, 9)),
        ('Post-Rainy Season', (10, 11)),
    )
    for label, months in season_table:
        if x in months:
            return label
    return None


# Label every reading with the season its month falls in.
df['season'] = df['month'].map(month2seasons)

df.head()


In [None]:
def hours2timing(x):
    """Map an hour of the day to a coarse part-of-day label.

    Returns 'Night' for 22-23 and 0-3, 'Morning' for 4-11, 'Afternoon'
    for 12-16, 'Evening' for 17-21, and None for any other value.
    """
    day_parts = (
        ('Night', [22, 23, 0, 1, 2, 3]),
        ('Morning', range(4, 12)),
        ('Afternoon', range(12, 17)),
        ('Evening', range(17, 22)),
    )
    for label, hours in day_parts:
        if x in hours:
            return label
    return None


# Label every reading with the part of day its hour falls in.
df['timing'] = df['hour'].map(hours2timing)

df.head()


In [None]:
# Rows that repeat an earlier row in every column.
duplicate_rows = df.loc[df.duplicated()]

if duplicate_rows.empty:
    print("No duplicates found.")
else:
    print(f"Found {len(duplicate_rows)} duplicate rows:")
    print(duplicate_rows)


In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates(keep='first')

print("Data after removing duplicates:")
df.head()

In [None]:
# Recompute the duplicate rows after the clean-up above.
duplicate_rows = df.loc[df.duplicated(keep='first')]

In [None]:
# True when the 7th underscore-separated token of each raw id is distinct
# for every row, i.e. it can serve as a unique numeric id.
df['id'].map(lambda raw_id: raw_id.split('_')[6]).nunique() == df.shape[0]

In [None]:
# Replace the raw id string with its 7th underscore-separated token as int.
# The dtype guard makes the cell re-runnable: once 'id' is already integer,
# a second execution would otherwise fail calling .split on an int.
if not pd.api.types.is_integer_dtype(df['id']):
    df['id'] = df['id'].map(lambda raw_id: int(raw_id.split('_')[6]))
df.head()

In [None]:
# Show the readings whose id lies in 4000..4010, ordered by id.
df[df['id'].isin(range(4000, 4011))].sort_values('id')

In [None]:

# Histogram of every temperature reading with a KDE overlay.
# `plt` and `sns` come from the notebook's top import cell; drawing on an
# explicit Axes keeps the labels attached to this figure only.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['temp'], kde=True, ax=hist_ax)
hist_ax.set_title('Temperature Distribution')
hist_ax.set_xlabel('Temperature')
hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Activate the Bokeh backend before the first HoloViews chart is built;
# previously the backend was only loaded in a later cell, so this cell
# failed on a fresh kernel.
hv.extension('bokeh')

# Share of readings per place, as whole-number percentages.
place_cnt = np.round(
    df['place'].value_counts(normalize=True) * 100
)


hv.Bars(place_cnt).opts(
    title="Readings Place Count",
    color="purple",
    xlabel="Place",
    ylabel="Percentage",
    yformatter='%d%%',
    width=700,
    height=300,
    tools=['hover'],
    show_grid=True
)

In [None]:
# Share of readings per season, as whole-number percentages.
season_cnt = df['season'].value_counts(normalize=True).mul(100).round()

hv.Bars(season_cnt).opts(
    title="Season Count",
    xlabel="Season",
    ylabel="Percentage",
    yformatter='%d%%',
    color="purple",
    tools=['hover'],
    show_grid=True,
    width=700,
    height=300,
)

In [None]:
import numpy as np
import pandas as pd
import holoviews as hv
hv.extension('bokeh')


# One point per reading: timestamp on x, temperature on y.
scatter_temp_time = hv.Scatter(df, kdims='date', vdims='temp').opts(
    title="Temperature Over Time",
    xlabel="Date",
    ylabel="Temperature",
    color="purple",
    width=700,
    height=400,
    tools=['hover'],
    show_grid=True,
)

scatter_temp_time

In [None]:
# Mean temperature for every (month, place) pair, drawn as a heat map.
heatmap_month_place = hv.HeatMap(
    df.groupby(['month', 'place'], as_index=False)['temp'].mean()
).opts(
    title="Average Temperature by Month and Place",
    xlabel="Month",
    ylabel="Place",
    cmap='Viridis',
    width=700,
    height=400,
    tools=['hover'],
    show_grid=True,
)

heatmap_month_place

In [None]:
import holoviews as hv
hv.extension('bokeh')


# Temperature density for each place, overlaid on a single chart.
dist_in = hv.Distribution(
    df.loc[df['place'] == 'In', 'temp'], label='In'
).opts(color='purple')
dist_out = hv.Distribution(
    df.loc[df['place'] == 'Out', 'temp'], label='Out'
).opts(color='pink')

(dist_in * dist_out).opts(
    title="Temperature by Place Distribution",
    xlabel="Temperature",
    ylabel="Density",
    tools=['hover'],
    show_grid=True,
    width=700,
    height=300,
)


In [None]:

# Minimum and maximum temperature observed in each season.
season_agg = df.groupby('season').agg({'temp': ['min', 'max']})

# Long format: one row per (season, max|min) pair, max rows first.
season_maxmin = (
    season_agg['temp'][['max', 'min']]
    .reset_index()
    .melt(id_vars=['season'])
    .rename(columns={'season': 'Season', 'variable': 'Max/Min'})
)

hv.Bars(season_maxmin, kdims=['Season', 'Max/Min'], vdims='value').opts(
    title="Temperature by Season Max/Min",
    ylabel="Temperature",
    tools=['hover'],
    show_grid=True,
    width=700,
    height=300,
)

In [None]:
import pandas as pd
import plotly.express as px


# px.line joins points in row order, and `df` has not been sorted by
# timestamp up to this cell, so sort first; otherwise the trace can jump
# back and forth in time.
fig = px.line(
    df.sort_values('date'),
    x='date',
    y='temp',
    color='place',  # one trace per place
    labels={'temp': 'Temperature', 'date': 'Date'},
    title='Temperature Trend Over Time',
    hover_name='place',
    hover_data={'date': '|%Y-%m-%d'},  # show the date only in the hover box
)


fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Temperature',
    hovermode='x unified',
    width=700,
    height=400,
)

fig.show()


In [None]:
# One row per (timestamp, place) carrying the mean temperature of all
# readings in that group.
# transform('mean') returns the group mean aligned to each original row, so
# the value always stays with its own (date, place) pair. The previous
# version assigned groupby(...).mean().values positionally onto a frame
# sorted by 'date' only; the grouped result is ordered by (date, place), so
# In/Out means sharing a timestamp could be swapped.
tsdf = (
    df.assign(temp=df.groupby(['date', 'place'])['temp'].transform('mean'))
    .drop_duplicates(subset=['date', 'place'])
    .sort_values(['date', 'place'])
    .reset_index(drop=True)
    .drop('id', axis=1)
)
tsdf.head(3)

In [None]:
# Mean temperature per calendar month, separately for 'In' and 'Out'.
in_month = tsdf.loc[tsdf['place'] == 'In'].groupby('month')[['temp']].mean()
out_month = tsdf.loc[tsdf['place'] == 'Out'].groupby('month')[['temp']].mean()

# Overlay both curves on one chart with hover enabled.
monthly_trend = (
    hv.Curve(in_month, label='In') * hv.Curve(out_month, label='Out')
).opts(
    title="Monthly Temperature Mean",
    xlabel="Month",
    ylabel="Temperature",
    tools=['hover'],
    show_grid=True,
    width=700,
    height=300,
)

monthly_trend


In [None]:

# Truncate every timestamp to midnight of its day. The vectorised
# .dt.normalize() gives the same values as formatting each row with
# strftime and re-parsing it, without a Python-level call per row.
tsdf['daily'] = tsdf['date'].dt.normalize()
in_day = tsdf[tsdf['place'] == 'In'].groupby('daily').agg({'temp': 'mean'})
out_day = tsdf[tsdf['place'] == 'Out'].groupby('daily').agg({'temp': 'mean'})

# Daily temperature means for both places on one chart.
daily_trend = hv.Curve(in_day, label='In') * hv.Curve(out_day, label='Out')
daily_trend.opts(
    title="Daily Temperature Mean",
    ylabel="Temperature",
    xlabel="Day",
    width=700,
    height=300,
    tools=['hover'],
    show_grid=True
)

daily_trend

In [None]:
# Calendar order used to arrange the weekday means (groupby sorts names
# alphabetically).
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

in_weekday = tsdf.loc[tsdf['place'] == 'In'].groupby('weekday').agg({'temp': 'mean'})
out_weekday = tsdf.loc[tsdf['place'] == 'Out'].groupby('weekday').agg({'temp': 'mean'})

# Reorder Monday..Sunday, keeping only the weekdays present in each frame.
in_weekday = in_weekday.reindex(
    [name for name in weekdays if name in in_weekday.index]
)
out_weekday = out_weekday.reindex(
    [name for name in weekdays if name in out_weekday.index]
)

# Weekday temperature means for both places on one chart.
weekday_trend = (
    hv.Curve(in_weekday, label='In') * hv.Curve(out_weekday, label='Out')
).opts(
    title="Weekday Temperature Mean",
    xlabel="Weekday",
    ylabel="Temperature",
    tools=['hover'],
    show_grid=True,
    width=700,
    height=300,
)

weekday_trend


In [None]:
# Mean temperature per ISO week number, separately for 'In' and 'Out'.
in_weekof = tsdf.loc[tsdf['place'] == 'In'].groupby('weekofyear')[['temp']].mean()
out_weekof = tsdf.loc[tsdf['place'] == 'Out'].groupby('weekofyear')[['temp']].mean()

# Week-of-year temperature means for both places on one chart.
weekofyear_trend = (
    hv.Curve(in_weekof, label='In') * hv.Curve(out_weekof, label='Out')
).opts(
    title="Week-of-Year Temperature Mean",
    xlabel="Week of Year",
    ylabel="Temperature",
    tools=['hover'],
    show_grid=True,
    width=700,
    height=300,
)

weekofyear_trend


In [None]:
import holoviews as hv
hv.extension('bokeh')


def create_curve(data, title, ylabel, xlabel, color):
    """Build a 400x300 HoloViews curve with hover and grid enabled.

    Parameters
    ----------
    data : object accepted by ``hv.Curve``
    title, ylabel, xlabel : str
        Chart title and axis labels.
    color : str
        Line colour.

    Returns
    -------
    The styled ``hv.Curve``.
    """
    curve_style = dict(
        title=title,
        xlabel=xlabel,
        ylabel=ylabel,
        color=color,
        tools=['hover'],
        show_grid=True,
        width=400,
        height=300,
    )
    return hv.Curve(data).opts(**curve_style)

# Axis labels shared by all four panels.
ylabel, xlabel = "Temperature", "Time"

# Per-place series indexed by timestamp.
in_tsdf = tsdf.loc[tsdf['place'] == 'In'].set_index('date')
out_tsdf = tsdf.loc[tsdf['place'] == 'Out'].set_index('date')

# Raw curves.
in_trend = create_curve(in_tsdf['temp'], "[In] Temperature Trend", ylabel, xlabel, "purple")
out_trend = create_curve(out_tsdf['temp'], "[Out] Temperature Trend", ylabel, xlabel, "pink")

# Resample onto a 1-minute grid, filling each slot from the nearest reading.
in_tsdf_int = in_tsdf['temp'].resample('1min').interpolate('nearest')
out_tsdf_int = out_tsdf['temp'].resample('1min').interpolate('nearest')

in_interp_trend = create_curve(in_tsdf_int, "[In] Temperature Interpolated", ylabel, xlabel, "purple")
out_interp_trend = create_curve(out_tsdf_int, "[Out] Temperature Interpolated", ylabel, xlabel, "pink")

# 2x2 grid: raw next to interpolated for each place, with independent axes.
combined_plot = (
    in_trend + in_interp_trend + out_trend + out_interp_trend
).opts(shared_axes=False).cols(2)

combined_plot

In [None]:
# Display the aggregated time-series frame built in the cells above.
tsdf

In [None]:
import pandas as pd

# Move the 'daily' column (coerced to datetime) into the index.
# The membership guard keeps the cell re-runnable: after the first run
# 'daily' is no longer a column, and an unguarded set_index would raise
# KeyError.
if 'daily' in tsdf.columns:
    tsdf = tsdf.assign(
        daily=pd.to_datetime(tsdf['daily'], errors='coerce')
    ).set_index('daily')

# Identify duplicate indices
duplicates = tsdf.index[tsdf.index.duplicated()]
print("Duplicate indices:", duplicates)

# Keep a single row per day so the index is unique.
# NOTE(review): keep='first' retains only the first row of each day (one
# reading, from whichever place happens to come first) and discards the
# rest — confirm this is the series you intend to forecast rather than a
# daily aggregate.
tsdf = tsdf[~tsdf.index.duplicated(keep='first')]

# Ensure a continuous date range to avoid gaps in forecasting
full_date_range = pd.date_range(start=tsdf.index.min(), end=tsdf.index.max(), freq='D')

# Reindex and fill missing days with the previous day's row
tsdf = tsdf.reindex(full_date_range, method='ffill')

# Check for missing values after reindexing
missing_values = tsdf.isnull().sum()
print("Missing values after reindexing:", missing_values)


In [None]:
# Print the first five rows of the daily frame.
print("DataFrame structure:", tsdf.head(5))


In [None]:
# Index labels that repeat an earlier label.
duplicates = tsdf.index[tsdf.index.duplicated(keep='first')]
print("Duplicate indices:", duplicates)

In [None]:
# Keep the first row for every index label.
tsdf = tsdf.loc[~tsdf.index.duplicated(keep='first')]

In [None]:
# Order the rows chronologically by index.
tsdf = tsdf.sort_index()

In [None]:
# Gap-free daily calendar spanning the frame; missing days are filled from
# the preceding row.
full_date_range = pd.date_range(tsdf.index.min(), tsdf.index.max(), freq='D')
tsdf = tsdf.reindex(index=full_date_range, method='ffill')

In [None]:
# Per-column count of missing values.
missing_values = tsdf.isna().sum()
print("Missing values after reindexing:", missing_values)

In [None]:
# Keep only the first row for each repeated index label.
tsdf = tsdf.loc[~tsdf.index.duplicated(keep='first')]

In [None]:
# Only when 'daily' is still a regular column: parse it and use it as index.
if 'daily' in tsdf.columns:
    tsdf = tsdf.assign(daily=pd.to_datetime(tsdf['daily'])).set_index('daily')

In [None]:
# Daily calendar from the first to the last index value; days without a
# row take the previous row's values.
full_date_range = pd.date_range(tsdf.index.min(), tsdf.index.max(), freq='D')
tsdf = tsdf.reindex(index=full_date_range, method='ffill')

In [None]:
# Count the missing values left in each column.
missing_values = tsdf.isna().sum()
print("Missing values after reindexing:", missing_values)

In [None]:
import matplotlib.pyplot as plt
# Line plot of the daily temperature column against its date index.
ts_fig, ts_ax = plt.subplots()
ts_ax.plot(tsdf['temp'])
ts_ax.set_title("Temperature Time-Series")
ts_ax.set_xlabel("Date")
ts_ax.set_ylabel("Temperature")
plt.show()

In [None]:
# Coerce the index to datetimes and sort it chronologically.
tsdf = tsdf.set_axis(
    pd.to_datetime(tsdf.index, errors='coerce'), axis=0
).sort_index()

# Daily frequency calendar; days without a row are forward-filled.
full_date_range = pd.date_range(tsdf.index.min(), tsdf.index.max(), freq='D')
tsdf = tsdf.reindex(index=full_date_range, method='ffill')

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt

# Define SARIMA parameters
# NOTE(review): the seasonal period S is 12, while the series passed to the
# model below is reindexed to daily frequency (freq='D') — confirm that a
# 12-step season is what is intended for daily data.
sarima_order = (1, 1, 1)  # ARIMA order (p, d, q)
seasonal_order = (1, 1, 1, 12)  # Seasonal ARIMA order (P, D, Q, S)


def run_sarima(series, order, seasonal_order, prediction_periods=30):
    """Fit a SARIMAX model, plot its forecast and diagnostics, and return both.

    Parameters
    ----------
    series : time-indexed series passed to ``SARIMAX``
    order : tuple
        Non-seasonal (p, d, q) order.
    seasonal_order : tuple
        Seasonal (P, D, Q, S) order.
    prediction_periods : int, default 30
        Number of steps to forecast past the end of ``series``.

    Returns
    -------
    tuple
        ``(fitted_results, forecast)``.
    """
    # disp=False is passed through to the fit call, as before.
    fitted = SARIMAX(series, order=order, seasonal_order=seasonal_order).fit(disp=False)
    predicted = fitted.forecast(steps=prediction_periods)

    # History and forecast on one figure.
    _, forecast_ax = plt.subplots(figsize=(12, 8))
    forecast_ax.plot(series, label='Historical Data')
    forecast_ax.plot(predicted.index, predicted, linestyle='--', color='orange', label='Forecasted Data')
    forecast_ax.set_title("SARIMA Forecast")
    forecast_ax.set_xlabel("Date")
    forecast_ax.set_ylabel("Temperature")
    forecast_ax.legend()
    plt.show()

    # Diagnostic panels produced by the fitted results object.
    fitted.plot_diagnostics(figsize=(12, 8))
    plt.show()

    return fitted, predicted


In [None]:
# Fit on the daily temperature column and forecast 30 steps ahead.
sarima_fit, forecast = run_sarima(
    series=tsdf['temp'],
    order=sarima_order,
    seasonal_order=seasonal_order,
    prediction_periods=30,
)

# SARIMA forecast: history and forecast drawn again on a separate figure.
_, overall_ax = plt.subplots(figsize=(12, 8))
overall_ax.plot(tsdf['temp'], label='Historical Data')
overall_ax.plot(forecast.index, forecast, linestyle='--', color='orange', label='Forecasted Data')
overall_ax.set_title("SARIMA Forecast")
overall_ax.set_xlabel("Date")
overall_ax.set_ylabel("Temperature")
overall_ax.legend()
plt.show()


In [None]:
# Fit the same specification directly so the model and results objects are
# available at notebook level.
# disp=False matches the fit call inside run_sarima and keeps the
# optimiser's convergence log out of the cell output.
sarima_model = SARIMAX(tsdf['temp'], order=sarima_order, seasonal_order=seasonal_order)
sarima_fit = sarima_model.fit(disp=False)

# Check residuals and model fit
sarima_fit.plot_diagnostics(figsize=(12, 8))
plt.show()


In [None]:
# Residuals of the fitted model plotted over time.
residuals = sarima_fit.resid
_, resid_ax = plt.subplots(figsize=(12, 6))
resid_ax.plot(residuals, label='Residuals')
resid_ax.set_title("SARIMA Model Residuals")
resid_ax.set_xlabel("Date")
resid_ax.set_ylabel("Residual")
plt.show()

In [None]:
# Report the first and last index value of the daily frame.
print("Date range in the DataFrame:")
print(
    "Min date:", tsdf.index.min()
)
print(
    "Max date:", tsdf.index.max()
)

In [None]:
# Continuous daily calendar between the first and last index value.
full_date_range = pd.date_range(tsdf.index.min(), tsdf.index.max(), freq='D')

# Align the frame to that calendar, forward-filling days without a row.
tsdf = tsdf.reindex(index=full_date_range, method='ffill')

In [None]:
# SARIMA parameters
sarima_order = (1, 1, 1)  # ARIMA order (p, d, q)
seasonal_order = (1, 1, 1, 12)  # Seasonal ARIMA order (P, D, Q, S)

# Build each place's daily series from the cleaned readings in `df`.
# By this point `tsdf` has been reduced to one row per day (duplicate index
# labels dropped with keep='first'), so filtering it by place left each
# place with only the days whose first row belonged to it. The shared
# calendar also ran from the 'In' minimum to the 'Out' maximum.
# Here every place gets the mean of all its readings per calendar day, on
# its own first-to-last-day range; days without readings are forward-filled.
df_in = (
    df.loc[df['place'] == 'In']
    .set_index('date')[['temp']]
    .sort_index()
    .resample('D')
    .mean()
    .ffill()
)
df_out = (
    df.loc[df['place'] == 'Out']
    .set_index('date')[['temp']]
    .sort_index()
    .resample('D')
    .mean()
    .ffill()
)


In [None]:
# Forecast 30 steps ahead for the 'In' temperature series.
sarima_fit_in, forecast_in = run_sarima(
    series=df_in['temp'],
    order=sarima_order,
    seasonal_order=seasonal_order,
    prediction_periods=30,
)

# Forecast 30 steps ahead for the 'Out' temperature series.
sarima_fit_out, forecast_out = run_sarima(
    series=df_out['temp'],
    order=sarima_order,
    seasonal_order=seasonal_order,
    prediction_periods=30,
)