In [1]:
import pandas as pd

# Read the CSV file into the working DataFrame
df_main = pd.read_csv(filepath_or_buffer='IOT-temp.csv')

# Preview the top five records as this cell's output
df_main.head(n=5)

Unnamed: 0,id,room_id/id,noted_date,temp,out/in
0,__export__.temp_log_196134_bd201015,Room Admin,08-12-2018 09:30,29,In
1,__export__.temp_log_196131_7bca51bc,Room Admin,08-12-2018 09:30,29,In
2,__export__.temp_log_196127_522915e3,Room Admin,08-12-2018 09:29,41,Out
3,__export__.temp_log_196128_be0919cf,Room Admin,08-12-2018 09:29,41,Out
4,__export__.temp_log_196126_d30b72fb,Room Admin,08-12-2018 09:29,31,In


In [3]:
# Per-column count of null entries
missing_values = df_main.isna().sum()

# Summary statistics produced by describe() with its default arguments
data_description = df_main.describe()

# Show both results together as this cell's output
(missing_values, data_description)

(id            0
 room_id/id    0
 noted_date    0
 temp          0
 out/in        0
 dtype: int64,
                temp
 count  97606.000000
 mean      35.053931
 std        5.699825
 min       21.000000
 25%       30.000000
 50%       35.000000
 75%       40.000000
 max       51.000000)

In [5]:
# First and third quartiles of 'temp', taken in a single quantile call
Q1, Q3 = df_main['temp'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below Q1 and above Q3
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose temperature lies strictly outside the fences
outliers = df_main.loc[
    df_main['temp'].lt(lower_bound) | df_main['temp'].gt(upper_bound)
]

# How many rows were flagged
num_outliers = len(outliers)

# Cell output: the count together with a preview of the flagged rows
(num_outliers, outliers.head())

(0,
 Empty DataFrame
 Columns: [id, room_id/id, noted_date, temp, out/in]
 Index: [])

In [6]:
# Report the outlier count, then print a preview of at most five flagged rows
print("Number of outliers:", num_outliers)
print(outliers.head(n=5))

Number of outliers: 0
Empty DataFrame
Columns: [id, room_id/id, noted_date, temp, out/in]
Index: []


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Create a min-max scaler with its default settings
scaler = MinMaxScaler()

# Learn the range of 'temp' from the whole frame, then rescale that same column.
# The double brackets keep the input two-dimensional, as the scaler expects.
scaler.fit(df_main[['temp']])
df_main['temp_normalized'] = scaler.transform(df_main[['temp']])

# Print the top five rows, now including the new 'temp_normalized' column
print(df_main.head(n=5))

                                    id  room_id/id        noted_date  temp  \
0  __export__.temp_log_196134_bd201015  Room Admin  08-12-2018 09:30    29   
1  __export__.temp_log_196131_7bca51bc  Room Admin  08-12-2018 09:30    29   
2  __export__.temp_log_196127_522915e3  Room Admin  08-12-2018 09:29    41   
3  __export__.temp_log_196128_be0919cf  Room Admin  08-12-2018 09:29    41   
4  __export__.temp_log_196126_d30b72fb  Room Admin  08-12-2018 09:29    31   

  out/in  temp_normalized  
0     In         0.266667  
1     In         0.266667  
2    Out         0.666667  
3    Out         0.666667  
4     In         0.333333  


In [9]:
from sklearn.model_selection import train_test_split

# Target: the 'noted_date' column; feature matrix: the single scaled
# temperature column (list selector keeps X two-dimensional).
# NOTE(review): 'noted_date' is used as-is; no visible cell converts or
# encodes it — confirm this is the intended target.
y = df_main['noted_date']
X = df_main[['temp_normalized']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature partition
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (78084, 1)
Testing set size: (19522, 1)
