In [36]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

## Loading Data

In [52]:
# Locations of the training and test CSV files, relative to the working directory.
train_file_path, test_file_path = "datatrain.csv", "datatest.csv"

# Read each split into its own DataFrame.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Last expression of the cell: render the training frame.
train_df

Unnamed: 0,id,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,1,02/04/15 17:51,23.180,27.2720,426.00,721.25,0.004793,1
1,2,02/04/15 17:51,23.150,27.2675,429.50,714.00,0.004783,1
2,3,02/04/15 17:53,23.150,27.2450,426.00,713.50,0.004779,1
3,4,02/04/15 17:54,23.150,27.2000,426.00,708.25,0.004772,1
4,5,02/04/15 17:55,23.100,27.2000,426.00,704.50,0.004757,1
...,...,...,...,...,...,...,...,...
17890,17891,02/18/15 9:15,20.815,27.7175,429.75,1505.25,0.004213,1
17891,17892,02/18/15 9:16,20.865,27.7450,423.50,1514.50,0.004230,1
17892,17893,02/18/15 9:16,20.890,27.7450,423.50,1521.50,0.004237,1
17893,17894,02/18/15 9:17,20.890,28.0225,418.75,1632.00,0.004279,1


## Data Overview

In [53]:
# Get basic information about the dataset
num_observations, num_columns = train_df.shape

# DataFrame.info() writes its report to a stream and returns None, so assigning
# its return value and printing it later only prints "None". Capture the report
# in a text buffer so it can be printed in order with the other summaries.
info_buffer = io.StringIO()
train_df.info(buf=info_buffer)
column_info = info_buffer.getvalue()

missing_data = train_df.isnull().sum()  # per-column count of null values
summary_statistics = train_df.describe()  # descriptive statistics of numeric columns

print("\nObservations: ", num_observations)
print("\nNumber of Columns: ", num_columns)
print(column_info)
print(missing_data)
print(summary_statistics)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17895 entries, 0 to 17894
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             17895 non-null  int64  
 1   date           17895 non-null  object 
 2   Temperature    17895 non-null  float64
 3   Humidity       17895 non-null  float64
 4   Light          17895 non-null  float64
 5   CO2            17895 non-null  float64
 6   HumidityRatio  17895 non-null  float64
 7   Occupancy      17895 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 1.1+ MB

Observations:  17895

Number of Columns:  8
None
id               0
date             0
Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupancy        0
dtype: int64
                 id   Temperature      Humidity         Light           CO2  \
count  17895.000000  17895.000000  17895.000000  17895.000000  17895.000000   
mean    8948.0000

## Displaying CPTs

In [57]:

# Bin width used to discretise each continuous sensor reading before building
# its conditional probability table (CPT): Temperature and Humidity in steps of
# 0.5, Light in steps of 10 units, CO2 in steps of 100 ppm.
BIN_WIDTHS = {
    "Temperature": 0.5,
    "Humidity": 0.5,
    "Light": 10,
    "CO2": 100,
}


def add_group_columns(df, bin_widths=BIN_WIDTHS):
    """Add one ``<feature>_group`` column per entry of ``bin_widths``, in place.

    Each value is floored to the lower edge of its bin:
    ``(value // width) * width``. Re-running is safe because the group columns
    are always recomputed from the raw feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every feature column named in ``bin_widths``.
    bin_widths : dict
        Mapping of feature column name to bin width.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the group columns added or overwritten.
    """
    for feature_name, width in bin_widths.items():
        df[f"{feature_name}_group"] = (df[feature_name] // width) * width
    return df


# Apply identical binning to both splits so train and test bins cannot drift apart.
add_group_columns(train_df)
add_group_columns(test_df)

# Features to condition on.
# NOTE(review): HumidityRatio is not binned, so its CPT gets one row per distinct
# raw value; consider binning it like the other features.
cpt_features = [f"{feature_name}_group" for feature_name in BIN_WIDTHS] + ["HumidityRatio"]

# Compute conditional probabilities by averaging Occupancy within each feature value
cpt_grouped = {}

for feature in cpt_features:
    # Occupancy is 0/1, so its mean within a group is P(Occupancy = 1 | group).
    p_occupied = train_df.groupby(feature)["Occupancy"].mean()
    cpt_grouped[feature] = pd.DataFrame({"P(1)": p_occupied, "P(0)": 1 - p_occupied})

# Display the grouped CPTs using pandas
for feature, cpt in cpt_grouped.items():
    print(f"\nConditional Probability Table: P(Occupancy | {feature.replace('_group', '')})")
    display(cpt)










Conditional Probability Table: P(Occupancy | Temperature)


Unnamed: 0_level_0,P(1),P(0)
Temperature_group,Unnamed: 1_level_1,Unnamed: 2_level_1
19.0,0.0,1.0
19.5,0.010455,0.989545
20.0,0.014214,0.985786
20.5,0.073204,0.926796
21.0,0.283019,0.716981
21.5,0.662475,0.337525
22.0,0.745262,0.254738
22.5,0.359756,0.640244
23.0,0.498141,0.501859
23.5,0.58194,0.41806



Conditional Probability Table: P(Occupancy | Humidity)


Unnamed: 0_level_0,P(1),P(0)
Humidity_group,Unnamed: 1_level_1,Unnamed: 2_level_1
16.5,0.0,1.0
17.0,0.0,1.0
17.5,0.0,1.0
18.0,0.0,1.0
18.5,0.009509,0.990491
19.0,0.076246,0.923754
19.5,0.274194,0.725806
20.0,0.428571,0.571429
20.5,0.454545,0.545455
21.0,0.498507,0.501493



Conditional Probability Table: P(Occupancy | Light)


Unnamed: 0_level_0,P(1),P(0)
Light_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.000088,0.999912
10.0,0.000000,1.000000
20.0,0.000000,1.000000
30.0,0.007246,0.992754
40.0,0.000000,1.000000
...,...,...
1020.0,1.000000,0.000000
1380.0,1.000000,0.000000
1450.0,0.000000,1.000000
1540.0,0.000000,1.000000



Conditional Probability Table: P(Occupancy | CO2)


Unnamed: 0_level_0,P(1),P(0)
CO2_group,Unnamed: 1_level_1,Unnamed: 2_level_1
400.0,0.006477,0.993523
500.0,0.093124,0.906876
600.0,0.26388,0.73612
700.0,0.207018,0.792982
800.0,0.517827,0.482173
900.0,0.86128,0.13872
1000.0,0.888676,0.111324
1100.0,0.679412,0.320588
1200.0,0.388462,0.611538
1300.0,0.579592,0.420408



Conditional Probability Table: P(Occupancy | HumidityRatio)


Unnamed: 0_level_0,P(1),P(0)
HumidityRatio,Unnamed: 1_level_1,Unnamed: 2_level_1
0.002674,0.0,1.0
0.002678,0.0,1.0
0.002682,0.0,1.0
0.002684,0.0,1.0
0.002684,0.0,1.0
...,...,...
0.006456,1.0,0.0
0.006461,1.0,0.0
0.006464,1.0,0.0
0.006472,1.0,0.0


# Model 1: Bayesian Model for P(Occupancy | Temperature)

In [58]:
# Recompute the 0.5-wide temperature bins on both splits so they are guaranteed
# to be present and identical before the model is built.
train_df["Temperature_group"] = train_df["Temperature"].floordiv(0.5).mul(0.5)
test_df["Temperature_group"] = test_df["Temperature"].floordiv(0.5).mul(0.5)

# Last expression of the cell: render the test frame.
test_df

Unnamed: 0,id,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,Temperature_group,Humidity_group,Light_group,CO2_group
0,140,02/02/15 14:19,23.700000,26.272000,585.200000,749.200000,0.004764,1,23.5,26.0,580.0,700.0
1,141,02/02/15 14:19,23.718000,26.290000,578.400000,760.400000,0.004773,1,23.5,26.0,570.0,700.0
2,142,02/02/15 14:21,23.730000,26.230000,572.666667,769.666667,0.004765,1,23.5,26.0,570.0,700.0
3,143,02/02/15 14:22,23.722500,26.125000,493.750000,774.750000,0.004744,1,23.5,26.0,490.0,700.0
4,144,02/02/15 14:23,23.754000,26.200000,488.600000,779.000000,0.004767,1,23.5,26.0,480.0,700.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2660,2800,02/04/15 10:38,24.290000,25.700000,808.000000,1150.250000,0.004829,1,24.0,25.5,800.0,1100.0
2661,2801,02/04/15 10:40,24.330000,25.736000,809.800000,1129.200000,0.004848,1,24.0,25.5,800.0,1100.0
2662,2802,02/04/15 10:40,24.330000,25.700000,817.000000,1125.800000,0.004841,1,24.0,25.5,810.0,1100.0
2663,2803,02/04/15 10:41,24.356667,25.700000,813.000000,1123.000000,0.004849,1,24.0,25.5,810.0,1100.0


### Compute Likelihood and Prior Probability

In [59]:
# Count training rows per (temperature bin, occupancy class); bins with no rows
# for a class get a count of 0.
temperature_counts = (
    train_df.groupby(["Temperature_group", "Occupancy"])
    .size()
    .unstack(fill_value=0)
)

# Likelihood P(Temperature | Occupancy): divide each occupancy column by its own
# total so every column sums to 1.
train_likelihood_temperature = temperature_counts / temperature_counts.sum(axis=0)


# Prior P(Occupancy): relative frequency of each class in the training data.
train_prior_occupancy = train_df["Occupancy"].value_counts(normalize=True)



### Prediction Function

In [47]:
# Posterior P(Occupancy | Temperature) via Bayes' theorem.
# Step 1: unnormalised posterior = likelihood * prior, one occupancy class at a time.
joint_probability = train_likelihood_temperature.copy()
for occupancy_class in (0, 1):
    joint_probability[occupancy_class] = (
        joint_probability[occupancy_class] * train_prior_occupancy[occupancy_class]
    )

# Step 2: divide every row by its total so that P(0) + P(1) = 1 for each bin.
row_totals = joint_probability.sum(axis=1)
bayesian_temperature_cpt_train = joint_probability.div(row_totals, axis=0)

# Label the two class columns for readability.
bayesian_temperature_cpt_train.columns = ["P(0)", "P(1)"]

# Define the Prediction Function
def predict_occupancy(temperature, cpt=None):
    """Predict occupancy (0 or 1) for a value using a posterior CPT.

    Parameters
    ----------
    temperature : float
        Temperature (or temperature-bin value) to classify.
    cpt : pandas.DataFrame, optional
        Posterior table indexed by bin value with columns ``"P(0)"`` and
        ``"P(1)"``. When omitted, the notebook-level
        ``bayesian_temperature_cpt_train`` is used (looked up at call time),
        which keeps existing one-argument calls working.

    Returns
    -------
    int
        1 if ``P(1)`` is strictly greater than ``P(0)`` for the nearest bin,
        otherwise 0 (ties resolve to 0).
    """
    if cpt is None:
        cpt = bayesian_temperature_cpt_train

    # A value may fall in a bin that has no row in the table, so use the
    # row whose bin value is closest to the input.
    nearest_pos = np.abs(cpt.index - temperature).argmin()

    # Positional lookup of that row; avoids a second label-based search.
    probabilities = cpt.iloc[nearest_pos]

    # Predict the most probable class
    return 1 if probabilities["P(1)"] > probabilities["P(0)"] else 0

### Evaluate Results

In [60]:
# Classify every test row from its temperature bin and store the result on the frame.
test_df["Predicted_Occupancy"] = test_df["Temperature_group"].apply(predict_occupancy)

# Compare predictions against the ground-truth labels.
y_true = test_df["Occupancy"]
y_pred = test_df["Predicted_Occupancy"]
accuracy = accuracy_score(y_true, y_pred)
classification_rep = classification_report(y_true, y_pred)

# Report overall accuracy followed by per-class precision / recall / F1.
print(f"\n Model Accuracy: {accuracy:.4f}")
print("\n Classification Report:\n", classification_rep)


 Model Accuracy: 0.6698

 Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.91      0.78      1693
           1       0.61      0.26      0.36       972

    accuracy                           0.67      2665
   macro avg       0.65      0.58      0.57      2665
weighted avg       0.66      0.67      0.63      2665

