# Import the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif

# Read the Data

Follow these steps to read the data from the project:

1. Download the <a href="https://github.com/academic-initiative/skillsbuild/blob/main/watsonx/files/iot_sensor_dataset.csv" target="_blank">iot_sensor_dataset.csv</a> file, and upload it to the project.
1. Select the cell below.
1. Click the **Code snippets** icon **< / >** in the toolbar.
2. Select **Read Data**.
3. Select **Select data from project**.
   1. Select **Data asset > iot_sensor_dataset.csv**.
   2. Click **Select**.
5. From the *Load as* drop-down list, select **pandas DataFrame**.
6. Click **Insert code to cell**.
7. Rename the dataframe from `df_1` to `df`.

In [3]:
# Load the IoT sensor dataset directly from the public course repository.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
url = 'https://raw.githubusercontent.com/academic-initiative/skillsbuild/refs/heads/main/watsonx/files/iot_sensor_dataset.csv'
df = pd.read_csv(url)

# Ending the cell with a bare expression lets the notebook render the frame as
# a table instead of a line-wrapped plain-text print.
df.head(10)

   footfall  atemp  selfLR  ClinLR  DoleLR  PID  outpressure  inpressure  \
0         0      7       7       1       6    6           36           3   
1       190      1       3       3       5    1           20           4   
2        31      7       2       2       6    1           24           6   
3        83      4       3       4       5    1           28           6   
4       640      7       5       6       4    0           68           6   
5       110      3       3       4       6    1           21           4   
6       100      7       5       6       4    1           77           4   
7        31      1       5       4       5    4           21           4   
8       180      7       4       6       3    3           31           4   
9      2800      0       3       3       7    0           39           3   

   temp  fail  
0     1     1  
1     1     0  
2     1     0  
3     1     0  
4     1     0  
5     1     0  
6     1     0  
7     1     0  
8     1     0  
9  

# Data Analysis

## Are there missing values?

In [4]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

footfall       0
atemp          0
selfLR         0
ClinLR         0
DoleLR         0
PID            0
outpressure    0
inpressure     0
temp           0
fail           0
dtype: int64

## How balanced is our dataset?
- Our target variable is **fail**. 
- fail can either be **0** or **1**.
- In a **very** balanced dataset, the number of 1s and 0s would be about equal.
- Consider a class ratio between **60/40** and **80/20** to be a mild imbalance


In [5]:
# Count how many rows fall into each value of the target column.
fail_column = df['fail']
fail_column.value_counts()

fail
0    551
1    393
Name: count, dtype: int64

In [6]:
# Share of each target value, expressed as a percentage of all rows.
df['fail'].value_counts(normalize=True).mul(100)

fail
0    58.368644
1    41.631356
Name: proportion, dtype: float64

## Is the data categorical or numerical?
At this point, the following information is unknown:
- If the predictors are categorical or numerical
- Information about the variables

In this next section, you check how many unique values exist for each variable.

In [7]:
# Distinct values per column; a small count hints at a categorical variable.
distinct_per_column = df.nunique()
print(distinct_per_column)

footfall       99
atemp           8
selfLR          7
ClinLR          7
DoleLR          7
PID             7
outpressure    71
inpressure      7
temp           24
fail            2
dtype: int64


## Analyze the correlation between the predictors and the target variable
Without additional information, assume that the predictors are categorical, except **footfall**, **outpressure**, and **temp**.

Separate the categorical and numerical variables as well as the target. 

In [8]:
# Columns handled as numerical; every other predictor is treated as categorical.
numeric_columns = ['footfall', 'outpressure', 'temp']
target_column = 'fail'

y = df[target_column]
X_numeric = df[numeric_columns]
# The categorical predictors are whatever is left once the numerical columns
# and the target are removed.
X = df.drop(columns=numeric_columns + [target_column])

Encode the categorical variables by converting them to category codes.

In [9]:
def _to_category_codes(col):
    """Return the integer category codes for a single column."""
    return col.astype('category').cat.codes


# Apply the encoding column by column to the categorical predictors.
X_encoded = X.apply(_to_category_codes)

Mutual Information (mi) measures the dependency between two variables.

Calculate the dependency between the target and the categorical variables.

In [10]:
# Every encoded predictor is declared discrete for the mutual information estimate.
mi_scores = mutual_info_classif(X=X_encoded, y=y, discrete_features=True)

Calculate the dependencies between the target and the numerical variables.

In [11]:
# For continuous features the estimator adds a small amount of random noise,
# so the scores vary between runs unless random_state is fixed. Seeding it
# keeps the printed values reproducible under Restart & Run All.
mi_scores_cont = mutual_info_classif(X_numeric, y, discrete_features=False, random_state=42)

Print the dependencies for the numerical variables.

In [12]:
# Pair every numerical column with its score and report one line per column.
numeric_mi_pairs = zip(X_numeric.columns, mi_scores_cont)
for column_name, mi_value in numeric_mi_pairs:
    print(f"mi between {column_name} and Target: {mi_value:.4f}")

mi between footfall and Target: 0.0151
mi between outpressure and Target: 0.0000
mi between temp and Target: 0.0000


Print the dependencies for the categorical variables.

In [13]:
# Pair every categorical column with its score and report one line per column.
categorical_mi_pairs = zip(X.columns, mi_scores)
for column_name, mi_value in categorical_mi_pairs:
    print(f"mi between {column_name} and Target: {mi_value:.4f}")

mi between atemp and Target: 0.0065
mi between selfLR and Target: 0.2116
mi between ClinLR and Target: 0.1402
mi between DoleLR and Target: 0.0636
mi between PID and Target: 0.4031
mi between inpressure and Target: 0.0062


Theoretically, you could omit any features that exhibit very low mutual information with the target. However, you’ll see later that AutoAI automatically carries out this feature selection in a more sophisticated manner.

## Extract a subset of the data

The small dataset will be used to test the deployed model.

In [14]:
# NOTE: this cell reassigns `df`, so it is not idempotent -- running it again
# removes a further batch of rows from the training data.
print(len(df))
np.random.seed(42)  # seeds the global NumPy RNG before the sample() calls below

# Draw up to 3 rows with fail == 0 and up to 1 row with fail == 1 so that both
# target values are represented in the held-out set.
rows_fail_0 = df[df['fail'] == 0]
rows_fail_1 = df[df['fail'] == 1]
df_fail_0 = rows_fail_0.sample(n=min(3, len(rows_fail_0)), replace=False)
df_fail_1 = rows_fail_1.sample(n=min(1, len(rows_fail_1)), replace=False)

# Top up with up to 6 more rows drawn from everything not yet picked.
already_picked = df_fail_0.index.append(df_fail_1.index)
remaining_after_fail = df.drop(index=already_picked)
df_random = remaining_after_fail.sample(n=min(6, len(remaining_after_fail)), replace=False)

# Held-out rows (with label) and the same rows with the label column removed.
df_selected = pd.concat([df_fail_0, df_fail_1, df_random])
df_test = df_selected.drop(columns='fail')

# Everything that was not held out stays in `df` as the training data.
df = df.drop(index=df_selected.index).reset_index(drop=True)
print(len(df))
print(len(df_selected))

944
934
10


# Save the data to your project space

1. Select the empty cell below.
1. Insert your project information: click **More > Insert project token** on the menu bar.<br/>
![ws-project.mov](https://media.giphy.com/media/jSVxX2spqwWF9unYrs/giphy.gif)

In [15]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
#
# SECURITY: the token must never be stored as a literal in the notebook, because
# anyone who can read the file (or its version history) could access the
# project. It is read from the PROJECT_ACCESS_TOKEN environment variable, with
# an interactive prompt as a fallback. Any token that was previously committed
# in this cell should be revoked and regenerated in the project settings.
import os
from getpass import getpass

from project_lib import Project
from ibm_watson_studio_lib import access_project_or_space

project_access_token = os.environ.get('PROJECT_ACCESS_TOKEN') or getpass('Project access token: ')

project = Project(project_id='f168d5b9-7d61-43d8-a4d8-11210b2c23b0', project_access_token=project_access_token)
pc = project.project_context

# The same token authorizes the Watson Studio helper used to save data assets.
wslib = access_project_or_space({'token': project_access_token})


#### Save the file you will use for the AutoAI experiment.

In [17]:
# Serialize the training frame to CSV bytes (without the index column), then
# store it as a project data asset, replacing any existing file of that name.
training_csv_bytes = df.to_csv(index=False).encode()
wslib.save_data("iot_sensor_data_training.csv", training_csv_bytes, overwrite=True)

{'name': 'iot_sensor_data_training.csv',
 'asset_type': 'data_asset',
 'asset_id': 'c7aa928a-420c-4c7a-a042-7bb5ea4ed7f6',
 'attachment_id': '8f38d817-a4ea-4ab6-b844-fd81029d81af',
 'filepath': 'iot_sensor_data_training.csv',
 'data_size': 21354,
 'mime': 'text/csv',
 'summary': ['looked up asset',
  'selected attachment',
  'overwritten file',
  'updated attachment'],
 'access_count': 1}

Save the file you will use to test the application.

In [18]:
# Serialize the unlabeled test frame to CSV bytes (without the index column),
# then store it as a project data asset, replacing any existing file of that name.
test_csv_bytes = df_test.to_csv(index=False).encode()
wslib.save_data("iot_sensor_data_test.csv", test_csv_bytes, overwrite=True)

{'name': 'iot_sensor_data_test.csv',
 'asset_type': 'data_asset',
 'asset_id': 'c1afc06e-b5d0-457d-a7ce-29e1bd9fc36a',
 'attachment_id': '098d2a2c-97fb-46f2-98ee-08d4ff400f59',
 'filepath': 'iot_sensor_data_test.csv',
 'data_size': 278,
 'mime': 'text/csv',
 'summary': ['looked up asset',
  'selected attachment',
  'overwritten file',
  'updated attachment'],
 'access_count': 1}