# Import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif

# Read the Data

The following cell reads the <a href="https://github.com/academic-initiative/skillsbuild/blob/main/watsonx/files/iot_sensor_dataset.csv" target="_blank">iot_sensor_dataset.csv</a> data set from the GitHub repository.

In [None]:
# Load the IoT sensor dataset straight from the raw GitHub URL.
# pandas is already imported (as `pd`) in the notebook's import cell, so it is
# not re-imported here.
url = 'https://raw.githubusercontent.com/academic-initiative/skillsbuild/refs/heads/main/watsonx/files/iot_sensor_dataset.csv'
df = pd.read_csv(url)

# A bare last expression is rendered by Jupyter as a rich HTML table, which is
# easier to read than the plain-text output of print().
df.head(10)

# Data Analysis

## Are there missing values?

In [None]:
# Count the missing entries in every column (isna is the alias of isnull).
df.isna().sum()

## How balanced is our dataset?
- Our target variable is **fail**. 
- fail can either be **0** or **1**.
- In a **very** balanced dataset, the number of 1s and 0s would be about equal.
- Consider a class ratio between **60/40** and **80/20** to be mildly imbalanced.


In [None]:
# Absolute number of rows per class of the target column.
df.loc[:, 'fail'].value_counts()

In [None]:
# Share of each class of the target column, expressed as a percentage.
df['fail'].value_counts(normalize=True).mul(100)

## Is the data categorical or numerical?
At this point, the following information is unknown:
- If the predictors are categorical or numerical
- Information about the variables

In this next section, you check how many unique values exist for each variable.

In [None]:
# Number of distinct non-null values per column; low counts hint at
# categorical variables.
print(df.nunique(axis=0, dropna=True))

## Analyze the correlation between the predictors and the target variable
Without additional information, assume that the predictors are categorical, except **footfall**, **outpressure**, and **temp**.

Separate the categorical and numerical variables as well as the target. 

In [None]:
# Target column.
y = df['fail']
# Predictors treated as numerical.
X_numeric = df.loc[:, ['footfall', 'outpressure', 'temp']]
# Everything that is neither numerical nor the target is treated as categorical.
X = df.drop(columns=[*X_numeric.columns, y.name])

Encode the categorical variables by converting them to category codes.

In [None]:
# Cast every column to the pandas 'category' dtype, then replace each value by
# its integer category code.
X_encoded = X.astype('category').apply(lambda column: column.cat.codes)

Mutual information (MI) measures the dependency between two variables.

Calculate the dependency between the target and the categorical variables.

In [None]:
# MI between each encoded categorical predictor and the target; all features
# are declared discrete.
mi_scores = mutual_info_classif(
    X=X_encoded,
    y=y,
    discrete_features=True,
)

Calculate the dependencies between the target and the numerical variables.

In [None]:
# MI between each numerical predictor and the target.
# For continuous features scikit-learn adds a small amount of random noise
# before estimating MI, so the scores vary from run to run unless the random
# state is fixed. Pin it so Restart & Run All reproduces the same numbers.
mi_scores_cont = mutual_info_classif(
    X_numeric,
    y,
    discrete_features=False,
    random_state=42,
)

Print the dependencies for the numerical variables.

In [None]:
# Report the MI score of every numerical feature with four decimals.
for position, feature in enumerate(X_numeric.columns):
    score = mi_scores_cont[position]
    print(f"mi between {feature} and Target: {score:.4f}")

Print the dependencies for the categorical variables.

In [None]:
# Report the MI score of every categorical feature with four decimals.
# The scores are labelled with their column names before iterating.
for feature, score in pd.Series(mi_scores, index=X.columns).items():
    print(f"mi between {feature} and Target: {score:.4f}")

Theoretically, you could omit any features that exhibit very low mutual information with the target. However, you’ll see later that AutoAI automatically carries out this feature selection in a more sophisticated manner.

## Extract a subset of the data

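The following cell sets aside a small subset of the data (up to 10 rows) and removes it from the training data. This small dataset will be used to test the deployed model.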

In [None]:
# Row count before the hold-out rows are removed.
print(df.shape[0])

# Seed NumPy's global RNG, which the .sample() calls below draw from.
# The three .sample() calls must stay in this order to keep the same draw.
np.random.seed(42)

# Up to 3 rows with fail == 0 and up to 1 row with fail == 1.
rows_fail_0 = df[df['fail'] == 0]
rows_fail_1 = df[df['fail'] == 1]
df_fail_0 = rows_fail_0.sample(n=min(3, len(rows_fail_0)), replace=False)
df_fail_1 = rows_fail_1.sample(n=min(1, len(rows_fail_1)), replace=False)

# Up to 6 further rows drawn from whatever has not been picked yet.
already_picked = df_fail_0.index.append(df_fail_1.index)
remaining_after_fail = df.drop(index=already_picked)
df_random = remaining_after_fail.sample(n=min(6, len(remaining_after_fail)), replace=False)

# Combine the hold-out rows and remove them from the training frame.
df_selected = pd.concat([df_fail_0, df_fail_1, df_random])
df = df.drop(index=df_selected.index).reset_index(drop=True)

# The test file carries no target column.
df_test = df_selected.drop(columns='fail')

# Row counts after the split: training rows, then hold-out rows.
print(df.shape[0])
print(df_selected.shape[0])

# Save the data to your project space

1. Select the empty cell below.
1. Insert your project information: click **More > Insert project token** on the menu bar.<br/>
![ws-project.mov](https://media.giphy.com/media/jSVxX2spqwWF9unYrs/giphy.gif)

In [None]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
#
# SECURITY: never store the token in the notebook itself, because anyone who
# can read the file can then access the project. The token is taken from the
# PROJECT_ACCESS_TOKEN environment variable (a name chosen for this notebook)
# or, if that is unset, from an interactive prompt that does not echo input.
# NOTE(review): any token that was ever committed in this cell should be
# treated as compromised and revoked in the project's access-control settings.
import os
from getpass import getpass

from ibm_watson_studio_lib import access_project_or_space
from project_lib import Project

project_token = os.environ.get("PROJECT_ACCESS_TOKEN") or getpass("Project access token: ")

# The project id is an identifier, not a credential. It can be overridden with
# the PROJECT_ID environment variable; otherwise the id this notebook was
# created with is used.
# NOTE(review): Watson Studio runtimes are believed to export PROJECT_ID - confirm.
project_id = os.environ.get("PROJECT_ID", "f168d5b9-7d61-43d8-a4d8-11210b2c23b0")

# Legacy project_lib handle and its context, kept so the names `project` and
# `pc` remain available.
project = Project(project_id=project_id, project_access_token=project_token)
pc = project.project_context

# ibm_watson_studio_lib handle used by the following cells to save the CSV files.
wslib = access_project_or_space({'token': project_token})


### Save the file you will use for the AutoAI experiment

In [None]:
# Serialize the training frame to CSV bytes (no index column) and store it as
# a project data asset, replacing any existing asset of the same name.
training_csv_bytes = df.to_csv(index=False).encode()
wslib.save_data("iot_sensor_data_training.csv", training_csv_bytes, overwrite=True)

### Save the file you will use to test the application.

In [None]:
# Serialize the hold-out frame (target column already dropped) to CSV bytes
# and store it as a project data asset, replacing any existing one.
test_csv_bytes = df_test.to_csv(index=False).encode()
wslib.save_data("iot_sensor_data_test.csv", test_csv_bytes, overwrite=True)