## Load and Understand Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw flotation-plant dataset from the project's data directory
# and preview the first rows to get a feel for the columns.
csv_path = "../data/MiningProcess_Flotation_Plant_Database.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,date,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,2017-03-10 01:00:00,552,1698,301953,557434,395713,100664,174,249214,253235,...,250884,457396,432962,424954,443558,502255,44637,523344,6691,131
1,2017-03-10 01:00:00,552,1698,302441,563965,397383,100672,174,249719,250532,...,248994,451891,42956,432939,448086,496363,445922,498075,6691,131
2,2017-03-10 01:00:00,552,1698,304346,568054,399668,10068,174,249741,247874,...,248071,45124,468927,43461,449688,484411,447826,458567,6691,131
3,2017-03-10 01:00:00,552,1698,304736,568665,397939,100689,174,249917,254487,...,251147,452441,458165,442865,44621,471411,43769,427669,6691,131
4,2017-03-10 01:00:00,552,1698,303369,558167,400254,100697,174,250203,252136,...,248928,452441,4529,450523,45367,462598,443682,425679,6691,131


In [3]:
# Stats for the data
# Every column is still stored as text at this point (see the dtypes check
# below), so describe() reports count/unique/top/freq rather than numeric
# statistics such as mean or std.
df.describe()


Unnamed: 0,date,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
count,737453,737453,737453,737453,737453,737453,737453,737453,737453,737453,...,737453,737453,737453,737453,737453,737453,737453,737453,737453,737453
unique,4097,278,293,409317,319416,180189,131143,105805,43675,80442,...,86819,299573,331189,322315,309264,276051,301502,295667,38696,55569
top,2017-06-16 15:00:00,6403,626,25625,534668,402246,100591,175,299927,255322,...,299487,452441,608887,60106,491406,513879,47437,479478,6544,208
freq,180,142560,142560,690,959,1735,1509,3214,13683,1487,...,3405,1013,817,989,733,709,746,905,16920,17100


In [4]:
# Check the data types
# All columns, including `date` and the measurement columns, load as
# `object`, so they need explicit conversion before any numeric analysis.
df.dtypes

date                            object
% Iron Feed                     object
% Silica Feed                   object
Starch Flow                     object
Amina Flow                      object
Ore Pulp Flow                   object
Ore Pulp pH                     object
Ore Pulp Density                object
Flotation Column 01 Air Flow    object
Flotation Column 02 Air Flow    object
Flotation Column 03 Air Flow    object
Flotation Column 04 Air Flow    object
Flotation Column 05 Air Flow    object
Flotation Column 06 Air Flow    object
Flotation Column 07 Air Flow    object
Flotation Column 01 Level       object
Flotation Column 02 Level       object
Flotation Column 03 Level       object
Flotation Column 04 Level       object
Flotation Column 05 Level       object
Flotation Column 06 Level       object
Flotation Column 07 Level       object
% Iron Concentrate              object
% Silica Concentrate            object
dtype: object

In [5]:
# Count the missing (NaN/None) entries in each column
missing = df.isna().sum()
print(missing)


date                            0
% Iron Feed                     0
% Silica Feed                   0
Starch Flow                     0
Amina Flow                      0
Ore Pulp Flow                   0
Ore Pulp pH                     0
Ore Pulp Density                0
Flotation Column 01 Air Flow    0
Flotation Column 02 Air Flow    0
Flotation Column 03 Air Flow    0
Flotation Column 04 Air Flow    0
Flotation Column 05 Air Flow    0
Flotation Column 06 Air Flow    0
Flotation Column 07 Air Flow    0
Flotation Column 01 Level       0
Flotation Column 02 Level       0
Flotation Column 03 Level       0
Flotation Column 04 Level       0
Flotation Column 05 Level       0
Flotation Column 06 Level       0
Flotation Column 07 Level       0
% Iron Concentrate              0
% Silica Concentrate            0
dtype: int64


### Clean Data Type: Date

In [35]:
# Convert the date column to datetime on a small sample of the data.
# 176 rows is just enough to see the first change of timestamp: the tail()
# below shows rows 171-173 stamped 01:00:00 and rows 174-175 stamped
# 02:00:00, i.e. `date` only has hourly resolution with many readings per hour.
SAMPLE_ROWS = 176

# Slice first, then copy and parse: only the sampled rows are duplicated and
# converted, instead of copying and parsing the whole frame and then
# discarding nearly all of it. The explicit copy keeps `df` untouched.
seconds_sample = df[:SAMPLE_ROWS].copy()
seconds_sample['date'] = pd.to_datetime(seconds_sample['date'], format='%Y-%m-%d %H:%M:%S')

seconds_sample.tail()

Unnamed: 0,date,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
171,2017-03-10 01:00:00,552,1698,3121,544312,39564,101607,16722,24995,250192,...,25118,426591,345421,465576,469843,468936,472466,416382,6691,131
172,2017-03-10 01:00:00,552,1698,311846,549713,386162,101601,167117,249686,24984,...,250291,429944,460236,467169,463064,471223,49327,448911,6691,131
173,2017-03-10 01:00:00,552,1698,314727,540131,391406,101595,167014,249269,24561,...,248873,450139,486288,455219,441813,462989,494026,481052,6691,131
174,2017-03-10 02:00:00,552,1698,317041,539673,399697,101589,16691,249291,248269,...,249774,462601,488724,441674,433629,448477,480866,489382,6706,111
175,2017-03-10 02:00:00,552,1698,32083,544922,397529,101584,166807,250115,25207,...,251598,468357,493623,4427,445156,434102,451634,470266,6706,111


Unnamed: 0,date,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,2017-03-10 01:00:00,552,1698,301953,557434,395713,100664,174,249214,253235,...,250884,457396,432962,424954,443558,502255,44637,523344,6691,131
1,2017-03-10 01:00:00,552,1698,302441,563965,397383,100672,174,249719,250532,...,248994,451891,42956,432939,448086,496363,445922,498075,6691,131
2,2017-03-10 01:00:00,552,1698,304346,568054,399668,10068,174,249741,247874,...,248071,45124,468927,43461,449688,484411,447826,458567,6691,131
3,2017-03-10 01:00:00,552,1698,304736,568665,397939,100689,174,249917,254487,...,251147,452441,458165,442865,44621,471411,43769,427669,6691,131
4,2017-03-10 01:00:00,552,1698,303369,558167,400254,100697,174,250203,252136,...,248928,452441,4529,450523,45367,462598,443682,425679,6691,131


Up to row 414173, we convert the date using 20-second increments.

### Checking Data Type: Other Columns


In [11]:
# Work on everything except the timestamp: drop `date` and list the dtypes
# of the remaining columns.
other_columns = df.drop('date', axis='columns')
print(other_columns.dtypes)

% Iron Feed                     object
% Silica Feed                   object
Starch Flow                     object
Amina Flow                      object
Ore Pulp Flow                   object
Ore Pulp pH                     object
Ore Pulp Density                object
Flotation Column 01 Air Flow    object
Flotation Column 02 Air Flow    object
Flotation Column 03 Air Flow    object
Flotation Column 04 Air Flow    object
Flotation Column 05 Air Flow    object
Flotation Column 06 Air Flow    object
Flotation Column 07 Air Flow    object
Flotation Column 01 Level       object
Flotation Column 02 Level       object
Flotation Column 03 Level       object
Flotation Column 04 Level       object
Flotation Column 05 Level       object
Flotation Column 06 Level       object
Flotation Column 07 Level       object
% Iron Concentrate              object
% Silica Concentrate            object
dtype: object


In [12]:
# Preview the remaining columns before cleaning; the values are still text.
other_columns.head()

Unnamed: 0,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 03 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,552,1698,301953,557434,395713,100664,174,249214,253235,250576,...,250884,457396,432962,424954,443558,502255,44637,523344,6691,131
1,552,1698,302441,563965,397383,100672,174,249719,250532,250862,...,248994,451891,42956,432939,448086,496363,445922,498075,6691,131
2,552,1698,304346,568054,399668,10068,174,249741,247874,250313,...,248071,45124,468927,43461,449688,484411,447826,458567,6691,131
3,552,1698,304736,568665,397939,100689,174,249917,254487,250049,...,251147,452441,458165,442865,44621,471411,43769,427669,6691,131
4,552,1698,303369,558167,400254,100697,174,250203,252136,249895,...,248928,452441,4529,450523,45367,462598,443682,425679,6691,131


In [13]:
# Replace ',' with '.' and convert to float
# The raw values are text that uses ',' as the decimal separator. Swapping
# the separator alone leaves every column as `object`, so later numeric
# operations would run on strings (e.g. min()/max() would compare
# lexicographically). Cast to float so the columns are genuinely numeric.
other_columns = other_columns.replace(',', '.', regex=True).astype(float)

other_columns.head()

Unnamed: 0,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 03 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,55.2,16.98,3019.53,557.434,395.713,10.0664,1.74,249.214,253.235,250.576,...,250.884,457.396,432.962,424.954,443.558,502.255,446.37,523.344,66.91,1.31
1,55.2,16.98,3024.41,563.965,397.383,10.0672,1.74,249.719,250.532,250.862,...,248.994,451.891,429.56,432.939,448.086,496.363,445.922,498.075,66.91,1.31
2,55.2,16.98,3043.46,568.054,399.668,10.068,1.74,249.741,247.874,250.313,...,248.071,451.24,468.927,434.61,449.688,484.411,447.826,458.567,66.91,1.31
3,55.2,16.98,3047.36,568.665,397.939,10.0689,1.74,249.917,254.487,250.049,...,251.147,452.441,458.165,442.865,446.21,471.411,437.69,427.669,66.91,1.31
4,55.2,16.98,3033.69,558.167,400.254,10.0697,1.74,250.203,252.136,249.895,...,248.928,452.441,452.9,450.523,453.67,462.598,443.682,425.679,66.91,1.31


In [14]:
# Find the minimum and maximum values for % Silica Concentrate
min_value = other_columns['% Silica Concentrate'].min()
max_value = other_columns['% Silica Concentrate'].max()