In [47]:
#Load the CSV dataset file
import pandas as pd
df = pd.read_csv('dataset_mf1.csv')

In [48]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Display columns with missing values and their counts
print(missing_values[missing_values > 0])

ChipRate             5
BF-CMratio          17
BlowFlow            16
ChipLevel4           1
T-upperExt-2         2
T-lowerExt-2         2
UCZAA               25
WhiteFlow-4          1
AAWhiteSt-4        151
AA-Wood-4            1
ChipMoisture-4       1
SteamFlow-4          1
Lower-HeatT-3        2
Upper-HeatT-3        2
ChipMass-4           1
WeakLiquorF          1
BlackFlow-2          2
WeakWashF            1
SteamHeatF-3         2
T-Top-Chips-4        1
SulphidityL-4      151
dtype: int64


In [49]:
#Print total rows and columns in original dataset
num_rows, num_columns = df.shape

print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 324
Number of columns: 23


In [50]:
print("Step 1: Handling the Missing Data\n")
print("Original Dataset Before cleaning the missing data:\n")
print(df)

df_cleaned = df.dropna()
print("\nDataset After cleaning the missing data:\n")
print(df_cleaned)

Step 1: Handling the Missing Data

Original Dataset Before cleaning the missing data:

    Observation  Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
0      31-00:00    23.10    16.520     121.717  1177.607      169.805   
1      31-01:00    27.60    16.810      79.022  1328.360      341.327   
2      31-02:00    23.19    16.709      79.562  1329.407      239.161   
3      31-03:00    23.60    16.478      81.011  1334.877      213.527   
4      31-04:00    22.90    15.618      93.244  1334.168      243.131   
..          ...      ...       ...         ...       ...          ...   
319    10-16:00    23.75    12.667      93.450  1178.252      276.955   
320     9-19:00    19.80    12.558      94.352  1184.119      297.071   
321     9-20:00    23.01    12.550      90.842  1188.517      289.826   
322     9-21:00    24.32    13.083      88.910  1192.879      318.006   
323     9-22:00    25.75    13.417      85.451  1186.342      248.312   

     T-upperExt-2   T-lowerExt-2    

In [51]:
#Check for missing values
import pandas as pd

# Load your dataset into a DataFrame
df_cleaned

# Count the number of rows with missing values
rows_with_missing_values = df_cleaned.isna().sum(axis=1)

# Count the total number of rows with missing values
total_rows_with_missing_values = (rows_with_missing_values > 0).sum()

print(f"Number of rows with missing values: {total_rows_with_missing_values}")

Number of rows with missing values: 0


In [52]:
print("Step 2: Checking for the Duplicate Values And Removing it\n")

print("Before removing duplicates:\n")
print(df_cleaned)

df_no_duplicates = df_cleaned.drop_duplicates()

print("\nAfter removing duplicates:\n")
print(df_no_duplicates)

Step 2: Checking for the Duplicate Values And Removing it

Before removing duplicates:

    Observation  Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
1      31-01:00    27.60    16.810      79.022  1328.360      341.327   
3      31-03:00    23.60    16.478      81.011  1334.877      213.527   
5       1-08:00    14.23    15.350      85.518  1171.604      198.538   
7      31-06:00    22.65    14.100      91.887  1307.852      288.989   
9      31-08:00    24.70    13.850      96.208  1334.892      362.511   
..          ...      ...       ...         ...       ...          ...   
312    31-10:00    24.40    14.117      85.998  1330.104      394.234   
317     4-16:00    17.80    16.625      78.367  1276.082      202.744   
319    10-16:00    23.75    12.667      93.450  1178.252      276.955   
320     9-19:00    19.80    12.558      94.352  1184.119      297.071   
322     9-21:00    24.32    13.083      88.910  1192.879      318.006   

     T-upperExt-2   T-lowerExt-2   

In [53]:
print("Step 3: Detecting and Removal of Outliers\n")
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import zscore

# Load your dataset into a DataFrame
df = df_no_duplicates

# Define a Z-score threshold (e.g., 3 standard deviations)
z_threshold = 3

# Calculate Z-scores for all columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs(zscore(df[numerical_cols]))

# Identify rows with outliers in any column
outlier_rows = (z_scores > z_threshold).any(axis=1)

# Remove rows with outliers
df_no_outliers = df[~outlier_rows]

print("After removing the outliers..\n")
print(df_no_outliers)

Step 3: Detecting and Removal of Outliers

After removing the outliers..

    Observation  Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
1      31-01:00    27.60    16.810      79.022  1328.360      341.327   
3      31-03:00    23.60    16.478      81.011  1334.877      213.527   
5       1-08:00    14.23    15.350      85.518  1171.604      198.538   
7      31-06:00    22.65    14.100      91.887  1307.852      288.989   
9      31-08:00    24.70    13.850      96.208  1334.892      362.511   
..          ...      ...       ...         ...       ...          ...   
290    12-01:00    19.90    11.333      87.405  1033.565      369.383   
292    12-03:00    22.00    11.858      93.199  1171.206      366.787   
294    12-05:00    19.00    12.425      92.905  1272.030      316.226   
296    12-07:00    20.50    13.358      97.662  1304.597      377.678   
298    12-09:00    20.90    15.167      84.640  1283.706      339.440   

     T-upperExt-2   T-lowerExt-2    UCZAA  WhiteF

In [54]:
print("Step 4 : Performing Data Transformation using MinMax Scaling(Normalization) Technique\n")
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

numerical_cols = df_no_outliers.select_dtypes(include=[np.number]).columns
# Initialize the scaler
scaler = MinMaxScaler()

# Fit and transform the scaler on the numerical features
df_no_outliers[numerical_cols] = scaler.fit_transform(df_no_outliers[numerical_cols])

print("Data is now transformed using Scaling technique\n")

Step 4 : Performing Data Transformation using MinMax Scaling(Normalization) Technique

Data is now transformed using Scaling technique



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers[numerical_cols] = scaler.fit_transform(df_no_outliers[numerical_cols])


In [55]:
# Saving the DataFrame with all transformations to a new CSV file

df_no_outliers.to_csv('preprocessed_dataset.csv', index=False)

In [56]:
#Load the transformed CSV dataset file
import pandas as pd
df = pd.read_csv('preprocessed_dataset.csv')
print("Thus,the data is preprocessed and cleaned successfully..\n")
print(df.head())

Thus,the data is preprocessed and cleaned successfully..

  Observation   Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
0    31-01:00  1.000000  0.998830    0.262982  0.978323     0.782530   
1    31-03:00  0.700823  0.943349    0.313389  0.999950     0.424778   
2     1-08:00  0.000000  0.754846    0.427608  0.458104     0.382820   
3    31-06:00  0.629768  0.545956    0.589016  0.910264     0.636020   
4    31-08:00  0.783096  0.504178    0.698523  1.000000     0.841831   

   T-upperExt-2   T-lowerExt-2       UCZAA  WhiteFlow-4   ...  SteamFlow-4   \
0       0.310926        0.853949  0.649558      0.359505  ...      0.320187   
1       0.307710        0.920307  0.746903      0.643685  ...      0.681516   
2       0.108887        0.730125  0.449558      0.659790  ...      0.542206   
3       0.347423        0.920947  0.506195      0.650898  ...      0.800852   
4       0.348887        0.799296  0.589381      0.412181  ...      0.500639   

   Lower-HeatT-3  Upper-HeatT-3   