In [9]:
import pandas as pd
import numpy as np

# Create a dummy dataset with consistent lengths
np.random.seed(0)
dummy_data = {
    'Feature1': np.random.normal(100, 10, 100).tolist() + [np.nan, 200],  # 102 entries
    'Feature2': np.random.randint(0, 100, 102).tolist(),  # 102 entries
    'Category': ['A', 'B', 'C', 'D'] * 25 + [np.nan, 'A'],  # 102 entries
    'Target': np.random.choice([0, 1], 102).tolist()  # 102 entries
}

df_dummy = pd.DataFrame(dummy_data)
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [32]:
print(df_dummy)

       Feature1  Feature2 Category  Target
0    117.640523        32        A       1
1    104.001572        70        B       1
2    109.787380        85        C       0
3    122.408932        31        D       1
4    118.675580        13        A       0
..          ...       ...      ...     ...
97   117.858705        35        B       1
98   101.269121        30        C       1
99   104.019894        29        D       1
100         NaN        33      NaN       1
101  200.000000        18        A       0

[102 rows x 4 columns]


In [13]:
# Fill missing values: numeric columns with mean, categorical columns with mode
df_filled = df_dummy.copy()
# Numeric columns
numeric_cols = df_dummy.select_dtypes(include=np.number).columns
df_filled[numeric_cols] = df_filled[numeric_cols].fillna(df_filled[numeric_cols].mean())
# Categorical columns
df_filled['Category'] = df_filled['Category'].fillna(df_filled['Category'].mode()[0])

In [14]:
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [15]:
# Fill missing values with the mean for numeric columns
df_filled = df_dummy.fillna(df_dummy.mean())

<class 'TypeError'>: can only concatenate str (not "int") to str

In [16]:
# Fill missing values: numeric columns with mean, categorical columns with mode
df_filled = df_dummy.copy()

# Handle NUMERIC columns
numeric_cols = df_dummy.select_dtypes(include=np.number).columns
df_filled[numeric_cols] = df_filled[numeric_cols].fillna(df_filled[numeric_cols].mean())

# Handle CATEGORICAL columns
df_filled['Category'] = df_filled['Category'].fillna(df_filled['Category'].mode()[0])

In [17]:
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [18]:
# Fill missing categorical data with the mode (most frequent value)
df_filled['Category'].fillna(df_filled['Category'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled['Category'].fillna(df_filled['Category'].mode()[0], inplace=True)


In [19]:
print(df_filled.isnull().sum())  # Verify that there are no missing values

Feature1    0
Feature2    0
Category    0
Target      0
dtype: int64


In [20]:
from scipy import stats
# Calculate Z-scores for numerical features
z_scores = np.abs(stats.zscore(df_filled.select_dtypes(include=[np.number])))

In [21]:
# Remove rows with any Z-scores greater than 3 (commonly used threshold for outliers)
df_no_outliers = df_filled[(z_scores < 3).all(axis=1)]

In [22]:
print(df_no_outliers.describe())  # Verify that outliers have been removed

         Feature1    Feature2      Target
count  101.000000  101.000000  101.000000
mean   100.607824   46.029703    0.534653
std     10.079298   27.147175    0.501285
min     74.470102    0.000000    0.000000
25%     93.656779   28.000000    0.000000
50%    101.216750   41.000000    1.000000
75%    107.290906   69.000000    1.000000
max    122.697546   97.000000    1.000000


In [23]:
from sklearn.preprocessing import StandardScaler
# Scale numeric features using StandardScaler (Z-score normalization)
scaler = StandardScaler()
df_no_outliers[df_no_outliers.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df_no_outliers.select_dtypes(include=[np.number]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers[df_no_outliers.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df_no_outliers.select_dtypes(include=[np.number]))


In [24]:
print(df_no_outliers.head())  # Verify that the data has been scaled

   Feature1  Feature2 Category    Target
0  1.698298 -0.519379        A  0.932936
1  0.338384  0.887380        B  0.932936
2  0.915276  1.442679        C -1.071884
3  2.173747 -0.556399        D  0.932936
4  1.801501 -1.222759        A -1.071884


In [25]:
# One-hot encode the categorical feature
df_encoded = pd.get_dummies(df_no_outliers, columns=['Category'])

In [26]:
print(df_encoded.head())  # Verify that the categorical variable has been encoded

   Feature1  Feature2    Target  Category_A  Category_B  Category_C  \
0  1.698298 -0.519379  0.932936        True       False       False   
1  0.338384  0.887380  0.932936       False        True       False   
2  0.915276  1.442679 -1.071884       False       False        True   
3  2.173747 -0.556399  0.932936       False       False       False   
4  1.801501 -1.222759 -1.071884        True       False       False   

   Category_D  
0       False  
1       False  
2       False  
3        True  
4       False  


In [27]:
# Save the preprocessed DataFrame to a CSV file
df_encoded.to_csv('preprocessed_dummy_data.csv', index=False)

print('Preprocessed data saved as preprocessed_dummy_data.csv')

Preprocessed data saved as preprocessed_dummy_data.csv


In [28]:
print(df_encoded.isnull().sum())

Feature1      0
Feature2      0
Target        0
Category_A    0
Category_B    0
Category_C    0
Category_D    0
dtype: int64


In [29]:
print(df_encoded.describe())

           Feature1      Feature2        Target
count  1.010000e+02  1.010000e+02  1.010000e+02
mean  -2.526444e-15 -3.407615e-17 -2.418308e-17
std    1.004988e+00  1.004988e+00  1.004988e+00
min   -2.606142e+00 -1.704018e+00 -1.071884e+00
25%   -6.930755e-01 -6.674590e-01 -1.071884e+00
50%    6.071482e-02 -1.861994e-01  9.329364e-01
75%    6.663572e-01  8.503597e-01  9.329364e-01
max    2.202524e+00  1.886919e+00  9.329364e-01


In [30]:
print(df_encoded.head())

   Feature1  Feature2    Target  Category_A  Category_B  Category_C  \
0  1.698298 -0.519379  0.932936        True       False       False   
1  0.338384  0.887380  0.932936       False        True       False   
2  0.915276  1.442679 -1.071884       False       False        True   
3  2.173747 -0.556399  0.932936       False       False       False   
4  1.801501 -1.222759 -1.071884        True       False       False   

   Category_D  
0       False  
1       False  
2       False  
3        True  
4       False  


In [31]:
print(df_encoded.columns)

Index(['Feature1', 'Feature2', 'Target', 'Category_A', 'Category_B',
       'Category_C', 'Category_D'],
      dtype='object')
