# Data Cleaning

In [13]:
import pandas as pd

# Load the classification example dataset (path is relative to the notebook's
# working directory) and preview its first five rows.
df_cl = pd.read_csv(filepath_or_buffer='example_classification.csv')
df_cl.head(n=5)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0,4
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,11
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0,1
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,2
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0,4


In [8]:
import pandas as pd

# Load the regression example dataset (path is relative to the notebook's
# working directory) and preview its first five rows.
df_rg = pd.read_csv(filepath_or_buffer='example_regression.csv')
df_rg.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


## CLASSIFIER

### Output identification

In [15]:
# Name of the column the classifier has to predict.
output_name_cl = 'conversion'
# Number of rows per class, most frequent class first.
df_cl.loc[:, output_name_cl].value_counts()

conversion
0    63422
1      578
Name: count, dtype: int64

In [7]:
import math
# Collect the distinct, non-missing target labels.
# pandas' own missing-value handling (dropna) is used instead of math.isnan so
# that this also works for string or categorical labels, where math.isnan
# raises TypeError. unique() keeps first-appearance order, so the resulting
# list is stable across runs (unlike iterating over a set).
output_values = df_cl[output_name_cl].dropna().unique().tolist()

print(f'This dataset has these classes as outputs: {output_values}')

This dataset has these classes as outputs: [0, 1]


### Missing Values

##### 1) Identify the type of missing data

* **Missing Completely at Random (MCAR):** The probability of a data point being missing is entirely random and independent of any other variable in the dataset. In simpler terms, whether a value is missing or not has nothing to do with the values of other variables or the characteristics of the data point itself.

* **Missing at Random (MAR):** The probability of a data point being missing depends on the values of other variables in the dataset, but not on the missing variable itself. This means that the missingness mechanism is not entirely random, but it can be predicted based on the available information.

* **Missing Not at Random (MNAR):** The probability of a data point being missing is related to the missing value itself. This means that the reason for the missing data is informative and directly associated with the variable that is missing.

In [35]:
# Count the missing entries in each column (sum of the boolean NaN mask down the rows).
df_cl.isna().sum(axis=0)

recency            0
history_segment    0
history            0
mens               0
womens             0
zip_code           0
newbie             0
channel            0
segment            0
visit              0
conversion         0
spend              0
DM_category        0
dtype: int64

In [36]:
# `isnull` is an alias of `isna`, so this yields the same per-column missing counts.
df_cl.isnull().sum(axis='index')

recency            0
history_segment    0
history            0
mens               0
womens             0
zip_code           0
newbie             0
channel            0
segment            0
visit              0
conversion         0
spend              0
DM_category        0
dtype: int64

In [40]:
# Column whose missing entries are inspected (and imputed in the cells below).
feature_with_missing_data = 'segment'

# Build the missing-value mask once and reuse it for both views.
missing_mask_cl = df_cl[feature_with_missing_data].isna()
rows_with_missing = df_cl[missing_mask_cl]

# Only the last expression of a cell is rendered, so the row labels have to be
# printed explicitly; as a bare expression they were computed and discarded.
print(rows_with_missing.index)
rows_with_missing

Index([], dtype='int64')

##### 2) Evaluate the impact of missing data

Is the missingness causing bias or affecting the analysis?

In [41]:
# Fraction of missing entries per column: the mean of a boolean mask is the
# share of True values, i.e. missing count divided by the number of rows.
df_cl.isna().mean()

recency            0.0
history_segment    0.0
history            0.0
mens               0.0
womens             0.0
zip_code           0.0
newbie             0.0
channel            0.0
segment            0.0
visit              0.0
conversion         0.0
spend              0.0
DM_category        0.0
dtype: float64

##### 3) Choose appropriate handling strategies

**Drop missing values**

In [None]:
# Show the frame without any row that has at least one missing value.
# dropna returns a new frame; df_cl itself is not modified.
df_cl.dropna(axis=0, how='any')

**Replace missing values with mean, median or mode**

In [None]:
feature_series = df_cl[feature_with_missing_data]

# Mean and median only exist for numeric data; calling them on a text column
# such as 'segment' raises TypeError. For non-numeric features those two
# strategies are skipped and their results are left as None.
feature_is_numeric = pd.api.types.is_numeric_dtype(feature_series)
not_applicable = f"Not applicable: '{feature_with_missing_data}' is not a numeric feature"

mean_imputation = feature_series.fillna(feature_series.mean()) if feature_is_numeric else None
median_imputation = feature_series.fillna(feature_series.median()) if feature_is_numeric else None

# The mode works for any dtype, but mode() is empty when every value is
# missing; in that case there is nothing to impute with, so the column is
# returned unchanged instead of failing on .iloc[0].
feature_modes = feature_series.mode()
if feature_modes.empty:
    mode_imputation = feature_series.copy()
else:
    mode_imputation = feature_series.fillna(feature_modes.iloc[0])

print("\nImputation using Mean:")
print(mean_imputation if feature_is_numeric else not_applicable)

print("\nImputation using Median:")
print(median_imputation if feature_is_numeric else not_applicable)

print("\nImputation using Mode:")
print(mode_imputation)

**Replace missing values with forward and backward fill**

In [None]:
# Series.fillna(method=...) is deprecated in recent pandas releases; the
# dedicated ffill()/bfill() methods are the supported equivalents.
forward_fill = df_cl[feature_with_missing_data].ffill()
# Uses the last valid observation to fill missing values.

backward_fill = df_cl[feature_with_missing_data].bfill()
# Uses the next valid observation to fill missing values.

print("\nForward Fill:")
print(forward_fill)

print("\nBackward Fill:")
print(backward_fill)

**Replace missing values with interpolation**

In [None]:
feature_series = df_cl[feature_with_missing_data]

# Interpolation computes new values between numeric neighbours, so it is only
# meaningful for numeric features. For a text column such as 'segment' it is
# skipped and the results are left as None.
feature_is_numeric = pd.api.types.is_numeric_dtype(feature_series)
not_applicable = f"Not applicable: '{feature_with_missing_data}' is not a numeric feature"

linear_interpolation = feature_series.interpolate(method='linear') if feature_is_numeric else None
# Assumes a straight line between two adjacent non-missing values.

quadratic_interpolation = feature_series.interpolate(method='quadratic') if feature_is_numeric else None
# Assumes a quadratic curve that passes through three adjacent non-missing values.
# NOTE: pandas delegates method='quadratic' to SciPy, so SciPy must be installed.


print("\nLinear Interpolation:")
print(linear_interpolation if feature_is_numeric else not_applicable)

print("\nQuadratic Interpolation:")
print(quadratic_interpolation if feature_is_numeric else not_applicable)

### Change datatypes

In [None]:
feature_to_change_dtype = 'channel'
# 'channel' holds text labels (e.g. 'Phone', 'Web', 'Multichannel'), so casting
# it to int64 raises ValueError. 'category' is the fitting dtype for such a
# column; integer dtypes ('int64', 'int32') only apply to numeric features.
df_cl[feature_to_change_dtype].astype("category")

In [None]:
# Let pandas infer the best-fitting dtype for every column and list the result.
# convert_dtypes returns a new frame; df_cl keeps its original dtypes.
df_cl_best_dtypes = df_cl.convert_dtypes()
df_cl_best_dtypes.dtypes

### Class balancing

In [21]:
# Count the rows per class once and derive both extremes from the same result.
class_counts = df_cl[output_name_cl].value_counts()
max_class = class_counts.idxmax()  # label of the most frequent class
min_class = class_counts.idxmin()  # label of the least frequent class

# Select rows through output_name_cl rather than a hard-coded column name, so
# the split follows the target chosen in the "Output identification" step.
df_majority = df_cl[df_cl[output_name_cl] == max_class]
df_minority = df_cl[df_cl[output_name_cl] == min_class]

print('Maximum number of samples in class', max_class, f'with {len(df_majority)} samples')
print('Minimum number of samples in class', min_class, f'with {len(df_minority)} samples')

Maximum number of samples in class 0 with 63422 samples
Minimum number of samples in class 1 with 578 samples


In [33]:
# Share of majority-class rows that downsampling to the minority size removes.
reduction_pct = round(100 * (1 - len(df_minority) / len(df_majority)), 2)
print(f'The class {max_class} will be reduced in {reduction_pct}%')

The class 0 will be reduced in 99.09%


In [27]:
from sklearn.utils import resample

# Randomly keep as many majority-class rows as there are minority-class rows
# (sampling without replacement). The fixed random_state makes the selected
# subset reproducible.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

# drop=True discards the old row labels directly, instead of first turning
# them into an 'index' column and deleting it afterwards (which would also
# misbehave if the data ever had a real column named 'index').
df_balanced = pd.concat([df_majority_downsampled, df_minority]).reset_index(drop=True)

df_balanced

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
0,9,1) $0 - $100,29.99,0,1,Urban,1,Phone,Mens E-Mail,0,0,0.00,3
1,6,3) $200 - $350,320.66,0,1,Surburban,1,Phone,Mens E-Mail,0,0,0.00,3
2,7,3) $200 - $350,298.44,0,1,Urban,0,Web,No E-Mail,0,0,0.00,5
3,2,"6) $750 - $1,000",946.42,0,1,Urban,1,Multichannel,No E-Mail,0,0,0.00,5
4,2,5) $500 - $750,534.88,1,0,Surburban,1,Web,Mens E-Mail,0,0,0.00,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151,9,5) $500 - $750,536.80,1,1,Rural,1,Web,No E-Mail,1,1,117.84,11
1152,2,"6) $750 - $1,000",980.39,1,0,Urban,1,Phone,No E-Mail,1,1,90.57,6
1153,5,3) $200 - $350,210.12,0,1,Surburban,0,Phone,Womens E-Mail,1,1,113.44,1
1154,2,3) $200 - $350,215.61,1,0,Rural,0,Web,Mens E-Mail,1,1,99.49,2


### Split into features and output

In [34]:
# Target vector: the column selected by output_name_cl.
y = df_balanced[output_name_cl]
# Feature matrix: every other column. drop() returns a new frame, so
# df_balanced itself is left untouched.
# NOTE(review): 'visit' and 'spend' look like outcome-related columns (in the
# preview, spend is non-zero only on converted rows) - confirm they should be
# used as model features.
X = df_balanced.drop(columns=[output_name_cl])

print('Features to study and create the ML model are', X.columns.values)
print('Output class name is', y.name)

Features to study and create the ML model are ['recency' 'history_segment' 'history' 'mens' 'womens' 'zip_code' 'newbie'
 'channel' 'segment' 'visit' 'spend' 'DM_category']
Output class name is conversion


## REGRESSOR

### Output identification

In [11]:
# Name of the column the regressor has to predict.
output_name_rg = 'MEDV'
# Summary statistics (count, mean, std, quartiles, min/max) of the target.
df_rg.loc[:, output_name_rg].describe()

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: MEDV, dtype: float64