# Create Logistic Model - Handle Imbalances

## Load Modules

In [230]:
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, TomekLinks
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight

import seaborn as sns

### Load the Dataset into a Pandas DataFrame

In [231]:
# Location of the cleaned dataset CSV that the rest of the notebook reads.
# NOTE(review): this is a hard-coded absolute path into one user's home directory, so the
# notebook cannot run on another machine without editing it — consider a path relative to
# the repository (or a configurable DATA_DIR).
data_file_path = "/Users/lancehester/Documents/ml_handling_imbalanced_data/data/clean_data.csv"

In [232]:
# Read the CSV at data_file_path into a DataFrame; every later cell derives from `df`.
df = pd.read_csv(data_file_path)

In [233]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(5)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,class,sex_I,sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,False,True
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,False,True
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0,False,False
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,False,True
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,True,False


----

# Start Build the Binary Logistic Classifier

* `stratify` keeps the proportions of the `class` column the same in both the train and test dataframes
* 0.2 or 20% of the dataset is used for testing
* random state helps us to see same result each time this is run

In [234]:
# Hold out 20% of the rows for testing. Stratifying on "class" keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=999,
    stratify=df["class"],
)


In [235]:
# Model inputs: every column of the training frame except the target column "class".
features = df_train.columns.drop("class")

In [236]:
# Display the held-out test frame (pandas truncates the middle rows of long frames).
df_test

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,class,sex_I,sex_M
2883,0.505,0.400,0.125,0.5605,0.2255,0.1435,0.1700,0,True,False
800,0.450,0.335,0.125,0.3490,0.1190,0.1055,0.1150,0,False,True
2005,0.415,0.325,0.110,0.3160,0.1385,0.0795,0.0925,0,True,False
455,0.575,0.470,0.140,0.8375,0.3485,0.1735,0.2400,0,False,True
1008,0.620,0.510,0.180,1.3315,0.5940,0.2760,0.3880,0,False,False
...,...,...,...,...,...,...,...,...,...,...
2691,0.650,0.520,0.175,1.2655,0.6150,0.2775,0.3360,0,False,True
1695,0.630,0.485,0.165,1.2330,0.6565,0.2315,0.3035,0,False,True
436,0.360,0.275,0.095,0.2170,0.0840,0.0435,0.0900,0,True,False
1941,0.635,0.485,0.165,1.2695,0.5635,0.3065,0.3395,0,False,False


In [237]:
# Check that stratification worked by looking at the proportion of each class in the
# training dataframe (compare with the test dataframe in the next cell).
df_train["class"].value_counts(normalize=True)

class
0    0.992213
1    0.007787
Name: proportion, dtype: float64

In [238]:
# Class proportions in the test dataframe; they should closely match the training split.
df_test["class"].value_counts(normalize=True)

class
0    0.992814
1    0.007186
Name: proportion, dtype: float64

----

# Handling Imbalanced Datasets

* Oversampling (i.e., increase the number of minority class sets to rival proportion of majority class)

* Undersampling (i.e., decrease the number of majority class sets to rival proportion of minority class)

* Combining Oversampling and Undersampling

* Weighing the Classes Differently

* Changing Algorithms
----

#### --- Oversampling - Simple Random Oversampling

Take copies **with replacement**

* use pandas 
* use imbalanced-learn


In [239]:
# NOTE(review): a cell only displays its last expression, so the len(df_train) result
# below is computed but never shown.
len(df_train)
df_train["class"].value_counts()  # absolute row counts per class in the training set

class
0    3313
1      26
Name: count, dtype: int64

In [240]:
# Boolean mask over the training rows, True where class == 1 (Positive).
# Despite its name this is a mask, not a count; calling .sum() on it gives the number of positives.
current_num_positive = df_train["class"] == 1 # Class == 1 == Positive
current_num_positive.shape  # one entry per training row

(3339,)

In [241]:
# Simple random oversampling with plain pandas: draw minority rows with replacement.
current_num_positive = df_train["class"] == 1  # boolean mask, True where Class == 1 (Positive)

# Number of extra positive rows needed so both classes end up the same size:
# (rows that are not positive) - (rows that are positive) == len(df_train) - 2 * positives
num_not_positive = (~current_num_positive).sum()
num_to_oversample = num_not_positive - current_num_positive.sum()

positive_rows = df_train.loc[current_num_positive]
df_positive_oversample = positive_rows.sample(n=num_to_oversample, replace=True, random_state=999)

# pd.concat stacks the sampled rows underneath the original training rows
# (a row-wise append that keeps the original indices; it is not a join).
df_train_oversample = pd.concat([df_train, df_positive_oversample])
df_train_oversample["class"].value_counts()  # verify both classes now have the same count

class
0    3313
1    3313
Name: count, dtype: int64

In [242]:
# Peek at the first two rows of the oversampled training frame.
df_train_oversample.head(2)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,class,sex_I,sex_M
457,0.36,0.28,0.105,0.199,0.0695,0.045,0.08,0,True,False
3059,0.63,0.495,0.18,1.31,0.495,0.295,0.4695,0,False,True


In [243]:
# Count rows that are exact copies of an earlier row: sampling with replacement
# duplicates the minority rows many times over.
df_train_oversample.duplicated().sum() #see we have duplicated this many rows.

np.int64(3287)

In [244]:
# Fit logistic regression on the balanced (pandas-oversampled) training set to see how
# fixing the imbalance changes the classifier, then score the untouched test set.
X_train_oversample = df_train_oversample[features]
y_train_oversample = df_train_oversample["class"]

logistic_classifier = LogisticRegression(random_state=999).fit(X_train_oversample, y_train_oversample)

# predict_proba returns one column per class; column 1 is the probability of the Positive class (Class = 1)
positive_class_proba = logistic_classifier.predict_proba(df_test[features])
y_pred = positive_class_proba[:, 1]



In [245]:
# Show the predicted Positive-class probabilities for the test rows.
# NOTE(review): this prints the entire array; a slice such as y_pred[:10] would keep the output short.
y_pred

array([2.09907933e-01, 2.76187767e-01, 1.03570085e-01, 4.35112261e-01,
       5.22016253e-01, 4.88436783e-01, 8.38664353e-01, 1.38384639e-01,
       8.16057635e-02, 1.83765975e-01, 8.10406703e-02, 2.26678947e-01,
       3.20730486e-01, 9.84910379e-02, 1.94785695e-01, 2.89706667e-01,
       2.25986003e-01, 1.17567675e-01, 2.06352161e-01, 2.46865789e-01,
       4.98698408e-01, 6.14789115e-01, 7.43350220e-01, 3.73908413e-01,
       3.17560087e-01, 9.22186098e-02, 2.12755871e-01, 1.20645524e-01,
       4.69316584e-02, 1.57436948e-01, 4.04000605e-01, 1.16337672e-01,
       6.55898141e-01, 4.08658526e-01, 5.33538895e-01, 2.41522112e-01,
       2.83621559e-01, 1.90606925e-01, 4.93535836e-02, 3.72714442e-01,
       2.00178010e-01, 1.97893085e-01, 8.89165262e-01, 3.78327287e-01,
       7.03163578e-01, 7.38607998e-02, 1.18361572e-01, 2.62637722e-01,
       2.76820769e-01, 3.14652255e-01, 3.23008909e-01, 4.37748965e-01,
       1.54938081e-01, 6.29276244e-01, 2.70297285e-01, 6.14007523e-02,
      

In [246]:
# ROC AUC of the Positive-class probabilities against the true test labels
# (model trained on the pandas-oversampled data).
roc_auc_score(df_test["class"], y_pred)

np.float64(0.9225975070365903)

#### Performing Simple Random Oversampling with `imbalanced-learn`


In [247]:
# Same simple random oversampling, this time delegated to imbalanced-learn's RandomOverSampler.
random_over_sampler = RandomOverSampler(random_state=999)
# fit_resample returns the resampled feature frame and label series.
X_resampled, y_resampled = random_over_sampler.fit_resample(df_train[features], df_train["class"])
y_resampled.value_counts()  # class counts after resampling

class
0    3313
1    3313
Name: count, dtype: int64

In [248]:
# Evaluate the imbalanced-learn RandomOverSampler result: train on the X_resampled /
# y_resampled pair produced by fit_resample in the previous cell. Fitting on
# df_train_oversample here would only repeat the pandas experiment and leave the
# imbalanced-learn output untested.
logistic_classifier = LogisticRegression(random_state=999) # instantiate LogisticRegression class

logistic_classifier.fit(X_resampled, y_resampled)
y_pred = logistic_classifier.predict_proba(df_test[features])[:,1] #gets prediction of the Positive class, Class =1

In [249]:
# ROC AUC on the test set for the classifier fitted in the previous cell.
roc_auc_score(df_test["class"], y_pred)

np.float64(0.9225975070365903)

### Oversampling with Shrinkage

Here we add some noise to the random sampling so that we do not get exact duplicates of sampled values. It is like adding a little bit of jitter so that the values are close, but not exactly the same.

In [250]:
# With shrinkage set, the sampler perturbs the copied minority rows (a smoothed bootstrap)
# instead of producing exact duplicates; 0.1 controls how much noise is added.
random_over_sampler = RandomOverSampler(random_state=999, shrinkage=0.1) #Adding extra noise value 0.1

In [251]:
# Resample the training data with the shrinkage-enabled sampler (overwrites X_resampled / y_resampled).
X_resampled, y_resampled = random_over_sampler.fit_resample(df_train[features], df_train["class"])

In [252]:
# Class counts after resampling with shrinkage.
y_resampled.value_counts()

class
0    3313
1    3313
Name: count, dtype: int64

In [253]:
# Check for duplicates: we expect fewer exact-duplicate feature rows than with plain
# random oversampling because shrinkage=0.1 perturbs the sampled rows.
X_resampled.duplicated().sum() #see we have duplicated this many rows.

np.int64(0)

In [254]:
# Fit on the shrinkage-oversampled data and score on the untouched test set.
# NOTE(review): the AUC recorded for this cell is below 0.5 (worse than random ranking),
# unlike every other oversampling variant here — investigate before drawing conclusions
# (e.g. how the added noise interacts with the feature scales / boolean columns; TODO confirm).
logistic_classifier = LogisticRegression(random_state=999) # instantiate LogisticRegression class

logistic_classifier.fit(X_resampled, y_resampled)
y_pred = logistic_classifier.predict_proba(df_test[features])[:,1] #gets prediction of the Positive class, Class =1

# Look at roc_auc_score
roc_auc_score(df_test["class"], y_pred)

np.float64(0.42621632488942496)

### Oversampling using Synthetic Minority Over-sampling TEchnique `(SMOTE)`

Uses nearest neighbors to make synthetic random samples to increase the minority set.

This lets you add new (synthetic) information to the training data, which helps the model learn better.

In [255]:
# Oversample the minority class with SMOTE (synthetic samples built from nearest neighbours).
smote = SMOTE(random_state=999)
X_resampled, y_resampled = smote.fit_resample(df_train[features], df_train["class"])
y_resampled.value_counts()  # class counts after resampling

class
0    3313
1    3313
Name: count, dtype: int64

In [256]:
# Fit on the SMOTE-resampled data and score on the untouched test set.
# Uses the same seed (999) as every other LogisticRegression in this notebook so that
# all techniques are compared with identically configured classifiers.
logistic_classifier = LogisticRegression(random_state=999)

logistic_classifier.fit(X_resampled, y_resampled)
y_pred = logistic_classifier.predict_proba(df_test[features])[:, 1]  # probability of the Positive class

roc_auc_score(df_test["class"], y_pred)

np.float64(0.9236027342179333)

---
---

# Undersampling 

Removes examples of the majority class to balance the data set (i.e., reduce examples). In other words, we randomly sample observations of the majority class equal to size of the minority class. The problem with this approach is that it may mean that we remove useful information from the dataset.


* **Simple random undersampling:** the basic approach of random sampling from the majority class.

* **Undersampling using K-Means:** synthesize based on the cluster centroids.

* **Undersampling using Tomek links:** detects and removes samples from Tomek links.


#### -- Simple Random Undersampling 

We take a sample from the majority class, to have the same size as the minority class. So there are risks of removing useful information from the dataset.


In [257]:
# Simple random undersampling with plain pandas
msk_positive = df_train["class"] == 1
msk_negative = df_train["class"] == 0

# Draw as many majority rows as there are minority rows (sampling WITHOUT replacement).
num_minority = msk_positive.sum()
df_negative_undersample = df_train.loc[msk_negative].sample(n=num_minority, random_state=999)

# Stack the sampled negatives on top of all the positive rows (row-wise concatenation, not a join).
df_train_undersample = pd.concat([df_negative_undersample, df_train.loc[msk_positive]])

df_train_undersample["class"].value_counts()

class
0    26
1    26
Name: count, dtype: int64

In [258]:
# Train on the pandas-undersampled frame, then score the untouched test set.
X_train_undersample = df_train_undersample[features]
y_train_undersample = df_train_undersample["class"]
logistic_classifier = LogisticRegression(random_state=999).fit(X_train_undersample, y_train_undersample)

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.72778447929232)

In [259]:
# Simple Random Undersampling Using imbalanced-learn
random_under_sampler = RandomUnderSampler(random_state=999)
# fit_resample returns the reduced feature frame and label series.
X_resampled, y_resampled = random_under_sampler.fit_resample(df_train[features], df_train["class"])

y_resampled.value_counts()  # class counts after undersampling

class
0    26
1    26
Name: count, dtype: int64

In [260]:
# Train on the RandomUnderSampler output, then score the untouched test set.
logistic_classifier = LogisticRegression(random_state=999).fit(X_resampled, y_resampled)

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.72778447929232)

Simple Random Undersampling Results

The AUC score for simple random undersampling (0.7278) is much lower than for the oversampling examples. A likely reason is that the minority class has only 26 training samples, so undersampling the majority class to match it leaves just 52 training rows and discards most of the information in the majority class.

#### -- Undersampling using `K-Means/Cluster Centroids`

Besides random sampling, we could also use the cluster centroid of the K-Means method as the new sample of the majority class. This means the new sample of the majority class is not the original data anymore. They are synthesized with cluster centroids. So the new sample should be more representative of the actual majority class data.

In [261]:
# Undersample the majority class by replacing it with K-Means cluster centroids
# (synthesized points rather than original rows).
cc = ClusterCentroids(random_state=999)
X_resampled, y_resampled = cc.fit_resample(df_train[features], df_train["class"])

y_resampled.value_counts()  # class counts after undersampling

class
0    26
1    26
Name: count, dtype: int64

In [262]:
# Train on the ClusterCentroids output, then score the untouched test set.
logistic_classifier = LogisticRegression(random_state=999).fit(X_resampled, y_resampled)

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.8021712907117008)

The AUC of K-Means/Cluster Centroid undersampling (≈0.80) is better than that of simple random undersampling (≈0.73).

#### -- Undersampling using `Tomek Links`

This method detects Tomek links and removes samples based on them.

A [Tomek link](https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis) removes unwanted overlap between classes where majority class links are removed until all minimally distanced nearest neighbor pairs are of the same class.


It is between two samples of different classes. When the two samples are the nearest neighbors of each other, they form a Tomek link. 

In our example of the binary classification problem, a Tomek link is a pair of examples from each class that is the closest neighbor across the dataset. After detecting such a link, we could remove data within the pair. Usually, we remove the sample from the majority class to achieve undersampling, i.e., remove the majority class close to the minority class. This removes ambiguity between the two classes.

So, undersampling with Tomek links cleans up the overlap between classes, making them easier to distinguish.


In [263]:
# Remove samples that take part in Tomek links (cross-class nearest-neighbour pairs).
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(df_train[features], df_train["class"])

y_resampled.value_counts()  # class counts after removing Tomek-link samples

class
0    3301
1      26
Name: count, dtype: int64

Notice that Tomek links still result in an imbalanced data set, but the separation between the two classes is made more distinct for training purposes. Namely, removing overlapping majority-class samples may help to highlight the key features of the minority class.

In practice, we would combine the Tomek link approach with other techniques to get better results, for example undersampling with Tomek links and then applying an oversampling approach.

In [264]:
# Train on the Tomek-links-cleaned data, then score the untouched test set.
logistic_classifier = LogisticRegression(random_state=999).fit(X_resampled, y_resampled)

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.7740249296340973)

---

# Combining Oversampling and Undersampling

---

#### -- SMOTE and Tomek Links



In [265]:
# Combined approach: SMOTE oversampling together with Tomek-link removal, in a single sampler.
smote_tomek = SMOTETomek(random_state=999)
X_resampled, y_resampled = smote_tomek.fit_resample(df_train[features], df_train["class"])

y_resampled.value_counts()  # class counts after the combined resampling

class
0    3310
1    3310
Name: count, dtype: int64

In [266]:
# Train on the SMOTETomek output, then score the untouched test set.
logistic_classifier = LogisticRegression(random_state=999).fit(X_resampled, y_resampled)

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.9236027342179333)

#### -- Weighing Classes Differently


In [267]:
# "balanced" class weights: n_samples / (n_classes * count(class)).
# The labels are sorted so that weights[i] is guaranteed to belong to class label i.
# Series.unique() on its own returns labels in order of first appearance, which depends
# on the row order of df_train, and later cells index weights[0] / weights[1] by class.
class_labels = df_train["class"].sort_values().unique()
weights = compute_class_weight("balanced", classes=class_labels, y=df_train["class"])
weights

array([ 0.50392394, 64.21153846])

In [268]:
# Total weight per class (row count x class weight): "balanced" weighting should make
# both classes contribute the same total.
n_class0 = (df_train["class"] == 0).sum()
n_class1 = (df_train["class"] == 1).sum()

print(n_class0 * weights[0])

print(n_class1 * weights[1])

1669.5
1669.5000000000002


In [269]:
# The weighted totals of both classes should add up to the plain number of training rows.
n_class0 = (df_train["class"] == 0).sum()
n_class1 = (df_train["class"] == 1).sum()

print(n_class0 * weights[0] + n_class1 * weights[1])

print(n_class0 + n_class1)

3339.0
3339


In [270]:
# Let the estimator reweight the classes itself (class_weight="balanced"); it is fit on the
# original, un-resampled training data.
logistic_classifier_weighted = LogisticRegression(random_state=999, class_weight="balanced")
logistic_classifier_weighted.fit(df_train[features], df_train["class"])

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier_weighted.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.9211901889827101)

In [271]:
# Hand-picked class weights: each Positive (class 1) row counts 100 times as much as a
# Negative (class 0) row during fitting.
manual_class_weights = {0: 1, 1: 100}
logistic_classifier_weighted = LogisticRegression(random_state=999, class_weight=manual_class_weights)
logistic_classifier_weighted.fit(df_train[features], df_train["class"])

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier_weighted.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.9236027342179333)

----
----

# Comparing The Original Training Dataset

In [272]:
# Baseline: fit on the original, imbalanced training data with no resampling or weighting.
features = df_train.columns.drop("class")  # every column except the target
logistic_classifier = LogisticRegression(random_state=999).fit(df_train[features], df_train["class"])

# Column 1 of predict_proba is the probability of the Positive class.
test_proba = logistic_classifier.predict_proba(df_test[features])
y_pred = test_proba[:, 1]

roc_auc_score(df_test["class"], y_pred)

np.float64(0.7734217933252916)

---------
# Comparing AUC Results for each technique

Creating a model with the original dataset, without any kind of rebalancing or weighting, yielded a logistic regression model with an AUC score of 0.7734217933252916.

The following table shows the AUC results when employing each technique in comparison to using the data set as is:

| Technique    | AUC score |
| -------- | ------- |
| Oversampling with Shrinkage  | 0.4262 |
| Simple Random Undersampling  | 0.7278 |
| Simple Random Undersampling with Imbalanced Learn  | 0.7278 |
| _Original Data Set_  | _0.7734_ |
| Tomek Links Undersampling  | 0.7740 |
| K-means/Cluster Centroid Undersampling | 0.8022 |
| Weighing Classes Differently | 0.9212 |
| Simple Random Oversampling  | 0.9226 |
| Simple Random Oversampling with Imbalanced Learn  | 0.9226 |
| Weighing Classes Differently using specific weights | 0.9236 |
| Oversampling with SMOTE  | 0.9236 |
| Combine Oversampling and Undersampling (SMOTETomek)  | 0.9236 |


Across all the techniques, the best AUC results come from the last three rows of the table, which are approximately the same: `0.9236`.