<a href="https://colab.research.google.com/github/carlosdgerez/machine_learning/blob/main/notebooks/hint_imbalanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the Titanic sample dataset and preview its first rows
import pandas as pd
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/titanic.csv'
titanic = pd.read_csv(TITANIC_CSV_URL)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,No,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,Yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,Yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,Yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,No,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Check the class balance of the target: more passengers died ("No") than
# survived ("Yes"). The skew is mild, but it is enough to illustrate the point.
survival_counts = titanic['Survived'].value_counts()
survival_counts

No     549
Yes    342
Name: Survived, dtype: int64

In [3]:
from imblearn.over_sampling import RandomOverSampler
# https://imbalanced-learn.readthedocs.io/en/stable/user_guide.html


# Random oversampling draws minority-class rows with replacement until the
# minority (survived) has as many samples as the majority (died).
# The draw is random, so pin random_state: without it every fresh run
# duplicates a different set of survivors and downstream results drift.
ro = RandomOverSampler(random_state=42)

# Decide which features to use. Rows missing any of them are dropped first;
# the same list drives both steps so they cannot fall out of sync.
features = ['Sex', 'Pclass', 'Embarked']
titanic = titanic.dropna(subset=features)
X = titanic[features]
y = titanic['Survived']

# Oversample X and y in a single call so every duplicated feature row keeps
# its matching target label.
X_new, y_new = ro.fit_resample(X, y)

# Sanity checks: features and labels stay aligned, and the classes are balanced
assert len(X_new) == len(y_new)
assert y_new.value_counts().nunique() == 1, "classes are not balanced after resampling"

# Wrap the labels in a DataFrame and count them: the classes are now equal
# because the survivor rows have been duplicated.
survivors = pd.DataFrame(y_new)
survivors.value_counts()

Survived
No          549
Yes         549
dtype: int64

In [11]:
# Summarize the resampled target frame: index range, non-null count and dtype
survivors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098 entries, 0 to 1097
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  1098 non-null   object
dtypes: object(1)
memory usage: 8.7+ KB


In [4]:
# Display the feature frame selected before oversampling, for comparison
# (pandas elides the middle rows of a long frame)
X

Unnamed: 0,Sex,Pclass,Embarked
0,male,3,S
1,female,1,C
2,female,3,S
3,female,1,S
4,male,3,S
...,...,...,...
886,male,2,S
887,female,1,S
888,female,3,S
889,male,1,C


In [5]:
# Row count, index and column dtypes of the features before oversampling
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sex       889 non-null    object
 1   Pclass    889 non-null    int64 
 2   Embarked  889 non-null    object
dtypes: int64(1), object(2)
memory usage: 27.8+ KB


In [7]:
# Same summary for the oversampled features, to compare against X.info()
X_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098 entries, 0 to 1097
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sex       1098 non-null   object
 1   Pclass    1098 non-null   int64 
 2   Embarked  1098 non-null   object
dtypes: int64(1), object(2)
memory usage: 25.9+ KB


In [10]:
# Display the resampled target labels that fit_resample returned with X_new
y_new

0        No
1       Yes
2       Yes
3       Yes
4        No
       ... 
1093    Yes
1094    Yes
1095    Yes
1096    Yes
1097    Yes
Name: Survived, Length: 1098, dtype: object