In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('balance-scale.data', 
                 names=['balance', 'var1', 'var2', 'var3', 'var4'])
 

In [2]:
df.head()

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [3]:
df.balance.value_counts()

L    288
R    288
B     49
Name: balance, dtype: int64

In [7]:
df['balance'] = [1 if b=='B' else 0 for b in df.balance]
df.balance.value_counts()

0    576
1     49
Name: balance, dtype: int64

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
y = df.balance
X = df.drop('balance', axis =1)

In [14]:
#Train Model
clf_0 = LogisticRegression().fit(X,y)

pred_y_0 = clf_0.predict(X)

print (accuracy_score(pred_y_0, y))

0.9216


In [16]:
print (np.unique(pred_y_0))

[0]


**Our Classifier is not predicting 1**

**Let's do Undersampling**

In [17]:
from sklearn.utils import resample

In [20]:
#Seperate majoirty and minority classes

df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

#Upsample minority class

df_minority_upsampled = resample(df_minority,
                                replace = True,
                                n_samples = 576, random_state = 123)
#Combining majority and upsampled_data

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

#Check the count
df_upsampled.balance.value_counts()

1    576
0    576
Name: balance, dtype: int64

In [22]:
#Let's build another model
y = df_upsampled["balance"]
x = df_upsampled.drop('balance', axis = 1)

clf_1 = LogisticRegression().fit(x,y)

pred_y_1 = clf_1.predict(x)

print(accuracy_score(y,pred_y_1))

0.513888888889


**Downsampling**

In [24]:
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

#Downsample Majority Class

df_majority_downsampled = resample(df_majority, replace = False, n_samples = 49, random_state = 123)

#Concat dataset

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_downsampled.balance.value_counts()

1    49
0    49
Name: balance, dtype: int64

In [30]:
x = df_downsampled.drop('balance', axis = 1)
y = df_downsampled["balance"]

# Train model

clf_2 = LogisticRegression().fit(x,y)

pred_2 = clf_2.predict(x)

print( np.unique( pred_2 ) )

print(accuracy_score(y, pred_2))

[0 1]
0.581632653061


In [31]:
from sklearn.metrics import roc_auc_score

prob_y_2 = clf_2.predict_proba(x)

#Keep only positive class

prob_y_2 = [p[1] for p in prob_y_2]

prob_y_2

[0.45419197226479691,
 0.48205962213283965,
 0.46862327066392495,
 0.4786837883268909,
 0.58143856820159612,
 0.5583764990310689,
 0.42669871325264447,
 0.58097288479899123,
 0.48309804296998415,
 0.56235240454957103,
 0.46930663687445212,
 0.44476205428675314,
 0.57712901148359219,
 0.58155988279565041,
 0.55405268351629622,
 0.53911366155306062,
 0.4458303368422461,
 0.45918704415678591,
 0.5577817087202307,
 0.44539874632238285,
 0.42665806467540329,
 0.44646732846302473,
 0.53527568074276366,
 0.52007720575126049,
 0.54959727622020293,
 0.4497297538383348,
 0.52119805966267663,
 0.44531665479453131,
 0.44103505080436944,
 0.47708585493846289,
 0.57170836584701867,
 0.47932711938768546,
 0.46918248269515189,
 0.43098919210118869,
 0.56298674733767673,
 0.47418999697551056,
 0.49205837538425878,
 0.48828357946262607,
 0.57225811394655135,
 0.49712294718990319,
 0.53859695920545214,
 0.48201813383473585,
 0.52569195103186839,
 0.51003932526293161,
 0.45921915922317635,
 0.436639570156

In [32]:
print (roc_auc_score(y,prob_y_2))

0.568096626406


In [36]:
#AUC ROC of Imbalanced Dataset

prob_y_0 = clf_0.predict_proba(x)

prob_y_0 = [p[0] for p in prob_y_0]

print (roc_auc_score(y,prob_y_0))

0.522698875469


**Penalize Algorithms**

In [42]:
from sklearn.svm import SVC
x = df.drop('balance', axis =1)
y = df.balance

#Train Model

clf_3 = SVC(kernel ='linear', class_weight = 'balanced', probability = True)
clf_3.fit(x,y)

pred_y_3 = clf_3.predict(x)

print (accuracy_score(y,pred_y_3))

pred_y_3 = clf_3.predict_proba(x)

pred_y_3 = [p[0] for p in pred_y_3]

print(roc_auc_score(y,pred_y_3))

0.688
0.5305236678


**Tree Based Algorithm**

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
y = df.balance
x = df.drop('balance', axis = 1)

#Train Model

clf_4 = RandomForestClassifier()
clf_4.fit(x,y)

pred_4 = clf_4.predict(x)

print( np.unique( pred_4 ) )

print (accuracy_score(y,pred_4))


[0 1]
0.9808


In [47]:
prob_y_4 = clf_4.predict_proba(x)
prob_y_4 = [p[1] for p in prob_y_4]
print (roc_auc_score(y,prob_y_4))


0.999840561224
