In [37]:
import os
import pandas as pd
import numpy as np
import math
from sklearn.utils import resample

# Read in clean data

In [6]:
def data_reader(data_dir=None, filename='train_binary.csv'):
    """
    Read the cleaned binary-label training data into a DataFrame.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that holds the CSV file. When omitted, ``../data``
        (relative to the current working directory) is used, which is the
        location this function originally had hard-coded.
    filename : str, optional
        Name of the CSV file inside ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first CSV column used as the row index.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the file does not exist.
    """
    if data_dir is None:
        data_dir = os.path.join('../', 'data')

    train_binary_path = os.path.join(data_dir, filename)

    # index_col=0: the first column of the file is the saved row index
    return pd.read_csv(train_binary_path, index_col=0)

In [23]:
# Load the cleaned training set and preview its first five rows
train_binary = data_reader()
train_binary.head(5)

Unnamed: 0,user_id,obs_count,unique_action,total_secs_elapsed,unique_device,avg_time,date_account_created,timestamp_first_active,date_first_booking,gender,...,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,isNDF
0,00023iyk9l,40,13,867896.0,2,21697.4,2014-05-14,20140514195452,2014-05-14,-unknown-,...,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US,False
1,001wyh0pz8,90,10,282965.0,1,3144.055556,2014-04-24,20140424171216,,-unknown-,...,23,en,direct,direct,untracked,Android,Other/Unknown,-unknown-,NDF,True
2,0028jgx1x1,31,5,297010.0,2,9580.967742,2014-06-21,20140621110439,,-unknown-,...,0,en,direct,direct,linked,Moweb,Android Phone,Android Browser,NDF,True
3,002qnbzfs5,789,25,6487080.0,2,8221.901141,2014-04-26,20140426210953,2014-04-26,FEMALE,...,25,en,direct,direct,linked,iOS,iPhone,Mobile Safari,US,False
4,0035hobuyj,489,20,5724670.0,1,11706.891616,2014-01-02,20140102222454,2014-01-04,FEMALE,...,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari,US,False


# Examine Class Imbalance

In [26]:
# Tally observations per destination; the displayed counts show that the
# classes are very imbalanced (NDF and US dominate).
country_counts = train_binary.country_destination.value_counts()
country_counts

NDF      45041
US       20095
other     3655
FR        1435
IT         979
GB         731
ES         707
CA         440
DE         250
NL         247
AU         152
PT          83
Name: country_destination, dtype: int64

In [35]:
# Count of non-NDF observations.
# Select by label rather than by position, so the result does not depend on
# NDF happening to be the most frequent class (i.e. the first row returned by
# value_counts()).
nonNDF_count = country_counts.drop(labels='NDF', errors='ignore').sum()
print("There are", nonNDF_count, "non NDF observations.")
print("Which is", round((nonNDF_count * 100)/train_binary.shape[0],2), "% of observations.")
print("We could get ~60% accuracy without a model, by predicting all NDF.")

There are 28774 non NDF observations.
Which is 38.98 % of observations.
We could get ~60% accuracy without a model, by predicting all NDF.


# Upsample Non NDF Observations for Binary Classification Use

In [40]:
# Separate majority (NDF) and minority (non-NDF) classes
df_ndf = train_binary[train_binary.isNDF == True]
df_non_ndf = train_binary[train_binary.isNDF == False]

# Upsample the minority class up to the size of the majority class.
# The target size is taken from the majority frame itself: the previous
# `country_counts[0]` was an integer positional lookup on a string-labelled
# Series (deprecated in recent pandas) and only worked while NDF was the
# first row of value_counts().
# NOTE: sampling with replacement duplicates rows, so any hold-out split made
# from the upsampled frame can contain copies of its own training rows.
df_non_ndf_upsampled = resample(df_non_ndf,
                                replace=True,             # sample with replacement
                                n_samples=len(df_ndf),    # to match majority class
                                random_state=123)         # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_ndf, df_non_ndf_upsampled])

# Display new class counts
df_upsampled.isNDF.value_counts()

True     45041
False    45041
Name: isNDF, dtype: int64

In [54]:
# Compare the per-destination counts after upsampling with the original ones
new_country_counts = df_upsampled['country_destination'].value_counts()
for heading, counts in (("New class balance:\n", new_country_counts),
                        ("\nOld class balance:\n", country_counts)):
    print(heading, counts)
print("\nWe can now only get 50% accuracy by predicting all NDF.")

New class balance:
 NDF      45041
US       31391
other     5721
FR        2284
IT        1526
GB        1176
ES        1067
CA         715
NL         390
DE         378
AU         243
PT         150
Name: country_destination, dtype: int64

Old class balance:
 NDF      45041
US       20095
other     3655
FR        1435
IT         979
GB         731
ES         707
CA         440
DE         250
NL         247
AU         152
PT          83
Name: country_destination, dtype: int64

We can now only get 50% accuracy by predicting all NDF.


In [56]:
# Persist the balanced frame; this file can be used as the final training data.
# The row index is written as the first CSV column (to_csv default).
df_upsampled.to_csv(path_or_buf='../data/upsampled_train_binary.csv')