In [1]:
# Load libraries - some that will be used in this sample #
import pickle
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV, GridSearchCV, LeaveOneOut, StratifiedShuffleSplit
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, auc, roc_curve, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV, RFE
import xgboost
from xgboost import XGBClassifier
import boto3
import io
from io import BytesIO 
import joblib
import lightgbm as lgb

# Ignore all warnings so they won't pop up #
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm
  defaults = yaml.load(f)


In [2]:
#Saving/Reading CSV datasets Files to/from S3 Location
#Source: https://stackoverflow.com/questions/43355074/read-a-csv-file-from-aws-s3-using-boto-and-pandas
def split_bucket_name(path):
    """Split an S3-style path into its object key and bucket name.

    The path is split on '/'; empty segments and any segment equal to
    's3:' are discarded. The first remaining segment is taken as the
    bucket name and the rest are re-joined with '/' to form the key.

    Parameters
    ----------
    path : str
        e.g. 's3://my-bucket/folder/file.csv'.

    Returns
    -------
    tuple
        ``(key, bucket_name)`` -- note the key comes first.

    Raises
    ------
    IndexError
        If no segment is left after filtering (e.g. an empty path).
    """
    skipped = ('s3:', '')
    segments = []
    for piece in path.split('/'):
        if piece in skipped:
            continue
        segments.append(piece)

    bucket_name = segments[0]
    key_parts = segments[1:]
    return '/'.join(key_parts), bucket_name

def get_s3_csv(file_path, **kwargs):
    """Read a CSV stored on S3 into a pandas DataFrame.

    The path is first tried as an exact object key. If that request fails,
    the path is treated as a key prefix: every object under the prefix is
    listed and the key of the last listed object whose name contains '.csv'
    is used instead. If nothing matches, the original key is requested
    again and that request's error propagates to the caller.

    Parameters
    ----------
    file_path : str
        S3 path such as 's3://bucket/key' (or a key prefix).
    **kwargs
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.
    """
    new_path, bucket_name = split_bucket_name(file_path)

    # One client is enough for both the direct attempt and the retry.
    s3 = boto3.client('s3')

    try:
        obj = s3.get_object(Key=new_path, Bucket=bucket_name)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        my_bucket = boto3.resource('s3').Bucket(bucket_name)

        # When several keys match, the last one listed wins.
        for object_summary in my_bucket.objects.filter(Prefix=new_path):
            candidate_key = str(object_summary.key)
            if '.csv' in candidate_key:
                new_path = candidate_key

        obj = s3.get_object(Key=new_path, Bucket=bucket_name)

    return pd.read_csv(obj['Body'], **kwargs)
            
#Saving/Reading SAV models Files to/from S3 Location
#Source: https://stackoverflow.com/questions/62941174/how-to-write-load-machine-learning-model-to-from-s3-bucket-through-joblib

def write_model(model_name, file_path, bucket_name):
    """Serialise an object with joblib and upload it to S3.

    Parameters
    ----------
    model_name : object
        The object to serialise (passed straight to ``joblib.dump``).
    file_path : str
        Destination object key inside the bucket.
    bucket_name : str
        Name of the target S3 bucket.
    """
    client = boto3.client('s3')
    with BytesIO() as buffer:
        joblib.dump(model_name, buffer)
        # Rewind so the upload starts from the first byte of the dump.
        buffer.seek(0)
        client.upload_fileobj(Fileobj=buffer, Bucket=bucket_name, Key=file_path)



def read_model(file_path, bucket_name):
    """Download a joblib-serialised object from S3 and deserialise it.

    Parameters
    ----------
    file_path : str
        Object key inside the bucket.
    bucket_name : str
        Name of the source S3 bucket.

    Returns
    -------
    object
        Whatever ``joblib.load`` reconstructs from the downloaded bytes.

    Notes
    -----
    ``joblib.load`` unpickles the payload, so only load objects from
    buckets you trust.
    """
    client = boto3.client('s3')
    with BytesIO() as buffer:
        client.download_fileobj(Fileobj=buffer, Bucket=bucket_name, Key=file_path)
        # Rewind so joblib reads from the start of the downloaded bytes.
        buffer.seek(0)
        return joblib.load(buffer)

def decile_analysis(decile_df):
    """Rank rows by predicted probability and summarise them per decile.

    The frame is modified IN PLACE (callers may read the ranked frame
    afterwards):

    * rows are sorted by ``y_prob`` in descending order;
    * the index is reset to 0..n-1. On the first call the previous index
      is kept in an ``index`` column; if an ``index`` column already
      exists (e.g. on a repeated call) the current index is dropped
      instead, so the function can be re-run on the same frame;
    * a float ``decile`` column is (re)written, 1 for the highest
      probabilities. Each decile holds ``ceil(n / 10)`` rows; the last
      populated decile holds the remainder, so fewer than ten deciles
      may appear for small inputs.

    Parameters
    ----------
    decile_df : pandas.DataFrame
        Must contain a ``y`` column (two distinct label values overall;
        the smaller is reported as ``zero``, the larger as ``one``) and a
        ``y_prob`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by decile with columns ``zero``, ``one``, ``min_prob``,
        ``max_prob``, ``count``, ``gain`` (percent of all ``one`` rows
        that fall in the decile), ``cum_gain`` and ``capture`` (percent
        of the decile's rows that are ``one``).

    Raises
    ------
    ValueError
        If ``decile_df`` has no rows, or if ``y`` does not contain exactly
        two distinct values (raised by the column rename).
    """
    n_rows = decile_df.shape[0]
    if n_rows == 0:
        raise ValueError('decile_analysis requires a non-empty DataFrame')

    decile_df.sort_values(by='y_prob', inplace=True, ascending=False)
    # Keep the old index as an 'index' column only the first time; on a
    # re-run pandas would otherwise add 'level_0' and then raise.
    decile_df.reset_index(inplace=True, drop='index' in decile_df.columns)

    # Rows are now labelled 0..n-1 in rank order, so integer division by the
    # bucket size yields the decile directly.
    bucket_size = int(np.ceil(n_rows / 10))
    decile_df['decile'] = (np.arange(n_rows) // bucket_size + 1).astype(float)

    qq = pd.crosstab(decile_df['decile'], decile_df['y'])
    qq.columns = ['zero', 'one']

    # Aggregate only y_prob instead of every column of the frame.
    prob_by_decile = decile_df.groupby('decile')['y_prob']
    qq['min_prob'] = prob_by_decile.min()
    qq['max_prob'] = prob_by_decile.max()
    qq['count'] = prob_by_decile.count()

    qq['gain'] = np.round(100 * qq['one'] / qq['one'].sum(), decimals=2)
    qq['cum_gain'] = np.cumsum(qq['gain'])
    qq['capture'] = np.round((100 * qq['one'] / qq['count']), 2)
    return qq

def column_index(df, query_cols):
    """Return the positional indices of ``query_cols`` within ``df.columns``.

    The column labels are argsorted and the queried labels are located in
    that sorted order with a binary search; the sorted positions are then
    mapped back to the original column positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are searched.
    query_cols : sequence
        Column labels to look up.

    Returns
    -------
    numpy.ndarray
        Integer positions, in the same order as ``query_cols``.
    """
    column_labels = df.columns.values
    sort_order = np.argsort(column_labels)
    positions_in_sorted = np.searchsorted(column_labels, query_cols, sorter=sort_order)
    return sort_order[positions_in_sorted]

In [9]:
# Bucket that holds the serialised model.
my_bucket_name = 'astro-datalake-prod-sandbox'
# Download and deserialise the saved model via read_model (which uses joblib.load).
model = read_model('group_data/njoi/model/Kids_Pack_LGBM_Overall.sav',my_bucket_name)

In [12]:
# Load the scoring base table from S3.
# NOTE(review): the path has no '.csv' suffix, so this presumably relies on
# get_s3_csv falling back to a prefix scan for a '.csv' object -- confirm.
BASE = get_s3_csv("s3://astro-datalake-prod-sandbox/Amirul/03 NJOI-Sooka/3.1 NJOI/Model/Kids_Pack/merged_table_202304")
# Replace every missing value in every column with 0; BASE is overwritten,
# so the unfilled frame is no longer available after this line.
BASE = BASE.fillna(0)

In [13]:
# Categorical columns to one-hot encode
categorical_columns = ['race', 'cust_state', 'cust_region', 'Segmentation', 'ARPU Group']

# One-hot encode the selected categorical columns
categorical_subset = pd.get_dummies(BASE[categorical_columns])

# Append the encoded columns to the main table; the original categorical
# columns are kept alongside their dummy columns
features = pd.concat([BASE, categorical_subset], axis=1)

In [14]:
# Model input columns, in the order they are passed to the model
model_input_columns = [
    'Recency',
    'Frequency',
    'ratio_total_spent_l3m_per_l12m',
    'sports_pack_purchased_l12m',
    'ratio_sports_pack_purchased_l12m',
    'ratio_3_days_purchased_l12m',
    'astro_super_sports_3_ch_group_purchased_l12m',
    'ratio_sunday_pack_purchased_l12m',
    'ratio_evening_pack_purchased_l12m',
    'ratio_afternoon_pack_purchased_l12m',
]
x = features[model_input_columns]

# Target label
y = features['purchased_pack']

In [15]:
# Sanity check: number of columns selected as model inputs.
len(x.columns)

10

In [16]:
# Inspect dtypes and non-null counts of the model inputs.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315981 entries, 0 to 315980
Data columns (total 10 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   Recency                                       315981 non-null  float64
 1   Frequency                                     315981 non-null  int64  
 2   ratio_total_spent_l3m_per_l12m                315981 non-null  float64
 3   sports_pack_purchased_l12m                    315981 non-null  float64
 4   ratio_sports_pack_purchased_l12m              315981 non-null  float64
 5   ratio_3_days_purchased_l12m                   315981 non-null  float64
 6   astro_super_sports_3_ch_group_purchased_l12m  315981 non-null  int64  
 7   ratio_sunday_pack_purchased_l12m              315981 non-null  float64
 8   ratio_evening_pack_purchased_l12m             315981 non-null  float64
 9   ratio_afternoon_pack_purchased_l12m           31

In [17]:
# Score the base once: probability of the positive class (second column of
# predict_proba). The same array is reused for the decile table and the AUC.
y_pred = model.predict_proba(x)[:,1]
# Hard class labels at a 0.5 probability threshold.
y_class = np.where(y_pred >= 0.5, 1, 0)

# Per-account frame of actual label, predicted probability and predicted class.
df_y = pd.DataFrame({
    'account_no': BASE['account_no'],
    'y': y,  # Change accordingly #
    'y_prob': y_pred,
    'y_pred': y_class,
})
# decile_analysis sorts df_y in place and adds 'index' / 'decile' columns to it.
qq = decile_analysis(df_y)
# df_y4.drop('index',axis=1).to_csv('May_Full_Base_Decile_Rank.csv')

print(qq)

print('Accuracy: ', accuracy_score(y, y_class))
print('Precision: ', precision_score(y, y_class))
# Reuse y_pred instead of running predict_proba over the whole base a second time.
print("AUC: ", roc_auc_score(y, y_pred))
print('F1: ', f1_score(y, y_class))
print('Recall: ', recall_score(y, y_class))

         zero   one  min_prob  max_prob  count   gain  cum_gain  capture
decile                                                                  
1.0     29829  1770  0.112418  1.000000  31599  14.24     14.24     5.60
2.0     29987  1612  0.070000  0.112418  31599  12.97     27.21     5.10
3.0     30095  1504  0.060000  0.070000  31599  12.10     39.31     4.76
4.0     30852   747  0.050000  0.060000  31599   6.01     45.32     2.36
5.0     30561  1038  0.030000  0.050000  31599   8.35     53.67     3.28
6.0     29688  1911  0.020000  0.030000  31599  15.38     69.05     6.05
7.0     29391  2208  0.010000  0.020000  31599  17.76     86.81     6.99
8.0     30919   680  0.000000  0.010000  31599   5.47     92.28     2.15
9.0     31113   486  0.000000  0.000000  31599   3.91     96.19     1.54
10.0    31117   473  0.000000  0.000000  31590   3.81    100.00     1.50
Accuracy:  0.9569088014785699
Precision:  0.020984665052461663
AUC:  0.5782039051176061
F1:  0.0038045068773778163
Recall:  