In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE


  from pandas_profiling import ProfileReport


In [2]:
#Read and Clean data 
def read_data(file_path):
    df = pd.read_csv(file_path)
    drop_cols = ['REF_NO', 'occupation_partner','self_employed_partner','year_last_moved', 'TVarea', 'post_code', 'post_area', 'region']
    new_data = df.drop(drop_cols, axis = 1)

    #Transform the Children Column values
    new_data['children'] = new_data['children'].str.replace("Zero", "0")
    new_data['children'] = new_data['children'].str.replace("4+", "4")

    #Clean the Age Band Column
    new_data['age_band'] = new_data['age_band'].str.replace("Unknown", new_data['age_band'].mode()[0])
    
    return df, new_data

In [3]:
def transform_age(x):
    if '+' in x:
        x = x.replace('+','')
        x = x + '-'+ x
    return x

In [4]:
def create_age_band(new_data):
    new_data['age_band'] = new_data['age_band'].apply(transform_age)
    new_data = new_data.assign(lower_age = lambda x : x['age_band'].str.split("-", expand = True)[0],
               upper_age = lambda x : x['age_band'].str.split("-", expand = True)[1])
    new_data['upper_age'] = new_data['upper_age'].astype('int')
    bins = [0, 20, 30, 40, 50, 60, 100]
    labels = ['<20', '<30', '<40', '<50', '<60', '>60']
    new_data['Age_Bucket'] = pd.cut(new_data['upper_age'], bins=bins, labels=labels, right=False)
    drop_cols = ['age_band', 'lower_age','upper_age']
    new_data.drop(labels= drop_cols, axis= 1, inplace=True)
    return new_data

In [5]:
def get_family_income(x):
    if  ("<" in x) & (">" in x):
        x = x.split(">=")[1].replace(",",'')
    elif '>=' in x:
        x = x.replace('>=','').replace(',', '')
    elif '<=' in x:
        x = x.replace('<=','').replace(',', '')
    elif '>' in x:
        x = x.replace('>', '').replace(',','')
    elif '<' in x:
        x = x.replace('<','').replace(',','')
    else:
        x
    return x   

In [6]:
def income_bucket(x):
    # x = int(x)
    if x <= 10000:
        x = "<10000"
    elif x<=15000:
        x = "<15000"
    elif x<=20000:
        x = "<2000"
    elif x<=25000:
        x = "<25000"
    elif x<=30000:
        x = "<30000"
    else:
        x = ">30000"
    return x

In [7]:
def transform_family_income(new_data):
    new_data['family_income'] = new_data['family_income'].str.replace('Unknown', new_data['family_income'].mode()[0])
    new_data['family_income_new'] = new_data['family_income'].apply(get_family_income)
    new_data['family_income_new']= new_data['family_income_new'].astype('int')
    new_data['family_income_bucket'] = new_data['family_income_new'].apply(income_bucket)
    new_data.drop(['family_income_new','family_income','status', 'occupation', 'home_status','gender'], axis= 1, inplace= True)
    #Map the self Employes column witn 1 and 0
    yes_no = {"Yes" : 1, "No": 0}
    new_data['self_employed'] = new_data['self_employed'].map(yes_no)
    new_data = pd.get_dummies(data = new_data, columns=['Age_Bucket','family_income_bucket'], drop_first= True)
    return new_data

In [8]:
#Handle Imbalance , Craete Features ad Train Model

def create_features(new_data):
    X = new_data.drop('Revenue_Grid', axis = 1)
    Y = new_data['Revenue_Grid']
    
    smote = SMOTE()
    X,Y  = smote.fit_resample(X, Y)
    x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.20, random_state=101)
    return x_train, x_test, y_train, y_test

In [9]:
def model_building(x_train,y_train):

    model = RandomForestClassifier()
    param_grid = {
              'n_estimators' : [100,150,200],
              'max_depth' : [5,10,15,20],
              'min_samples_split' : [5,10,15],
              'min_samples_leaf' : [5,10,15],
              'criterion' : ['gini', 'entropy'],
    }

    cv = GridSearchCV(estimator= model,param_grid =param_grid, n_jobs = -1, cv= 5, verbose = 3 )
    cv.fit(x_train,y_train)
    y_pred = cv.predict(x_test)

    model = RandomForestClassifier(n_estimators= cv.best_params_['n_estimators'],
                               max_depth= cv.best_params_['max_depth'],
                               min_samples_leaf= cv.best_params_['min_samples_leaf'],
                               min_samples_split= cv.best_params_['min_samples_split'])   
    model.fit(x_train,y_train)
    return model

In [10]:
def fit_model(mdel, x_test):
    
    y_pred = model.predict(x_test)
    acc_score = accuracy_score(y_pred, y_test)
    classif_report = classification_report(y_pred, y_test)
    cnf_matrix = confusion_matrix(y_pred, y_test)
    return y_pred

In [50]:
def evaluate_model(y_pred, y_test):

    acc_score = accuracy_score(y_pred, y_test)
    classif_report = classification_report(y_pred, y_test)
    cnf_matrix = confusion_matrix(y_pred, y_test)
    return acc_score,classif_report,cnf_matrix

Now Apply this on the training data 

In [12]:
data, new_data = read_data('existing_base_train.csv')

In [13]:
new_data = create_age_band(new_data)

In [14]:
new_data.head(2)

Unnamed: 0,children,status,occupation,home_status,family_income,self_employed,Average_Credit_Card_Transaction,Balance_Transfer,Term_Deposit,Life_Insurance,...,Investment_Tax_Saving_Bond,Home_Loan,Online_Purchase_Amount,gender,Investment_in_Commudity,Investment_in_Equity,Investment_in_Derivative,Portfolio_Balance,Revenue_Grid,Age_Bucket
0,2,Partner,Professional,Own Home,">=35,000",No,26.98,29.99,312.25,299.79,...,8.98,55.44,7.68,Female,151.55,81.79,136.02,360.37,2,<40
1,0,Partner,Secretarial/Admin,Own Home,">=35,000",No,35.98,74.48,0.0,99.96,...,0.0,0.0,18.99,Female,44.28,13.91,29.23,89.22,2,<60


In [15]:
new_data = transform_family_income(new_data)

In [16]:
x_train, x_test, y_train, y_test = create_features(new_data)

In [17]:
model = model_building(x_train,y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [43]:
test_df, test_new_data= read_data('existing_base_test.csv')

In [45]:
test_new_data = create_age_band(test_new_data)
test_new_data = transform_family_income(test_new_data)

#run the sample submission data 
col_names = ['REF_NO', 'Revenue_Grid']
sample_df = pd.read_csv('sample_submission.csv', header=None, names= col_names)

In [52]:
y_unseen = sample_df['Revenue_Grid']
y_pred_unseen = model.predict(test_new_data)

In [56]:
acc_score,classif_report,cnf_matrix = evaluate_model(y_pred_unseen,y_unseen)

In [58]:
acc_score

0.49286065977351057

In [62]:
cnf_matrix

array([[144, 126],
       [904, 857]], dtype=int64)

In [64]:
test_df['Predicted Results'] = y_pred_unseen

In [31]:
# y_pred,acc_score,classif_report,cnf_matrix = evaluate_model(model, test_new_data)

In [66]:
test_df.head(2)

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment_Tax_Saving_Bond,Home_Loan,Online_Purchase_Amount,gender,region,Investment_in_Commudity,Investment_in_Equity,Investment_in_Derivative,Portfolio_Balance,Predicted Results
0,697,Zero,71+,Partner,Retired,Housewife,Own Home,"<12,500, >=10,000",No,No,...,0.0,0.0,0.0,Male,South West,40.48,15.07,28.4,83.05,2
1,7897,Zero,31-35,Partner,Unknown,Business Manager,Own Home,">=35,000",No,No,...,27.45,13.47,57.46,Male,South East,27.07,72.01,82.74,235.29,2


In [81]:
merged_df = pd.merge(test_df, sample_df[['REF_NO', 'Revenue_Grid']], on='REF_NO', how='left', suffixes=('_test_df', '_sample_df'))
merged_df = merged_df[['REF_NO','Predicted Results']]

In [83]:
merged_df.to_csv('submission.csv', index= False, header=False)