In [None]:
!pip install pytorch_tabnet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
import xgboost as xgb
import torch

In [None]:
# Move into the project folder; the relative 'data/...' paths read below are resolved from here.
%cd  /content/drive/My Drive/Dreamquark_challenge

# Data preprocessing and feature engineering

In [None]:
# Load the two raw training tables (paths are relative to the current working directory).
individuals_train = pd.read_csv('data/individuals_train.csv')
requests_train = pd.read_csv('data/requests_train.csv')

First, let's check how many values are missing. As a first approach, we focus on the requests_train dataset:

In [None]:
# Percentage of missing values for each column of requests_train, computed in
# one vectorised expression (no per-row loop, no chained assignment).
c = len(requests_train)
nullDist = (requests_train.isnull().sum() * 100 / c).reset_index()
nullDist.columns = ['column_name', 'null_Percentage']

# One bar per column, labelled with the column name rather than its row number.
fig, ax = plt.subplots(figsize=(25,10))
nullDist.plot.bar(x='column_name', y='null_Percentage', ax=ax)
ax.set_xlabel('Variable')
ax.set_ylabel('percentage')

# Render the figure inline (nothing is written to disk here).
plt.show()

In [None]:
# Absolute number of missing values per column.
print(requests_train.isna().sum())

The categorical variable victim_of_violence_type has a very high rate of undefined values. Let's see how it relates to the variable victim_of_violence, which is a binary variable:

In [None]:
# Split the requests on the victim_of_violence flag ('f' / 't').
is_not_victim = requests_train['victim_of_violence']=='f'
is_victim = requests_train['victim_of_violence']=='t'
no_violence_victim_df = requests_train.loc[is_not_victim]
violence_victim_df = requests_train.loc[is_victim]

# Distinct violence types seen in each group, then the number of victims with a missing type.
non_victim_types = list(no_violence_victim_df['victim_of_violence_type'].unique())
victim_types = list(violence_victim_df['victim_of_violence_type'].unique())
print('The values of victim_violence_type for non victims ', non_victim_types)
print('The values of victim_violence_type for  victims ', victim_types)
missing_victim_types = violence_victim_df['victim_of_violence_type'].isnull().sum()
print('number of nans for victims of violence',missing_victim_types)

Thus, to encode this variable, we assign zero to the NaN values of requests that are not victims of violence, and a separate non-zero value to the NaN values of requests that are victims of violence.

In [None]:
def encode_victims_of_violence_type(df):
  """Replace victim_of_violence_type by the integer column encoded_victim_of_violence_type.

  Encoding:
    * rows whose victim_of_violence flag is 'f' get 0;
    * every other row gets 1 + the rank of its violence type among the sorted
      distinct types, a missing type being treated as the literal label 'nan'
      (so rows with an unknown type share one dedicated code).

  Args:
    df: frame holding the 'victim_of_violence' and 'victim_of_violence_type'
      columns. It is not modified.

  Returns:
    A new frame without 'victim_of_violence_type' and with
    'encoded_victim_of_violence_type' appended as the last column.
  """
  # Make missing types an explicit label so that they receive their own code.
  type_labels = df['victim_of_violence_type'].astype(object).fillna('nan').astype(str)
  # sort=True numbers the labels in sorted order.
  type_codes, _ = pd.factorize(type_labels, sort=True)
  is_non_victim = (df['victim_of_violence'] == 'f').to_numpy()
  # Vectorised and positional: no chained assignment, and no assumption that
  # the index labels are 0..n-1.
  encoded = np.where(is_non_victim, 0, type_codes + 1)
  df = df.drop(columns=['victim_of_violence_type'])
  df['encoded_victim_of_violence_type'] = encoded
  return df


In [None]:
# Categorical variables: every object-typed column of requests_train except the
# identifier, the date columns (dealt with later) and victim_of_violence_type
# (which has its own encoding).
# The builtin `object` is used because the `np.object` alias no longer exists
# in recent NumPy releases.
non_categorical = (
    'request_id',
    'answer_creation_date',
    'group_creation_date',
    'request_creation_date',
    'victim_of_violence_type',
)
categorical_val = [
    column
    for column in requests_train.select_dtypes(include=[object])
    if column not in non_categorical
]
categorical_val

In [None]:
# Let's encode these categorical variables.
def encode_categorical_variables(df,columns):
  """Return a copy of df where each listed column is replaced by integer codes.

  Within a column, the distinct non-missing values (compared as strings) are
  numbered 0..k-1 in sorted order and missing values are encoded as -1.
  Previously the -1 was assigned only after the values had been stringified
  and label-encoded, at which point nothing was null any more, so it never
  took effect.

  The codes are derived from the frame passed in: two calls on different
  frames may give the same category different codes.

  Args:
    df: input frame; it is not modified.
    columns: names of the columns to encode.

  Returns:
    A new frame with the listed columns holding int64 codes.
  """
  df = df.copy()
  for column in columns:
    values = df[column]
    # Record which entries are present before any string conversion.
    present = values.notnull().to_numpy()
    codes = np.full(len(values), -1, dtype=np.int64)
    codes[present] = pd.factorize(values[present].astype(str), sort=True)[0]
    df[column] = codes
  return df

In [None]:
# To encode the date variables we follow a simple approach: keep the year and the month.
def encode_date_variable(df,columns):
  """Replace each listed date column by '<column>_year' and '<column>_month'.

  Args:
    df: input frame; it is not modified (previously the first listed column
      was parsed in place and its two derived columns were added to the
      caller's frame).
    columns: names of the columns to parse with pd.to_datetime.

  Returns:
    A new frame without the listed columns, with the derived year/month
    columns appended in the order the columns were listed.
  """
  df = df.copy()
  for column in columns:
    parsed = pd.to_datetime(df[column])
    df[column+'_year'] = parsed.dt.year
    df[column+'_month'] = parsed.dt.month
  return df.drop(columns=list(columns))

In [None]:
def preprocess(df,cat_columns,date_columns):
  """Encode a raw requests frame and split it into features and target.

  The frame goes through the violence-type, categorical and date encoders in
  that order. 'request_id' and 'granted_number_of_nights' are then left out of
  the features, and 'granted_number_of_nights' is returned as the target.

  Returns:
    (X, y): the encoded feature frame and the target column.
  """
  encoded = encode_victims_of_violence_type(df)
  encoded = encode_categorical_variables(encoded,cat_columns)
  encoded = encode_date_variable(encoded,date_columns)
  feature_names = encoded.columns.tolist()
  for excluded in ('request_id', 'granted_number_of_nights'):
    feature_names.remove(excluded)
  return encoded[feature_names], encoded["granted_number_of_nights"]


In [None]:
# Column groups handed to preprocess, then the encoded training features and target.
date_columns = ['answer_creation_date','group_creation_date','request_creation_date']
cat_columns = categorical_val
X,y = preprocess(requests_train,cat_columns,date_columns)

# Gradient Boosting Classifier (XGBoost)

## Training

In [None]:
# Hold out 30% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shallow boosted trees with a small learning rate. Samples are weighted by
# 10**target on both the training and the validation set, and the validation
# mlogloss is the metric watched by early stopping.
xgb_model = xgb.XGBClassifier(n_estimators=10000, learning_rate=0.01, max_depth=4)
xgb_model.fit(
    X_train,
    y_train,
    sample_weight=10**y_train,
    eval_set=[(X_val, y_val)],
    sample_weight_eval_set=[10**y_val],
    eval_metric='mlogloss',
    early_stopping_rounds=100,
)

In [None]:
# Persist the fitted classifier in the project's model_zoo folder (absolute Drive path).
xgb_model.save_model('/content/drive/My Drive/Dreamquark_challenge/model_zoo/xgb_model.model')

In [None]:
# Plot the feature importances of the fitted XGBoost model.
xgb.plot_importance(xgb_model)

## Inference on test set

In [None]:
# Scorer used to evaluate predictions on the test set.
def competition_scorer(y_true, y_pred):
    """Weighted log loss in which each sample weighs 10 to the power of its true label."""
    sample_weights = 10**y_true
    return log_loss(y_true, y_pred, sample_weight=sample_weights)

In [None]:
# Load the test requests and push them through the same preprocessing as the training set.
# NOTE(review): preprocess derives the categorical codes from the frame it is
# given, so a category may not receive the same integer here as in training
# — verify before trusting the test score.
requests_test = pd.read_csv('data/requests_test.csv')
X_test , y_test = preprocess(requests_test,cat_columns,date_columns)

In [None]:
# Reload the model saved above into a raw Booster object.
PATH = '/content/drive/My Drive/Dreamquark_challenge/model_zoo/xgb_model.model'
best_model = xgb.Booster()
best_model.load_model(PATH)
print('The model has been loaded')


In [None]:
# Time the prediction call on its own, then score the predictions.
start = time.time()
preds = best_model.predict(xgb.DMatrix(X_test))
end = time.time()

score = competition_scorer(y_test, preds)
seconds_per_row = (end-start)/len(X_test)
print('time per prediction:' ,seconds_per_row)
print('The competition score on test data', score)


# Deep Learning approach

## Training

In [None]:
def preprocess_for_tabnet(X,y):
  """Convert the features and the target to the NumPy arrays handed to TabNet.

  The row index is discarded. Previously `X.reset_index()` turned the old
  index into an extra leading feature column, which also shifted every
  feature position by one.

  Args:
    X: feature frame (or anything np.array accepts).
    y: target Series; a DataFrame holding a 'granted_number_of_nights' column
      is still accepted.

  Returns:
    (X_array, y_array) as NumPy arrays, rows in their original order.
  """
  if isinstance(y, pd.DataFrame):
    y = y['granted_number_of_nights']
  return np.array(X), np.array(y)

In [None]:


# Same 70/30 split and seed as for the XGBoost model, converted to NumPy arrays for TabNet.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, y_train = preprocess_for_tabnet(X_train,y_train)
X_val , y_val = preprocess_for_tabnet(X_val ,y_val)
clf =  TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2))
# Use the GPU when one is available; fall back to the CPU instead of failing.
clf.device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Per-class weights: 10**class for the classes 0..3.
weights = {0:1,1:10,2:10**2,3:10**3}
clf.fit(X_train=X_train, y_train=y_train, ## Train features and train targets
                X_valid=X_val, y_valid=y_val, ## Valid features and valid targets
                weights=weights,
                max_epochs=20, ## Maximum number of epochs during training
                patience=5, ## Number of consecutive non-improving epochs before early stopping
                batch_size=1024 ## Training batch size
                )

In [None]:
# Persist the trained TabNet model in model_zoo and keep whatever save_model returns.
saved_filepath = clf.save_model('/content/drive/My Drive/Dreamquark_challenge/model_zoo/TabNet_model')

## Inference on test set

In [None]:
# Reload and re-encode the test requests (same two statements as in the XGBoost inference section).
# NOTE(review): preprocess derives the categorical codes from the frame it is
# given, so a category may not receive the same integer here as in training
# — verify before trusting the test score.
requests_test = pd.read_csv('data/requests_test.csv')
X_test , y_test = preprocess(requests_test,cat_columns,date_columns)

In [None]:
# Scorer used to evaluate predictions on the test set.
def competition_scorer(y_true, y_pred):
    """Weighted log loss in which each sample weighs 10 to the power of its true label."""
    sample_weights = 10**y_true
    return log_loss(y_true, y_pred, sample_weight=sample_weights)

In [None]:
# Keep only the rows with no missing feature and no missing target (NaN values
# were reported to cause memory errors otherwise). A boolean mask is used so
# that the target never has to be written into the feature frame.
complete_rows = X_test.notna().all(axis=1) & y_test.notna()
X_test = X_test.loc[complete_rows]
y_test = y_test.loc[complete_rows]
# Preprocess the datasets for TabNet.
X_test_tab, y_test_tab = preprocess_for_tabnet(X_test,y_test)
# TODO: reloading the saved model was reported as not working, so the
# in-memory `clf` is used below. The attempted code was:
#   PATH = '/content/drive/My Drive/Dreamquark_challenge/model_zoo/TabNet_model.zip'
#   best_model = TabNetClassifier()
#   best_model.load_model(PATH)
# Run inference, timing the prediction call only.
start = time.time()
preds = clf.predict_proba(X_test_tab)
end = time.time()
score = competition_scorer(y_test_tab, preds)
print('time per prediction:' ,(end-start)/len(X_test))
print('The competition score on test data', score)


In [None]:
# Importances exposed by the fitted TabNet model.
# Presumably one value per column of the array passed to clf.fit — TODO confirm.
importance = clf.feature_importances_
print(importance)

In [None]:
# Position of the largest importance value.
# NOTE(review): this is a position in the array fed to clf.fit, which need not
# match the column order of X — confirm before naming the feature.
importance.argmax()

# Convert to html


In [None]:
# Export this notebook to HTML with nbconvert (absolute Drive path; spaces are backslash-escaped for the shell).
!jupyter nbconvert --to html /content/drive/My\ Drive/Colab\ Notebooks/DreamQuark_challenge.ipynb