In [None]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [None]:
# Define the competition scorer
def competition_scorer(y_true, y_pred):
    """Return the weighted log-loss used as the competition metric.

    Every sample is weighted by ``10 ** label``, so a mistake on a higher
    label costs more than a mistake on a lower one.

    Parameters
    ----------
    y_true : array-like of int
        True labels. Lists, tuples, NumPy arrays and pandas Series are all
        accepted.
    y_pred : array-like
        Predicted class probabilities, forwarded unchanged to
        ``sklearn.metrics.log_loss``.

    Returns
    -------
    float
        The weighted log-loss (lower is better).
    """
    # Coerce first: `10 ** y_true` raises TypeError on a plain Python list,
    # whereas an ndarray supports the element-wise power.
    y_true = np.asarray(y_true)
    return log_loss(y_true, y_pred, sample_weight=10 ** y_true)

# 1- Information about the challenge

In this challenge, the `requests` dataset contains information about the requests made by groups of individuals (or families) to the French emergency housing public service. Each row of the `requests` dataset corresponds to a unique request.

The goal is to predict the categorical variable `granted_number_of_nights`, which represents the number of nights of emergency housing granted to a group. You can train your model on the `train_requests` dataset; the predictions should be made for the requests listed in the `test_requests` dataset.

The evaluation metric is given by the `competition_scorer` defined above. It corresponds to a weighted log-loss with weights 1, 10, 100, or 1000 when `granted_number_of_nights` takes the value 0, 1, 2, or 3 respectively. Beware: you will be penalized more heavily for classification mistakes made on the higher labels.

Good luck!

# 2- Load the datasets

In [None]:
# Train sample
# `on_bad_lines='warn'` (pandas >= 1.3) replaces the removed
# `error_bad_lines=False` flag: malformed rows are skipped and reported
# with a warning instead of aborting the load.
requests = pd.read_csv('data/train_requests.csv', sep=',', low_memory=False, on_bad_lines='warn')

# Test sample (same parsing options as the train sample)
requests_test = pd.read_csv('data/test_requests.csv', sep=',', low_memory=False, on_bad_lines='warn')

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes)
# to get a first idea of which fields are usable and which have gaps.
requests.info()

# 3- Distribution of the target

In [None]:
# Plot how often each value of the target variable occurs.
# The trailing semicolon keeps the cell from echoing the Axes repr.
requests['granted_number_of_nights'].plot.hist();

# 4- Train and evaluate a first model

In [None]:
# Explanatory variables fed to the baseline model
columns = ['district', 'housing_situation_id', 'group_composition_id']

# Feature matrix and target vector, both taken from the training requests
X, y = requests[columns], requests['granted_number_of_nights']

In [None]:
# Hold out 33% of the rows for validation; the fixed seed makes the split
# reproducible from one run to the next.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=37,
)

In [None]:
# Baseline classifier: logistic regression fitted one-vs-rest with the
# liblinear solver.
model = LogisticRegression(
    multi_class='ovr',
    solver='liblinear',
)

In [None]:
# Fit the baseline on the training split only; the validation split is
# kept aside so it can be used for scoring afterwards.
model.fit(X_train, y_train)

In [None]:
# Score the held-out validation split with the competition metric:
# first get the predicted class probabilities, then feed them to the scorer.
val_probabilities = model.predict_proba(X_val)
competition_scorer(y_val, val_probabilities)

# 5- Compute predictions on the test set 

In [None]:
# Take the same feature columns from the test requests as were used for training
X_test = requests_test.loc[:, columns]

# Predicted class probabilities for every test request
y_pred = model.predict_proba(X_test)

In [None]:
# Peek at the predicted probabilities for the first four test rows
# (slicing keeps the cell output short).
y_pred[:4]

In [None]:
# Pair each request id with its predicted class probabilities.
# `pd.concat(axis=1)` aligns rows on the index, so the probability frame
# reuses the test frame's index; otherwise any non-default index on
# `requests_test` would misalign the rows and fill them with NaN.
predictions = pd.concat(
    [requests_test['request_id'], pd.DataFrame(y_pred, index=requests_test.index)],
    axis=1,
)

# 6- Submit your predictions to the QScore platform

In [None]:
import io, math, requests

# Get your token from qscore:
# 1. Go to https://qscore.datascience-olympics.com/
# 2. Choose the competition Data Science Olympics 2019
# 3. In the left menu click 'Submissions'
# 4. Your token is in the 'Submit from your Python Notebook' tab

def submit_prediction(df, sep=',', comment='', compression='gzip', **kwargs):
    """Upload a predictions DataFrame to the QScore submission endpoint.

    The frame is written to ``temporary.dat`` in the working directory and
    that file is posted to the platform together with the comment.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions to submit.
    sep : str
        Field separator used when writing the CSV.
    comment : str
        Free-text comment sent along with the submission.
    compression : str
        Compression passed to ``DataFrame.to_csv`` and reported to the server.
    **kwargs
        Extra keyword arguments forwarded to ``DataFrame.to_csv``
        (e.g. ``index=False``).

    Raises
    ------
    Exception
        If the server answers 429 (submissions too close together) or any
        other non-200 status; in the latter case the message is the response
        body.
    """
    # Imported locally under an alias: this notebook also binds the global
    # name `requests` to the training DataFrame, so relying on the global
    # would break as soon as the data-loading cell is re-run.
    import math
    import requests as http

    TOKEN='YOUR_TOKEN_HERE'
    URL='https://qscore.datascience-olympics.com/api/submissions'
    df.to_csv('temporary.dat', sep=sep, compression=compression, **kwargs)
    # The context manager closes the upload file handle even if the request fails.
    with open('temporary.dat', 'rb') as datafile:
        r = http.post(
            URL,
            headers={'Authorization': 'Bearer {}'.format(TOKEN)},
            files={'datafile': datafile},
            data={'comment': comment, 'compression': compression},
        )
    if r.status_code == 429:
        # Header value is presumably in milliseconds (it is divided by 1000
        # and reported as seconds) -- TODO confirm against the API.
        wait_seconds = int(math.ceil(int(r.headers['x-rate-limit-remaining']) / 1000.0))
        raise Exception('Submissions are too close. Next submission is only allowed in {} seconds.'.format(wait_seconds))
    if r.status_code != 200:
        raise Exception(r.text)

In [None]:
# Upload the predictions. `index=False` is forwarded to `DataFrame.to_csv`
# so the row index is left out of the submitted file.
submit_prediction(predictions, sep=',', index=False, comment='my submission')