In [1]:
# import main libraries
import os
import pandas as pd
import datetime
from datetime import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

# import ML libraries
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import ndcg_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from catboost import CatBoostClassifier


# Functions

In [2]:
def preprocessing(df, dataset_type, age_mean=None):
    """ This function takes a raw dataframe as input and clean it by:
    - dropping duplicates
    - reformatting some columns (dates, age etc.)
    - extracting meaningful elements from datetimes
    - dropping pointless columns or the ones that cause problems in the next steps
    - dropping rows with missing values in important columns 
    
    NB : If age_mean = None, then age_mean = df.age.mean()
    """

    col_id = df.id
    columns_to_remove = [ 
        'id', 
        'first_affiliate_tracked', 
        'date_first_booking'
    ]

    df.drop(columns_to_remove, axis=1, inplace=True)
    
    df.drop_duplicates(inplace=True) # drop duplicates in the dataset
    
    
    df['timestamp_first_active'] = df['timestamp_first_active'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d%H%M%S')) # convert timestamps to datetimes
    df['date_account_created'] = pd.to_datetime(df.date_account_created) #convert object dates to datetimes
    df['days_to_first_active'] = (pd.to_datetime(df.date_account_created) - df.timestamp_first_active).astype('timedelta64[D]').astype(int)
    
    df['year_first_active'] = df['timestamp_first_active'].dt.year.astype(str) # extract year timestamp_first_active
    df['month_first_active'] = df['timestamp_first_active'].dt.month.astype(str) # extract month timestamp_first_active
    df.drop(['timestamp_first_active'], axis=1, inplace=True)
    
    df['year_creation'] = df['date_account_created'].dt.year.astype(str) # extract year from date_account_created
    df['month_creation'] = df['date_account_created'].dt.month.astype(str) # extract month date_account_created
    df.drop(['date_account_created'], axis=1, inplace=True)
    
    
    dummies_list = [
        'gender',
        'signup_method',
        'signup_flow',
        'language',
        'signup_app',
        'year_creation',
        'month_creation',
        'year_first_active',
        'month_first_active',
        'first_device_type',
        'first_browser',
        'affiliate_channel',
        'affiliate_provider'
    ]
    
    dummy_df = pd.get_dummies(df[dummies_list])
    df = df.drop(dummies_list, axis=1)
    df = pd.concat([df, dummy_df], axis=1)
    

    if dataset_type == 'train':
        cols = list(df)
        cols.append(cols.pop(cols.index('country_destination')))
        df = df[cols]
    

    
    return df, col_id


def get_class_weights(X):
    """ This function takes a dataframe with multiple classes and labels as input 
    and computes the weights of the classes."""
    n_classes = X.country_destination.nunique()
    weights = X.shape[0]/(X.country_destination.value_counts())
    keys = weights.index.tolist()
    values = weights.tolist()
    cl_weights = {keys[i]:values[i] for i in range(len(keys))}
    
    return cl_weights


def merge_sessions(df, session_set, keep_all=False):
    # Clean the sessions set
    session_set.drop_duplicates(inplace=True)

    # Checking if the missing values in action_type correspond to missing values in action_detail and if so, Mark 'unknown' as NaN
    if (session_set.action_type.isna() == session_set.action_detail.isna()).sum() == len(session_set):
        session_set.loc[session_set.action_type == '-unknown-', 'action_type'] = np.nan
        session_set.loc[session_set.action_detail == '-unknown-', 'action_detail'] = np.nan

    session_set.dropna(axis=0, inplace=True)

    # Create a new column to store important actions about booking or receipt

    session_set['book_action'] = ((session_set.action.str.contains('book')) & ~(session_set.action.str.contains('facebook'))).astype(int)
    session_set['book_action_type'] = session_set.action_type.str.contains('book').astype(int)
    session_set['book_action_detail'] = session_set.action_detail.str.contains('book').astype(int)
    
    session_set.secs_elapsed = session_set.secs_elapsed.fillna(session_set.secs_elapsed.mean())
    session_set['book_action_time'] = session_set.book_action * session_set.secs_elapsed
    session_set['book_action_type_time'] = session_set.book_action_type * session_set.secs_elapsed
    session_set['book_action_detail_time'] = session_set.book_action_detail * session_set.secs_elapsed
    
    session_set['message_to_host_focus'] = (session_set.action == 'message_to_host_focus').astype(int) 
    session_set['message_to_host_change'] = (session_set.action == 'message_to_host_change').astype(int) 
    session_set['host_cancel'] = (session_set.action == 'host_cancel').astype(int) 
    
    
    sessions_group = session_set[['user_id', 
                                  'book_action', 
                                   'book_action_type', 
                                   'book_action_detail', 
                                   'book_action_time',
                                   'book_action_type_time',
                                   'book_action_detail_time', 
                                   'message_to_host_focus',
                                   'message_to_host_change', 
                                   'host_cancel',
                                   'secs_elapsed'
                                 ]].groupby('user_id').sum()
    
    
    if not keep_all:
        df = df.merge(sessions_group, how='inner', right_on= 'user_id', left_on = 'id')
    else:
        df = df.merge(sessions_group, how='left', right_on='user_id', left_on = 'id')
        df['book_action'] = df['book_action'].fillna(0)
        df['book_action_type'] = df['book_action_type'].fillna(0)
        df['book_action_detail'] = df['book_action_detail'].fillna(0)
        df['book_action_time'] = df['book_action_time'].fillna(0)
        df['book_action_type_time'] = df['book_action_type_time'].fillna(0)
        df['book_action_detail_time'] = df['book_action_detail_time'].fillna(0)
        df['message_to_host_focus'] = df['message_to_host_focus'].fillna(0)
        df['message_to_host_change'] = df['message_to_host_change'].fillna(0)
        df['host_cancel'] = df['host_cancel'].fillna(0)
        df['secs_elapsed'] = df['secs_elapsed'].fillna(session_set.secs_elapsed.mean())
    
    df.drop_duplicates(inplace = True)
    
    return df



def frequence_visitors(age_bucket, gender, country_destination):
    """ This function takes personal information as inputs 
    like age_bucket and gender of a given person,
    and the country_destination we want to suggest her, 
    and returns the probabilty that this person goes to this country."""
    num = int(age_gend.population_in_thousands[(age_gend.age_bucket==age_bucket)&(age_gend.gender==gender)&(age_gend.country_destination==country_destination)])
    denom = int(age_gend[(age_gend.age_bucket==age_bucket)&(age_gend.gender==gender)].population_in_thousands.sum())
    frequence = num/denom
    return frequence


def get_age_bucket(age):

    if age >= 100:
        return '100+'
    for age_bucket in age_gend.age_bucket.unique():
        if age_bucket != '100+':
            min_age, max_age = age_bucket.split('-')
            min_age, max_age = int(min_age), int(max_age)
            if min_age <= age <= max_age:
                return age_bucket

            
def add_visitors(df, dataset_type, dict_destinations_frequency, age_mean=None):
    
    # convert age to float
    df.age = df.age.fillna(0)
    df.age = df.age.astype(int) 
    
    # mark non-sense values in age as np.nan
    df.loc[(df['age'] < 14), 'age'] = np.nan
    df.loc[(df['age'] > 100), 'age'] = np.nan 
    
    # fill missing values in age with mean of age (fit on train)
    if dataset_type == 'train':
        df.age = df.age.fillna(df.age.mean())
    else:
        df.age = df.age.fillna(age_mean)
    
    
    df['binary_gender'] = df.gender.str.lower()
    df['binary_gender'] = df.binary_gender.apply(
        lambda x: np.random.choice(['male', 'female']) if x in ['-unknown-', 'other'] else x)
    
    df['age_bucket'] = df.age.apply(get_age_bucket)
    
    for country in age_gend.country_destination.unique():
        df['key'] = df.age_bucket + ' ' + df.binary_gender + ' ' + country
        df['visitors_2015_'+country] = df.key.map(dict_destinations_frequency)
    
    df.drop(['key', 'age_bucket', 'binary_gender'], axis=1, inplace=True)
                                          
    return df 
   
    
def draw_graph(df, col, inp):
    """ This function draws a graph with data of a specific series in a dataset (col) about a specific element."""
    what_to_draw = df[df[col] == inp].country_destination
    f, ax = plt.subplots(figsize=(15, 6))
    sns.countplot(x=what_to_draw, palette="ch:.25")
    plt.xticks(rotation=90)
    plt.title(col+': '+str(inp))
    plt.show()

### Loading all the datasets

In [3]:
os.chdir('/Users/amelievogel/Desktop/data_science/Kaggle-Datasets/airbnb-recruiting-new-user-bookings')

In [4]:
# Load the train set
train = pd.read_csv('train_users_2.csv')

In [5]:
# Load the test set
test = pd.read_csv('test_users.csv')

In [6]:
# Load the age-gender set
age_gend = pd.read_csv('age_gender_bkts.csv')

In [7]:
# Load the sessions set
sess = pd.read_csv('sessions.csv')

### Cleaning

In [10]:
# Merge train and sessions with new features
train_2 = merge_sessions(train, sess, keep_all=False)
# Merge test and sessions with new features
test_2 = merge_sessions(test, sess, keep_all=True)

In [11]:
# Create a dictionnary with age_bucket, gender and country to be suggested as keys, and the probability to go to this country as values
dict_destinations_frequency = {}
for age_bucket in list(age_gend.age_bucket.unique()):
    for gend in list(age_gend.gender.unique()):
        for country in list(age_gend.country_destination.unique()):
            key = ' '.join([age_bucket, gend, country])
            value = frequence_visitors(age_bucket, gend, country)
            dict_destinations_frequency[key] = value

In [12]:
# Map the dict to train and test set so that you have new columns that suggest destination according to age and gender of users
train_2 = add_visitors(train_2, 'train', dict_destinations_frequency, age_mean=None)
test_2 = add_visitors(test_2, 'test', dict_destinations_frequency, age_mean=int(train.age.mean()))


In [13]:
# Clean the train set
train, col_id_train = preprocessing(train_2, 'train')

In [14]:
# Clean the test set
age_mean = train.age.mean()
test, col_id_test = preprocessing(test_2, 'test', age_mean)

In [15]:
# now dealing with test_df columns
# test_df columns are considered either "superfluous" or "missing" (in comparison to train_df columns)
superfluous_columns = set(test.columns) - set(train.columns)
test.drop(superfluous_columns, axis=1, inplace=True)

missing_columns = set(train.columns) - set(test.columns) - set(['country_destination'])

for column in missing_columns:
    test[column] = 0

In [24]:
# Define X_train and y_train into np.arrays
X, y = train.values[:, :-1], train.values[:, -1]

In [18]:
# Define X_train and y_train into np.arrays
X_test = test.values[:]

In [19]:
train_orig = pd.read_csv('train_users_2.csv')
test_orig = pd.read_csv('test_users.csv')

### Data Exploration

In [None]:
draw_graph(train_2, 'signup_method', 'google')

In [None]:
draw_graph(train_2, 'is_action_about_booking', True)

In [None]:
draw_graph(train_2, 'is_action_about_booking', False)

In [None]:
draw_graph(train_2, 'is_action_about_booking', 'Don\'t know')

In [None]:
sns.catplot(x="gender", kind="count", palette="ch:.25", data=train)
#plt.xticks(rotation=90)
plt.show()

In [None]:
sns.boxplot(x="age", palette="ch:.25", data=train)
plt.xticks(rotation=90)
plt.show()

In [None]:
f, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x="country_destination", hue='signup_method', palette="ch:.25", data=train)
plt.xticks(rotation=90)
plt.show()

In [None]:
draw_graph(train_2, 'language', 'en')

In [None]:
draw_graph(train_2, 'language', 'es')

In [None]:
draw_graph(train_2, 'language', 'fr')

In [None]:
draw_graph(train_2, 'language', 'de')

In [None]:
draw_graph(train_2, 'signup_method', 'facebook')

In [None]:
draw_graph(train_2, 'signup_method', 'basic')

In [None]:
draw_graph(train_2, 'gender', 'female')

In [None]:
draw_graph(train_2, 'gender', 'male')

In [None]:
draw_graph(train_2, 'first_browser', 'Chrome')

#### Random Forest Classifier

In [84]:
# Create a cross-validation set
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=876675)

In [85]:
classifier=RandomForestClassifier(n_estimators=200, max_depth=14)
classifier.fit(X_train, y_train)

#acc_rf=accuracy_score(y_train,y_pred)
#pr_rf=precision_score(y_train,y_pred, average = 'weighted')
#rec_rf=recall_score(y_train,y_pred, average= 'weighted')
#f1_rf=f1_score(y_train,y_pred, average= 'weighted')
#print('Acc: %s, prec: %s, rec: %s, f1: %s' % (acc_rf, pr_rf, rec_rf, f1_rf))

RandomForestClassifier(max_depth=14, n_estimators=200)

In [86]:
# Test on the cross-validation set
y_pred=classifier.predict_proba(X_cv)

y_cv = pd.get_dummies(y_cv)
ndcg_rf = ndcg_score(y_cv, y_pred)
print('NDCG: %s' % (ndcg_rf))

NDCG: 0.8566583222688717


#### Random Forest Classifier with Balanced Mode with RFE

In [None]:
# Try feature selection with RFE
classifier=RandomForestClassifier()
selector_classifier = RFE(classifier)
selector_classifier = selector_classifier.fit(X_train,y_train)

In [None]:
y_pred=selector_classifier.predict_proba(X_cv)

y_cv = pd.get_dummies(y_cv)
ndcg_rf = ndcg_score(y_cv, y_pred)
print('NDCG: %s' % (ndcg_rf))

#### Random Forest in GridSearch without RFE

In [None]:
classifier=RandomForestClassifier()

# Parameters to test in GridSearch
param = {'n_estimators': [50, 100, 200, 300, 400], 'max_depth': range(2, 50, 2)}

# Score used to evaluate the model
#ndcg_rf = ndcg_score(pd.get_dummies(y_cv), selector_classifier.predict_proba(X_cv))

# GridSearch
grid = GridSearchCV(estimator=classifier, param_grid=param, scoring='f1_weighted', verbose=10)
grid_result = grid.fit(X_train, y_train)

In [None]:
grid_result.best_params_

### Submission

In [112]:
col_id_test = list(col_id_test)
list_index = list(np.argsort(y_pred[0])[::-1])[:5]
list_best_countries = []
for i in range(len(y_pred)):
    list_index = list(np.argsort(y_pred[i])[::-1])[:5]
    list_best_countries.append(list(classifier.classes_[list_index]))

In [113]:
list_ids = [[item] * 5 for item in col_id_test]

In [114]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [115]:
list_best_countries = flatten(list_best_countries)
list_ids = [[item] * 5 for item in col_id_test]
list_ids = flatten(list_ids)

In [116]:
final_df = pd.DataFrame(zip(list_ids, list_best_countries), columns=['id', 'country'])
final_df

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,5uwns89zht,US
2,5uwns89zht,other
3,5uwns89zht,FR
4,5uwns89zht,IT
...,...,...
70980,49kpri859i,US
70981,49kpri859i,NDF
70982,49kpri859i,other
70983,49kpri859i,FR


In [70]:
final_df = pd.DataFrame(dict_best_countries.items(), columns=['id', 'country']) 
final_df = final_df.explode('country')
final_df.to_csv('submission.csv', index=False)

In [None]:
submission = pd.read_csv('sample_submission_NDF.csv')

In [None]:
submission