# 04 - Model Building

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [43]:
# Read the processed training set (path is relative to the notebook's folder)
# and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='../data/processed/train.csv')
train.head(n=5)

Unnamed: 0,flyFrom,flyTo,orig-dest,day_of_month,day_of_week,fly_duration,distance,days_until_dep,session,airline,hops,direct,competition,price,log_price,buy
0,MAD,BCN,MAD-BCN,1,Monday,1.416667,483.25,1,morning,UX,0,True,6,78,4.356709,False
1,MAD,BCN,MAD-BCN,1,Monday,1.25,483.25,1,morning,VY,0,True,6,91,4.51086,False
2,MAD,BCN,MAD-BCN,1,Monday,1.333333,483.25,1,evening,IB,0,True,6,91,4.51086,False
3,MAD,BCN,MAD-BCN,1,Monday,13.583333,483.25,1,evening,UX,0,True,6,108,4.682131,False
4,MAD,BCN,MAD-BCN,1,Monday,18.0,483.25,1,morning,UX,0,True,6,112,4.718499,False


### Feature selection and Preprocessing

In [44]:
# Notebook-level feature lists. A column may appear in only one list: selecting
# `numerical + categorical` with a name in both would yield a duplicated column.
# Kept identical to the lists used inside `prepare_features`.
numerical = ['days_until_dep', 'fly_duration', 'distance', 'day_of_month', 'log_price']
categorical = ['flyFrom', 'flyTo', 'day_of_week', 'session', 'direct', 'airline']
# Name of the label column.
target = 'buy'

In [45]:
def prepare_features(df):
    """Build the feature matrix ``X`` and the binary target ``y`` from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``numerical`` and ``categorical``
        below, plus the ``buy`` target column.

    Returns
    -------
    X : pandas.DataFrame
        The selected features after ``pd.get_dummies(..., drop_first=True)``,
        with the numerical columns standardised by ``StandardScaler``.
    y : pandas.Series
        1 where ``buy`` equals ``True``, 0 otherwise.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.

    Notes
    -----
    The scaler is fitted on all rows of ``df``, i.e. before any
    train/validation split made by the caller, so validation rows contribute
    to the scaling statistics.
    NOTE(review): the preview of ``X`` in this notebook shows ``direct`` still
    as True/False, so the boolean column appears to pass through
    ``get_dummies`` without being one-hot encoded - confirm this is intended.
    """
    numerical = ['days_until_dep', 'fly_duration', 'distance', 'day_of_month', 'log_price']
    categorical = ['flyFrom', 'flyTo', 'day_of_week', 'session', 'direct', 'airline']
    target = 'buy'

    # Encode the target with one vectorised comparison instead of a per-row
    # Python lambda; anything that is not equal to True maps to 0.
    y = df[target].eq(True).astype('int64')

    # Select features; the target is excluded because it is in neither list.
    X = df[numerical + categorical]
    # One-hot encode the categorical columns, dropping the first level of each.
    X = pd.get_dummies(X, drop_first=True)
    # Standardise the numerical columns.
    scaler = StandardScaler()
    X[numerical] = scaler.fit_transform(X[numerical])

    return X, y

In [46]:
# Build the model matrix and target from the training frame, then show the
# shapes of both as a sanity check.
X, y = prepare_features(df=train)
(X.shape, y.shape)

((120060, 105), (120060,))

In [41]:
# Preview the encoded and scaled feature matrix.
X.head(n=5)

Unnamed: 0,days_until_dep,fly_duration,distance,day_of_month,log_price,direct,flyFrom_MAD,flyTo_BCN,flyTo_EZE,flyTo_FCO,...,airline_UX,airline_V7,airline_VB,airline_VS,airline_VY,airline_W6,airline_WN,airline_X3,airline_XQ,airline_Y4
0,-1.316816,-1.695417,-0.926539,-2.542449,-1.590794,True,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,-1.316816,-1.70846,-0.926539,-2.542449,-1.399929,True,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,-1.316816,-1.701939,-0.926539,-2.542449,-1.399929,True,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.316816,-0.743276,-0.926539,-2.542449,-1.187864,True,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,-1.316816,-0.397636,-0.926539,-2.542449,-1.142835,True,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0


### Split train and test

In [9]:
# Hold out 20% of the rows. The seed is fixed so that Restart & Run All yields
# the same partition every time (same seed value as used in `build_model`).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model building

In [34]:
from sklearn.metrics import f1_score, accuracy_score

#### Logistic regression

In [47]:
def build_model(model, X, y, test_size=0.2, random_state=42):
    """Fit ``model`` on a train split of ``(X, y)`` and print accuracy and F1.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X, y
        Features and binary labels, forwarded to ``train_test_split``.
    test_size : float, default 0.2
        Fraction of rows held out for the second report section.
    random_state : int, default 42
        Seed for the split, so repeated calls compare models on the same rows.

    Returns
    -------
    model
        The same estimator instance, fitted on the training part of the split.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)

    separator = '-----------------------------------'
    # The second section is printed under the label 'test', but it is scored on
    # the validation part of the split made above.
    report_sections = (
        ('training', X_train, y_train),
        ('test', X_valid, y_valid),
    )
    for label, features, labels in report_sections:
        predictions = model.predict(features)
        print(separator)
        print(label)
        print(separator)
        print('acc:', accuracy_score(labels, predictions), 'f1-score:', f1_score(labels, predictions))

    return model

In [28]:
# Baseline: logistic regression with library defaults.
log_clf = LogisticRegression()
build_model(model=log_clf, X=X, y=y)

-----------------------------------
training
-----------------------------------
acc: 0.7161211061136099 f1-score: 0.3716063609126527
-----------------------------------
test
-----------------------------------
acc: 0.7203481592537064 f1-score: 0.36847550079939806


LogisticRegression()

#### Random Forest

In [48]:
# Random forest with default hyper-parameters and a fixed seed.
rf = RandomForestClassifier(random_state=42)
build_model(model=rf, X=X, y=y)

-----------------------------------
training
-----------------------------------
acc: 0.9996668332500417 f1-score: 0.9996597409778194
-----------------------------------
test
-----------------------------------
acc: 0.8537397967682825 f1-score: 0.8456535114705106


RandomForestClassifier(random_state=42)

In [49]:
# 5-fold cross-validated F1 for the random forest; the cell shows the mean
# across folds.
scores = cross_val_score(estimator=rf, X=X, y=y, cv=5, scoring='f1')
scores.mean()

0.5971827538144312

#### XGBoost

In [10]:
# Gradient-boosted trees (XGBoost) with library defaults.
xgb = XGBClassifier()
build_model(model=xgb, X=X, y=y)



-----------------------------------
training
-----------------------------------
acc: 0.8135175799809946 f1-score: 0.6345989727993546
-----------------------------------
test
-----------------------------------
acc: 0.7979094076655052 f1-score: 0.5971204849709522


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Learning curve

In [15]:
def plot_learning_curve(model, X, y, cv=5, scoring='f1'):
    """Plot mean training and validation curves against training-set size.

    Parameters
    ----------
    model : estimator
        Forwarded to ``learning_curve``.
    X, y
        Features and labels, forwarded to ``learning_curve``.
    cv : default 5
        Cross-validation setting, forwarded to ``learning_curve``.
    scoring : default 'f1'
        Scorer, forwarded to ``learning_curve``. When it is a string starting
        with ``'neg_'`` the values are sign-flipped and plotted as an error;
        any other scorer is plotted unchanged as a score.

    Returns
    -------
    None
        Draws on the current matplotlib axes.
    """
    train_sizes, train_scores, validation_scores = learning_curve(model, X, y, cv=cv, scoring=scoring)
    # Average over the CV folds (axis 1) for each training size.
    train_scores_mean = train_scores.mean(axis=1)
    validation_scores_mean = validation_scores.mean(axis=1)

    # Only negated-loss scorers need a sign flip to read as an error. Flipping
    # a plain score such as F1 would plot negative values mislabelled "error".
    is_negated_loss = isinstance(scoring, str) and scoring.startswith('neg_')
    if is_negated_loss:
        train_scores_mean = -train_scores_mean
        validation_scores_mean = -validation_scores_mean
        kind = 'error'
        y_label = scoring[len('neg_'):]
    else:
        kind = 'score'
        y_label = scoring if isinstance(scoring, str) else 'score'

    plt.plot(train_sizes, train_scores_mean, label='Training ' + kind)
    plt.plot(train_sizes, validation_scores_mean, label='Validation ' + kind)
    plt.xlabel('Training set size')
    plt.ylabel(y_label)
    plt.legend()