In [3]:
#Import libraries:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Default figure size (width, height) for every plot in this notebook.
rcParams['figure.figsize'] = (12, 4)

# Location of the input CSV and the columns used as model inputs / target.
DATA_PATH = 'data/data_aa.csv'
FEATURE_COLUMNS = ['ITIN_YIELD', 'DISTANCE']
TARGET_COLUMN = 'ITIN_FARE'

# Read the full table once, then slice out the feature matrix and the
# column the model is asked to predict.
data = pd.read_csv(DATA_PATH)
source_data = data[FEATURE_COLUMNS]
predicted_column = data[TARGET_COLUMN]

# Preview the first 5 rows of the full table (kept as the cell's last
# expression so the notebook renders it).
data.head()

Unnamed: 0,ITIN_ID,COUPONS,YEAR,QUARTER,ORIGIN,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_COUNTRY,ORIGIN_STATE_FIPS,...,ITIN_YIELD,REPORTING_CARRIER,PASSENGERS,ITIN_FARE,BULK_FARE,DISTANCE,DISTANCE_GROUP,MILES_FLOWN,ITIN_GEO_TYPE,Unnamed: 25
0,201719,1,2017,1,ABE,10135,1013503,30135,US,42,...,0.5482,9E,1.0,233.0,0.0,425.0,1,425.0,2,
1,2017110,1,2017,1,ABE,10135,1013503,30135,US,42,...,0.8447,9E,1.0,359.0,0.0,425.0,1,425.0,2,
2,2017111,1,2017,1,ABE,10135,1013503,30135,US,42,...,0.9153,9E,1.0,389.0,0.0,425.0,1,425.0,2,
3,2017112,2,2017,1,ABE,10135,1013503,30135,US,42,...,0.2306,9E,1.0,196.0,0.0,850.0,2,850.0,2,
4,2017113,2,2017,1,ABE,10135,1013503,30135,US,42,...,0.2306,9E,1.0,196.0,0.0,850.0,2,850.0,2,


In [5]:
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5, target='ITIN_FARE'):
    """Fit a regressor on ``dtrain[predictors]`` and print a short report.

    The report uses regression metrics (RMSE and R^2). Classifier-only
    calls such as ``predict_proba``, accuracy and ROC-AUC are deliberately
    not used: they are undefined for a continuous target and are not
    provided by regressors.

    Parameters
    ----------
    alg : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It must also
        expose ``feature_importances_`` when ``printFeatureImportance``
        is true.
    dtrain : pandas.DataFrame
        Training table containing the predictor columns (and the target
        column when ``target`` is given as a column name).
    predictors : list of str (or a single str)
        Names of the columns of ``dtrain`` used as model inputs.
    performCV : bool, default True
        When true, run ``cv_folds``-fold cross-validation scored with R^2
        and print mean / std / min / max of the fold scores.
    printFeatureImportance : bool, default True
        When true, draw a bar chart of ``alg.feature_importances_``.
    cv_folds : int, default 5
        Passed as ``cv`` to ``cross_val_score``.
    target : str or array-like, default 'ITIN_FARE'
        Either the name of the target column in ``dtrain`` or the target
        values themselves (one per row of ``dtrain``).

    Raises
    ------
    ValueError
        If ``dtrain`` has no rows.
    KeyError
        If a predictor or the named target column is missing (raised by
        pandas during column selection).
    """
    # Accept a single column name as shorthand for a one-element list.
    if isinstance(predictors, str):
        predictors = [predictors]
    predictors = list(predictors)

    # Use the caller-supplied frame and columns instead of notebook globals,
    # so the function works on whatever data it is actually given.
    features = dtrain[predictors]
    labels = dtrain[target] if isinstance(target, str) else target
    if len(features) == 0:
        raise ValueError("modelfit needs at least one training row")

    #Fit the algorithm on the data
    alg.fit(features, labels)

    #Predict training set:
    dtrain_predictions = np.asarray(alg.predict(features), dtype=float)
    actual = np.asarray(labels, dtype=float)

    # Training-set regression metrics, computed against the true target.
    residual_ss = float(np.sum((actual - dtrain_predictions) ** 2))
    total_ss = float(np.sum((actual - actual.mean()) ** 2))
    train_rmse = float(np.sqrt(residual_ss / len(actual)))
    # R^2 is undefined when the target is constant (zero total variance).
    train_r2 = 1.0 - residual_ss / total_ss if total_ss > 0 else float('nan')

    #Perform cross-validation:
    cv_score = None
    if performCV:
        # cross_val_score lives in model_selection in current scikit-learn;
        # the old cross_validation module is only tried as a fallback.
        try:
            from sklearn.model_selection import cross_val_score
        except ImportError:
            from sklearn.cross_validation import cross_val_score
        cv_score = cross_val_score(alg, features, labels, cv=cv_folds, scoring='r2')

    #Print model report:
    print("\nModel Report")
    print("RMSE (Train): %.4g" % train_rmse)
    print("R2 Score (Train): %f" % train_r2)

    if cv_score is not None:
        print("CV Score (R2) : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    #Print Feature Importance:
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, index=predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')