In [66]:
import seaborn
import pandas as pd

airfares = pd.read_csv("Airfares.csv")

In [67]:
airfares.dtypes

S_CODE       object
S_CITY       object
E_CODE       object
E_CITY       object
COUPON      float64
NEW           int64
VACATION     object
SW           object
HI          float64
S_INCOME    float64
E_INCOME    float64
S_POP         int64
E_POP         int64
SLOT         object
GATE         object
DISTANCE      int64
PAX           int64
FARE        float64
dtype: object

In [68]:
airfares.describe()

Unnamed: 0,COUPON,NEW,HI,S_INCOME,E_INCOME,S_POP,E_POP,DISTANCE,PAX,FARE
count,638.0,638.0,638.0,638.0,638.0,638.0,638.0,638.0,638.0,638.0
mean,1.202335,2.753918,4442.141129,27759.860502,27663.727273,4557004.0,3194503.0,975.653605,12782.214734,160.876677
std,0.203821,0.760448,1724.267051,3596.207837,4611.325018,3010985.0,2735604.0,646.242403,13202.22886,76.022436
min,1.0,0.0,1230.48,14600.0,14600.0,29838.0,111745.0,114.0,1504.0,42.47
25%,1.04,3.0,3090.1375,24706.0,23903.0,1862106.0,1228816.0,455.0,5328.5,106.29
50%,1.15,3.0,4208.185,28637.0,26409.0,3532657.0,2195215.0,850.0,7792.0,144.6
75%,1.2975,3.0,5480.575,29693.5,31981.0,7830332.0,4549784.0,1306.25,14090.5,209.35
max,1.94,3.0,10000.0,38813.0,38813.0,9056076.0,9056076.0,2764.0,73892.0,402.02


In [69]:
airfares.isna().sum()

S_CODE      0
S_CITY      0
E_CODE      0
E_CITY      0
COUPON      0
NEW         0
VACATION    0
SW          0
HI          0
S_INCOME    0
E_INCOME    0
S_POP       0
E_POP       0
SLOT        0
GATE        0
DISTANCE    0
PAX         0
FARE        0
dtype: int64

In [70]:
target='FARE'
predictors=['COUPON', 'NEW', 'HI', 'S_INCOME', 'E_INCOME', 'S_POP', 'E_POP', 'DISTANCE', 'PAX']

In [71]:
from sklearn.model_selection import train_test_split
X=pd.get_dummies(airfares[predictors])
y=airfares[target]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0) 
#we set random state so we all get the same answers!

In [72]:
#Ridge model first - make sure we optimize alpha!
import numpy as np
from sklearn import linear_model


reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 20))
reg.fit(X_train, y_train)
reg.alpha_

233572.14690901214

In [73]:
from sklearn import metrics
r2=metrics.r2_score(y_train, reg.predict(X_train))
print(r2)


0.6921807443339782


In [74]:
r2=metrics.r2_score(y_test, reg.predict(X_test))
print(r2)

0.697542411525605


In [75]:
#let sklearn do it all for you 
reg2 = linear_model.LassoCV(cv=10)
reg2.fit(X_train, y_train)
reg2.alpha_

54410.31384783662

In [76]:
r2=metrics.r2_score(y_train, reg2.predict(X_train))
print(r2)

0.261986862176392


In [77]:
r2=metrics.r2_score(y_test, reg2.predict(X_test))
print(r2)

0.24729843048655242


In [78]:
#elastic net has two parameters: you could gridsearch and let sklearn do its thing
reg3 = linear_model.ElasticNetCV(cv=10, l1_ratio = [.1, .5, .7, .9, .95, .99, 1])
reg3.fit(X_train, y_train)
reg3.alpha_

54410.31384783662

In [79]:
r2=metrics.r2_score(y_train, reg3.predict(X_train))
print(r2)

0.261986862176392


In [80]:
r2=metrics.r2_score(y_test, reg3.predict(X_test))
print(r2)

0.24729843048655242
