In [1]:
# Import packages 
import pandas as pd
import numpy as np
import matplotlib

# Import models 
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

# Import metrics
from sklearn.metrics import r2_score

# Import helpers
from sklearn.model_selection import train_test_split

In [3]:
# Load the labelled training set from the working directory
train = pd.read_csv(filepath_or_buffer='./train.csv')

In [4]:
# Load the test set (rows to be predicted) from the working directory
test = pd.read_csv(filepath_or_buffer='./test.csv')

In [5]:
# Preview the first five training rows
train.head(n=5)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Count the columns stored with the generic object dtype
train.dtypes.eq(object).sum()

8

In [7]:
# How many integer or floating-point columns are there in the data?
# select_dtypes is used instead of `dtype == int` / `dtype == float`: those
# comparisons only match NumPy's platform-default widths, so int64 columns can
# be missed where the default integer is 32-bit.
train.select_dtypes(include='number').shape[1]

370

In [8]:
# Store the numeric variables in the training set.
# select_dtypes matches every integer/float width, whereas `dtype == int`
# only matches the platform-default integer and can drop int64 columns.
train_numeric = train.select_dtypes(include='number')
train_numeric.head()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Keep only the object-dtype columns of the training set
train_non_numeric = train.loc[:, train.dtypes == object]
train_non_numeric.head(n=5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [11]:
# Numeric feature matrix (target and row identifier removed) and target vector
train_x = train_numeric.drop(columns=['y', 'ID'])
train_y = train.loc[:, 'y']

In [12]:
# Create validation set (10% hold-out, fixed seed).
# The split is taken from the un-split frames rather than from train_x /
# train_y, because this cell overwrites those two names: splitting from them
# would shrink the training set again every time the cell is re-run.
train_x, dev_x, train_y, dev_y = train_test_split(
    train_numeric.drop(['y', 'ID'], axis = 1),
    train['y'],
    test_size = 0.10,
    random_state = 2390)

In [13]:
# Regularization strengths to sweep: 0.02-0.08 in steps of 0.02,
# then 0.1-0.9 in steps of 0.1
alphas = [k / 100 for k in range(2, 10, 2)] + [k / 10 for k in range(1, 10)]

# Column labels for the results data-frames
# NOTE: the 'test' column holds the score on the held-out dev split,
# not on the data loaded from test.csv.
cols = list(('train', 'test'))

In [14]:
# Fit an estimator for one regularization strength and score it
def model_fit(alpha, model, x_train=None, y_train=None, x_dev=None, y_dev=None):
    """Fit ``model(alpha)`` and return its R-squared on the train and dev data.

    Parameters
    ----------
    alpha : float
        Value passed as the first positional argument of ``model``.
    model : callable
        Estimator factory (e.g. ``linear_model.Lasso``); the object it returns
        must provide ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train, x_dev, y_dev : optional
        Features and targets to fit and score on. Any argument left as
        ``None`` falls back to the notebook-level ``train_x``, ``train_y``,
        ``dev_x`` or ``dev_y`` respectively, so two-argument calls behave as
        before.

    Returns
    -------
    tuple
        ``(train_score, dev_score)``, both computed with ``r2_score``.
    """
    # Resolve the data sets, defaulting to the notebook globals
    x_train = train_x if x_train is None else x_train
    y_train = train_y if y_train is None else y_train
    x_dev = dev_x if x_dev is None else x_dev
    y_dev = dev_y if y_dev is None else y_dev

    # Fit model
    clf = model(alpha)
    clf.fit(x_train, y_train)

    # Make predictions
    train_preds = clf.predict(x_train)
    dev_preds = clf.predict(x_dev)

    # Calculate R-squared
    train_score = r2_score(y_train, train_preds)
    dev_score = r2_score(y_dev, dev_preds)

    return train_score, dev_score

In [15]:
# Sweep the LASSO over every alpha; one row of (train, dev) R-squared per alpha
lasso_results = pd.DataFrame(
    data=[model_fit(strength, linear_model.Lasso) for strength in alphas],
    index=alphas,
    columns=cols,
)

# Show the score table
lasso_results

Unnamed: 0,train,test
0.02,0.553602,0.657077
0.04,0.547374,0.653267
0.06,0.542709,0.649275
0.08,0.53838,0.644569
0.1,0.534477,0.639897
0.2,0.519222,0.619613
0.3,0.498536,0.59789
0.4,0.472556,0.572428
0.5,0.439799,0.540446
0.6,0.41212,0.512323


In [16]:
# Sweep ridge regression over every alpha; one row of (train, dev) R-squared per alpha
ridge_results = pd.DataFrame(
    data=[model_fit(strength, linear_model.Ridge) for strength in alphas],
    index=alphas,
    columns=cols,
)

# Show the score table
ridge_results

Unnamed: 0,train,test
0.02,0.581069,0.637905
0.04,0.581028,0.638426
0.06,0.580977,0.638918
0.08,0.580921,0.639383
0.1,0.580861,0.639822
0.2,0.580549,0.641707
0.3,0.58025,0.643211
0.4,0.579974,0.644458
0.5,0.579717,0.645522
0.6,0.579477,0.646448


In [22]:
# Make predictions for test set using the best model for validation
# First extract features from test set.
# select_dtypes matches every integer/float width, whereas `dtype == int`
# only matches the platform-default integer and can drop int64 columns.
test_numeric = test.select_dtypes(include='number')
test_x = test_numeric.drop('ID', axis = 1)

In [None]:
# Build and fit the LASSO estimator (fit returns the estimator itself)
clf = linear_model.Lasso(alpha=0.02).fit(train_x, train_y)

# Predict the test rows, keyed by their ID
test_preds = pd.DataFrame(
    data=clf.predict(test_x),
    columns=['y'],
    index=test_numeric['ID'],
)

# Write the submission file
test_preds.to_csv('./submission_lasso.csv')

In [23]:
# Build and fit the ridge regression estimator (fit returns the estimator itself)
clf = linear_model.Ridge(alpha=0.9).fit(train_x, train_y)

# Predict the test rows, keyed by their ID
test_preds = pd.DataFrame(
    data=clf.predict(test_x),
    columns=['y'],
    index=test_numeric['ID'],
)

# Write the submission file
test_preds.to_csv('./submission_ridge.csv')

In [None]:
# Build and fit the random forest (fit returns the estimator itself)
clf = RandomForestRegressor(
    n_estimators=200,
    max_depth=5,
    random_state=42190,
).fit(train_x, train_y)

# Predict the test rows, keyed by their ID
test_preds = pd.DataFrame(
    data=clf.predict(test_x),
    columns=['y'],
    index=test_numeric['ID'],
)

# Write the submission file
test_preds.to_csv('./submission_random_forest_3.csv')