In [None]:
## Notebook setup: pandas / numpy / matplotlib / seaborn imports plus display and plotting defaults.
## NOTE(review): the original header said you may have to pip install sqlalchemy, sqlalchemy_utils
## and psycopg2, but no visible cell imports any of them - confirm before adding them to the environment.

import pandas as pd
# Make the graphs a bit prettier, and bigger.
# NOTE(review): the 'display.mpl_style' option is believed to be gone from newer pandas
# releases - confirm against the installed version.
pd.set_option('display.mpl_style', 'default')

# Wide display so frames with many columns are not wrapped or truncated
# (the original author notes this was needed for pandas 0.12 but not 0.13).
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 20)

import numpy as np
import math
# Render figures inline in the notebook.
%matplotlib inline
# NOTE(review): %pylab presumably star-imports numpy/matplotlib names into the namespace,
# which can shadow names defined in this notebook - confirm it is still required.
%pylab inline
import matplotlib.pyplot as plt
# Default plot colours and figure size.
# NOTE(review): 'axes.color_cycle' is believed to have been replaced by 'axes.prop_cycle'
# in newer matplotlib - confirm against the installed version.
plt.rcParams['axes.color_cycle'] = ['r', 'g', 'b', 'c']
plt.rcParams['lines.color'] = 'r'
plt.rcParams['figure.figsize'] = (15, 5)
from matplotlib.colors import ListedColormap

import seaborn as sns

In [None]:
# Load both splits from the working directory, treating the '?' and 'none'
# placeholders as missing values (NaN).
train_df = pd.read_csv('train.csv',na_values=['?','none'])
test_df = pd.read_csv('test.csv',na_values=['?','none'])

In [None]:
train_df.describe()

In [None]:
test_df.describe()

In [None]:
# Pairwise view of the columns, restricted to rows with no missing values:
# histograms on the diagonal, scatter plots everywhere else.
g = sns.PairGrid(train_df.dropna())
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter);
# NOTE(review): no hue variable is passed to PairGrid, so this legend presumably
# has no entries - confirm whether the call is needed.
g.add_legend();

In [None]:
train_df.shape,train_df.dropna().shape

A large number of values are missing from feature `f`.

In [None]:
train_df.dtypes

In [None]:
train_df['b'].unique()

In [None]:
def process(df):
    """One-hot encode the ``gender`` column into three boolean indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``gender`` column. The codes 'I', 'M' and 'F' each
        get their own indicator column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``gender_I``, ``gender_M`` and ``gender_F`` appended
        (in that order) and the ``gender`` column removed. The input frame is
        left untouched.

    Raises
    ------
    KeyError
        If ``df`` has no ``gender`` column.
    """
    # Work on a copy: the previous version wrote the indicator columns into the
    # caller's frame as a side effect before returning the dropped copy.
    encoded = df.copy()
    for code in ('I', 'M', 'F'):
        encoded['gender_' + code] = encoded['gender'] == code
    # Keyword axis: newer pandas no longer accepts the axis positionally.
    return encoded.drop('gender', axis=1)
# process() returns a new frame, so rebind both names to the encoded versions.
# NOTE: this cell is not re-runnable - process() reads df['gender'], and that
# column is absent from the frames it returns, so a second run raises KeyError.
train_df = process(train_df)
test_df = process(test_df)


In [None]:
train_df

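After marking the 'none' and '?' placeholders as NaN and one-hot encoding the gender, we are almost ready to go. However, the NaN values affect almost 1/3 of the training data, so let's impute them via random sampling: each missing value is replaced by a randomly drawn observed value from the same column.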

In [None]:
# Observed (non-missing) values of feature 'f'.
# NOTE(review): impute_f_col is not referenced by any other visible cell - the
# loop below builds its own per-column pool - so this looks like a leftover.
impute_f_col = train_df['f'].dropna()
def impute_col(x,col):
    """Return ``x``, or a random draw from ``col`` when ``x`` is missing.

    Parameters
    ----------
    x : scalar
        A single cell value; may be NaN or None.
    col : pandas.Series
        Pool of observed (non-missing) values to draw a replacement from.

    Returns
    -------
    scalar
        ``x`` itself when it is not missing, or when ``col`` is empty and there
        is nothing to draw from; otherwise one value sampled from ``col``.
    """
    # pd.isnull copes with None and non-numeric values, whereas np.isnan
    # raises TypeError on anything that is not a number.
    if not pd.isnull(x):
        return x
    if len(col) == 0:
        # Nothing observed to draw from: keep the value missing rather than
        # letting Series.sample() raise on an empty pool.
        return x
    # tolist() converts the drawn value to a plain Python scalar.
    return col.sample(n=1).tolist()[0]

# Seed NumPy's global random state so the random imputation is reproducible:
# Series.sample() draws from that state when no random_state is given.
# Note this also fixes the starting state for any later code using np.random.
IMPUTE_SEED = 0
np.random.seed(IMPUTE_SEED)

for col_name in train_df.columns.tolist():
    # Complete columns have nothing to fill; skipping them avoids a slow
    # element-by-element pass that would return the column unchanged.
    if not train_df[col_name].isnull().any():
        continue
    # Pool of observed values that this column's gaps are filled from.
    col = train_df[col_name].dropna()
    # Bind the pool as a default argument so each column uses its own values.
    train_df[col_name] = train_df[col_name].map(lambda x, col=col: impute_col(x, col))

In [None]:
train_df.describe()

In [None]:
# Same pairwise view as before, now on the imputed training frame, to check
# that the imputation did not visibly distort the distributions.
# NOTE(review): dropna() presumably removes nothing at this point, since the
# missing values were just filled - confirm.
g = sns.PairGrid(train_df.dropna())
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter);
# NOTE(review): no hue variable is passed to PairGrid, so this legend presumably
# has no entries - confirm whether the call is needed.
g.add_legend();

OK, things basically look the same, so hopefully I didn't mess anything up.

In [None]:
def get_Xy(df,label_col='age'):
    """Split a frame into a feature matrix and a label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the label column.
    label_col : str, optional
        Name of the label column (default ``'age'``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: ``X`` holds every column except ``label_col``, in the
        frame's column order; ``y`` holds the values of ``label_col``.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    # .values and the keyword axis work on old and new pandas alike; as_matrix()
    # and the positional axis argument to drop() are gone from newer releases.
    features = df.drop(label_col, axis=1).values
    labels = df[label_col].values
    return features, labels
# Feature matrix and label vector for each split, using get_Xy's default
# label column ('age').
# NOTE(review): only train_df went through the random imputation above; confirm
# test.csv has no '?'/'none' entries, otherwise test_X will contain NaN.
train_X,train_y = get_Xy(train_df)
test_X,test_y = get_Xy(test_df)

In [None]:
from sklearn.linear_model import LinearRegression
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older scikit-learn (< 0.18) only ships the search classes in grid_search;
    # newer releases dropped that module in favour of model_selection.
    from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Baseline model: standardise the features, then fit a linear regression.
# No hyper-parameter search is run here.
scaler = StandardScaler()
model = LinearRegression()
regr = Pipeline([('scaler',scaler),('model',model)])
regr.fit(train_X,train_y)

In [None]:
regr.score(train_X,train_y)

In [None]:
regr.score(test_X,test_y)

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    A new figure is created showing the mean training score and the mean
    cross-validation score against the number of training examples, each with
    a shaded band of one standard deviation across the folds.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds. None defers to
        scikit-learn's default. Passed straight through to ``learning_curve``.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes to evaluate, passed straight through to
        ``learning_curve`` (default: five values from 0.1 to 1.0).

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure as the current one.
    """
    # Imported locally so the function does not rely on a later cell having
    # imported learning_curve, and so it works on both old and new scikit-learn.
    try:
        from sklearn.model_selection import learning_curve
    except ImportError:
        from sklearn.learning_curve import learning_curve

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # Scores have one row per training size and one column per CV fold,
    # hence the reductions along axis=1 below.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded +/- one standard deviation bands around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    # Mean curves: red for training, green for cross-validation.
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


In [None]:
try:
    from sklearn.model_selection import learning_curve
except ImportError:
    # Older scikit-learn (< 0.18) only provides it in sklearn.learning_curve;
    # newer releases dropped that module in favour of model_selection.
    from sklearn.learning_curve import learning_curve

# Standardise the features once, then plot the learning curve of a fresh
# linear regression under 20-fold cross-validation.
# Note: this refits the `scaler` object currently bound in the notebook.
train_X_scaled = scaler.fit_transform(train_X)
# Trailing semicolon hides the repr of the returned pyplot module.
plot_learning_curve(LinearRegression(), "",train_X_scaled, train_y, cv=20,
                    train_sizes=np.linspace(0.1,1.,5));


Not terrible. What if we try something else, like a random forest?

In [None]:
from sklearn.ensemble import RandomForestRegressor
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    # Older scikit-learn (< 0.18) only ships the search classes in grid_search;
    # newer releases dropped that module in favour of model_selection.
    from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Random forest with 100 trees, tuned by a randomised search (20 draws,
# 10-fold CV) over max_features and min_samples_leaf.
scaler = StandardScaler()
model = RandomForestRegressor(n_estimators=100)
params = {'max_features':np.linspace(0.3,1.,6),'min_samples_leaf':[1,2,3,4,5]}
cv = RandomizedSearchCV(model,params,n_jobs=2,cv=10,n_iter=20)
# The search object is the final pipeline step, so later cells reach the tuned
# model through regr.named_steps['cv'].
regr = Pipeline([('scaler',scaler),('cv',cv)])
regr.fit(train_X,train_y)

In [None]:
regr.score(train_X,train_y)

In [None]:
regr.score(test_X,test_y)

In [None]:
regr.named_steps['cv'].best_estimator_

In [None]:
# List the features from most to least important, according to the
# feature_importances_ of the search's best estimator.
importance = regr.named_steps['cv'].best_estimator_.feature_importances_
# Feature names in the same column order get_Xy() used to build train_X.
features = train_df.drop('age', axis=1).columns.tolist()
indices = np.argsort(importance)
for index in indices[::-1]:
    # One pre-formatted argument prints the same "name value" line under both
    # Python 2 and Python 3; the bare print statement fails on Python 3.
    print('%s %s' % (features[index], importance[index]))

In [None]:
# Predicted vs. actual age for both splits, using whichever pipeline `regr`
# currently holds. Each split is predicted once; the previous version
# predicted every split twice and paired the train predictions with test_y
# in assignments that were never used.
train_pred = regr.predict(train_X)
test_pred = regr.predict(test_X)

plt.scatter(train_pred,train_y)
plt.scatter(test_pred,test_y,color='r')
# The legend is added before the reference line is drawn, so its two labels
# attach to the two scatter series.
plt.legend(['Train','Test'],loc=4)

# Identity line: points lying on it are perfect predictions.
plt.plot(np.linspace(0, 30, 1000),np.linspace(0, 30, 1000))
plt.xlabel('Predicted Age')
plt.ylabel('Actual Age')

plt.ylim([0,30])
plt.xlim([0,30])

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    # Older scikit-learn (< 0.18) only ships the search classes in grid_search;
    # newer releases dropped that module in favour of model_selection.
    from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Support vector regression on standardised features, tuning C over 20
# log-spaced values from 1e-5 to 1e5 (10-fold CV).
scaler = StandardScaler()
model = SVR()
params = {'C':np.logspace(-5,5,20)}
# n_iter equals the number of candidate C values.
cv = RandomizedSearchCV(model,params,n_jobs=2,cv=10,n_iter=20)
regr = Pipeline([('scaler',scaler),('cv',cv)])
regr.fit(train_X,train_y)

In [None]:
regr.score(train_X,train_y)

In [None]:
regr.score(test_X,test_y)

In [None]:
# Predicted vs. actual age for both splits, using whichever pipeline `regr`
# currently holds. Each split is predicted once; the previous version
# predicted every split twice and paired the train predictions with test_y
# in assignments that were never used.
train_pred = regr.predict(train_X)
test_pred = regr.predict(test_X)

plt.scatter(train_pred,train_y)
plt.scatter(test_pred,test_y,color='r')
# The legend is added before the reference line is drawn, so its two labels
# attach to the two scatter series.
plt.legend(['Train','Test'],loc=4)

# Identity line: points lying on it are perfect predictions.
plt.plot(np.linspace(0, 30, 1000),np.linspace(0, 30, 1000))
plt.xlabel('Predicted Age')
plt.ylabel('Actual Age')

plt.ylim([0,30])
plt.xlim([0,30])