In [41]:
# Your code here - remember to use markdown cells for comments as well!
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.stats import skew
from scipy.special import boxcox1p
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [42]:
## Function for Ridge regression with n features
def withnfeatures(data,n):
    """Fit a cross-validated ridge regression on the n features most correlated with 'price'.

    Every numeric column other than 'price' is ranked by the absolute value
    of its Pearson correlation with 'price'; the top ``n`` become the
    predictors. The rows are split 70/30 with ``random_state=42`` and
    ``RidgeCV`` chooses the penalty from 20 log-spaced values in [1, 1000].

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric 'price' column. Non-numeric columns are
        ignored when ranking.
        NOTE(review): ``act_err`` inverts the target with expm1, so 'price'
        is presumably already log1p-transformed here -- confirm with caller.
    n : int
        Number of features to keep.

    Returns
    -------
    err : float
        Root-mean-squared error on the held-out split, in the units of
        ``data['price']``.
    l : numpy.ndarray
        Names of the selected features, strongest correlation first.
    act_err : float
        Root-mean-squared error after mapping both predictions and targets
        back through ``expm1``.
    """
    # Rank by |corr(feature, price)|. The target is removed by name rather
    # than by position, and undefined correlations (e.g. constant columns)
    # sort last, so neither can end up among the selected features.
    numeric = data.select_dtypes(include='number')
    strength = numeric.corr()['price'].drop('price').abs()
    l = strength.sort_values(ascending=False).index[:n].values

    X = data[l]
    y = data['price']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    ridge = RidgeCV(alphas=np.logspace(0,3,20)).fit(X_train, y_train)
    testpred = ridge.predict(X_test)

    # Negative predictions are replaced by the training mean before scoring.
    testpred[testpred < 0] = np.mean(y_train)

    y_true = y_test.values
    err = np.sqrt(np.mean((testpred - y_true)**2))
    # Apply the same inverse transform (expm1 undoes log1p) to both sides
    # before taking the difference.
    act_err = np.sqrt(np.mean((np.expm1(testpred) - np.expm1(y_true))**2))

    return err, l, act_err

In [43]:
#### Function to convert categorical data to numerical by establishing a monotonic relationship
def cattonum(data_i,feature,target,log_target=True):
    """Ordinal-encode a categorical column by the mean of the target per category.

    Categories are ordered by ascending mean of ``target`` and numbered
    1..k in that order. An error-bar plot (mean +/- half a standard
    deviation per category, in the same order) is drawn on the current
    matplotlib axes as a side effect.

    Parameters
    ----------
    data_i : pandas.DataFrame
        Input frame; it is copied, not modified.
    feature : str
        Name of the categorical column to encode.
    target : str
        Name of the column whose per-category mean defines the order.
    log_target : bool, default True
        If True, ``target`` is replaced by ``log1p(target)`` before the
        means are computed. The transformed target is what the returned
        frame contains, so calling this repeatedly on its own output with
        ``log_target=True`` applies the transform again each time.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_i`` with ``feature`` replaced by its integer code.
        Missing values in ``feature`` stay missing.
    """
    data = data_i.copy()
    if log_target:
        data[target] = np.log1p(data[target])

    # Per-category mean and std in one pass. sort=False keeps the order of
    # first appearance; groupby skips missing keys, so NaN entries in
    # `feature` neither get a code nor break the bookkeeping.
    stats = data.groupby(feature, sort=False, observed=True)[target].agg(['mean', 'std'])
    stats = stats.iloc[stats['mean'].values.argsort()]
    labels = list(stats.index)
    positions = range(len(labels))

    # Plot the categories in ascending order of their mean target
    plt.errorbar(positions, stats['mean'].values, 0.5*stats['std'].values, marker='o', mfc='red',
         mec='blue', ms=2, mew=4)
    plt.xticks(positions, labels, rotation='vertical')

    # Map each label to its rank (1..k); map yields a numeric column
    # directly instead of depending on replace's dtype inference.
    codes = dict(zip(labels, range(1, len(labels)+1)))
    data[feature] = data[feature].map(codes)
    return data

In [47]:
def cattonum_lin(data_i,feature,target,log_target=True,savepath='cat_to_num_linear.png'):
    """Encode a categorical column so that the target mean is linear in the code.

    Each category is mapped to a value in [1, k] (k = number of categories)
    obtained by linearly rescaling its mean target: the category with the
    lowest mean gets 1, the one with the highest mean gets k, and the rest
    fall in between in proportion to their means.

    A two-panel figure is created: the left panel shows mean +/- half a
    standard deviation per category in order of first appearance, the right
    panel shows the same statistics against the engineered codes.

    Parameters
    ----------
    data_i : pandas.DataFrame
        Input frame; it is copied, not modified.
    feature : str
        Name of the categorical column to encode.
    target : str
        Name of the column whose per-category mean drives the encoding.
    log_target : bool, default True
        If True, ``target`` is replaced by ``log1p(target)`` before the
        means are computed; the returned frame carries the transformed
        target.
    savepath : str or None, default 'cat_to_num_linear.png'
        Where the figure is written. Pass None to skip saving.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_i`` with ``feature`` replaced by its engineered
        code. Missing values in ``feature`` stay missing.
    """
    data = data_i.copy()
    if log_target:
        data[target] = np.log1p(data[target])

    # Per-category mean and std in order of first appearance; groupby skips
    # missing keys, so NaN entries in `feature` are simply left as NaN.
    stats = data.groupby(feature, sort=False, observed=True)[target].agg(['mean', 'std'])
    ranked = stats.iloc[stats['mean'].values.argsort()]
    labels = list(ranked.index)
    means = ranked['mean'].values
    k = len(labels)

    # Linear rescale of the sorted means onto [1, k]. With a single
    # category, or when every category has the same mean, the span is zero
    # and there is nothing to spread out: every category gets code 1.
    if k > 1 and means[-1] != means[0]:
        eng_lab = 1 + (means - means[0])*((k - 1)/(means[-1] - means[0]))
    else:
        eng_lab = np.ones(k)

    data[feature] = data[feature].map(dict(zip(labels, eng_lab)))

    ylabel = 'log ' + str(target) if log_target else str(target)
    fig, (ax_raw, ax_eng) = plt.subplots(1, 2, figsize=(16,8))

    # Left: categories as they first appear, ticks on the plotted positions
    x_raw = range(1, k+1)
    ax_raw.errorbar(x_raw, stats['mean'].values, 0.5*stats['std'].values, marker='o', mfc='red',
                    mec='blue', ms=2, mew=4)
    ax_raw.set_xticks(list(x_raw))
    ax_raw.set_xticklabels(list(stats.index), rotation='vertical')
    ax_raw.set_ylabel(ylabel)

    # Right: same statistics against the engineered codes, each label placed
    # at the code it was actually assigned
    ax_eng.errorbar(eng_lab, means, 0.5*ranked['std'].values, marker='o', mfc='red',
                    mec='blue', ms=2, mew=4)
    ax_eng.set_xticks(list(eng_lab))
    ax_eng.set_xticklabels(labels, rotation='vertical')
    ax_eng.set_ylabel(ylabel)

    fig.tight_layout()
    if savepath is not None:
        fig.savefig(savepath)
    return data

In [45]:
def fix_skewed(data,feats,threshold=0.75):
    """Log-transform the right-skewed columns among ``feats``.

    Skewness is measured with ``scipy.stats.skew`` on the non-missing
    values of each column; columns whose skewness exceeds ``threshold``
    are replaced by ``log1p`` of their values.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    feats : list-like of column labels
        Columns to inspect. They must be numeric.
    threshold : float, default 0.75
        Columns with skewness strictly greater than this are transformed.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the ``feats`` columns, with the skewed
        ones transformed and the others unchanged.
    """
    df_log = data[feats].copy()
    skewness = df_log.apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > threshold].index
    # Nothing to assign when no column crosses the threshold
    if len(skewed_feats) > 0:
        df_log[skewed_feats] = np.log1p(df_log[skewed_feats])
    return df_log
    

In [46]:
def plot_corr(data):
    """Draw an annotated correlation heatmap ordered by correlation with 'price'.

    The numeric columns of ``data`` are sorted by descending correlation
    with the 'price' column and their pairwise correlation matrix is shown
    as a single annotated heatmap on a new 10x10 inch figure.

    Side effect: calls ``sns.set(font_scale=1.25)``, which changes the
    seaborn/matplotlib settings for every later plot in the session.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric 'price' column. Non-numeric columns are
        ignored.

    Returns
    -------
    pandas.Index
        Column labels in the order they appear on the heatmap.
    """
    numeric = data.select_dtypes(include='number')
    corrmat = numeric.corr()

    # Order the variables by how strongly they correlate with the target
    k = numeric.shape[1]
    cols = corrmat.nlargest(k, 'price')['price'].index
    ordered = corrmat.loc[cols, cols].values

    # Apply the font scale before the figure exists so the first call is
    # rendered the same way as every later one.
    sns.set(font_scale=1.25)
    _, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(ordered, vmax=1., cbar=True, annot=True, square=True, fmt='.2f',
                annot_kws={'size': 10}, yticklabels=cols.values,
                xticklabels=cols.values, ax=ax)
    return cols