In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import sklearn.metrics

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [8]:
# Widen pandas' display limits so large/wide frames are shown without truncation.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 10000)

In [18]:
class featureEng:
    """Feature-engineering helper wrapped around one pandas DataFrame.

    On construction the non-target columns are split by dtype into numeric
    and non-numeric ("nominal") groups, every nominal column is one-hot
    encoded, and the result is assembled into ``base_df`` (dummies, numeric
    columns, target). The other methods either modify ``base_df`` in place on
    the instance or score the raw numeric columns one at a time.

    Attributes:
        df: the DataFrame passed in (not copied, not modified).
        target: label of the target column.
        features: ``df`` without the target column.
        num_features: Index of numeric feature labels.
        nom_features: Index of non-numeric feature labels.
        dummies: one-hot encoding of the nominal columns (``drop_first=True``).
        base_df: working frame that ``feature_dropping`` and ``poly`` update.
        model: estimator created by the latest ``num_features_score`` call.
    """

    def __init__(self, df, target):
        """Set up the working frame from a DataFrame and its target column.

        Args:
            df: pandas DataFrame that contains the features and the target.
            target: label of the target column in ``df``.

        Prints the shape of ``df`` and its total count of null values.
        """
        self.df = df
        self.target = target
        self.features = self.df.drop(target, axis=1)

        self.num_features = self.features.select_dtypes(include='number').columns
        self.nom_features = self.features.select_dtypes(exclude='number').columns

        # Every nominal feature is dummy-encoded; there is no per-column opt-out.
        self.dummies = pd.get_dummies(self.df[self.nom_features], drop_first=True)

        self.base_df = pd.concat(
            [self.dummies, self.df[self.num_features], self.df[self.target]],
            axis=1,
        )

        print(f'File shape: {self.df.shape}, Null Values: {self.df.isna().sum().sum()}')

    @staticmethod
    def _as_label_list(labels):
        """Return ``labels`` as a list; a lone label (str, tuple, scalar) is wrapped."""
        if pd.api.types.is_list_like(labels) and not isinstance(labels, tuple):
            return list(labels)
        return [labels]

    def feature_dropping(self, drop_features):
        """Remove columns from ``base_df``.

        Args:
            drop_features: a single column label or a list-like of labels.

        Raises:
            KeyError: if a label is not a column of ``base_df``.
        """
        # The labels used to be wrapped in a second list, which made pandas
        # reject the list input this method is documented to take.
        self.base_df = self.base_df.drop(self._as_label_list(drop_features), axis=1)
        print('features dropped!')

    def poly(self, features):
        """Swap the given ``base_df`` columns for their polynomial expansion.

        The columns are transformed with ``PolynomialFeatures(include_bias=False)``
        (library-default degree), removed from ``base_df``, and the generated
        columns are appended in their place.

        Args:
            features: list-like of column labels in ``base_df`` (a single
                label is also accepted).
        """
        features = self._as_label_list(features)
        X = self.base_df[features]

        poly = PolynomialFeatures(include_bias=False)
        X_poly = poly.fit_transform(X)

        # get_feature_names was removed in scikit-learn 1.2; fall back to it
        # only on versions that predate get_feature_names_out.
        if hasattr(poly, 'get_feature_names_out'):
            p_features = poly.get_feature_names_out(features)
        else:
            p_features = poly.get_feature_names(features)

        # Reuse the source index so the column-wise concat below lines rows up
        # even when base_df does not carry a default RangeIndex.
        poly_features = pd.DataFrame(X_poly, columns=p_features, index=X.index)

        self.base_df = self.base_df.drop(features, axis=1)  # remove features used
        self.base_df = pd.concat([self.base_df, poly_features], axis=1)

        print('DataFrame updated with Polynomial features!')

    def num_features_score(self, model=None):
        """Score each numeric feature on its own as a single-predictor model.

        For every label in ``num_features`` the raw column from ``df`` is split
        with ``train_test_split(random_state=32)``, the estimator is fitted on
        the training part, and three scores are recorded: the mean 5-fold
        cross-validation score on the training part, the training score and
        the test score.

        Args:
            model: estimator class (called with no arguments to build the
                estimator). ``None`` means ``LinearRegression``.

        Returns:
            DataFrame with columns ``feature``, ``val_score``, ``train_score``
            and ``test_score``, sorted by ``val_score`` descending. It is empty
            when there are no numeric features.
        """
        # Resolve the default lazily so defining the class does not require
        # scikit-learn to be imported first.
        self.model = (LinearRegression if model is None else model)()

        y = self.df[self.target]
        rows = []

        for feature in self.num_features:
            X = self.df[[feature]]

            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)
            self.model.fit(X_train, y_train)

            val_score = cross_val_score(self.model, X_train, y_train, cv=5).mean()
            test_score = self.model.score(X_test, y_test)
            train_score = self.model.score(X_train, y_train)

            rows.append({'feature': feature, 'val_score': val_score,
                         'train_score': train_score, 'test_score': test_score})

        # Explicit columns keep the frame well-formed (and sortable) even when
        # the loop body never ran.
        summary = pd.DataFrame(
            rows, columns=['feature', 'val_score', 'train_score', 'test_score']
        )
        return summary.sort_values('val_score', ascending=False)
    


In [33]:
# Load the cleaned training data; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/train_clean.csv')

In [43]:
# Columns kept for modelling, grouped by how the notebook names them
# (numeric vs. nominal); the actual dtypes are not checked here.
features_num = [
    'overall_qual', 'gr_liv_area', 'garage_area', 'garage_cars', 'total_bsmt_sf',
    '1st_flr_sf', 'year_built', 'year_remod/add', 'garage_yr_blt', 'full_bath',
]

features_nom = [
    'exterior_2nd', 'mas_vnr_type', 'exter_qual', 'foundation', 'bsmt_exposure',
    'bsmtfin_type_1', 'heating_qc', 'kitchen_qual', 'neighborhood', 'bldg_type',
    'house_style', 'exterior_1st', 'garage_finish', 'garage_type', 'fireplace_qu',
]

target = ['saleprice']

# Full column selection: nominal columns first, then numeric, then the target.
features = [*features_nom, *features_num, *target]

In [44]:
# Keep only the selected columns; note this rebinds `df`, so the full frame is gone until the CSV is re-read.
df = df[features]

In [45]:
# Build the feature-engineering helper with 'saleprice' as the target; the constructor prints the shape and null count.
df_features = featureEng(df, 'saleprice')

File shape: (2051, 26), Null Values: 0


In [46]:
# Replace every numeric feature column in base_df with its polynomial expansion (mutates df_features.base_df).
df_features.poly(df_features.num_features)

DataFrame updated with Polynomial features!


In [47]:
# Inspect the size of the engineered frame after the polynomial expansion.
df_features.base_df.shape

(2051, 175)

In [48]:
# Persist the engineered frame for modelling. NOTE(review): no index=False is passed, so the row
# index is written as an extra first column — confirm the reader of this file expects that.
df_features.base_df.to_csv('../datasets/model_data.csv')