In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import sklearn.metrics

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet

from sklearn.pipeline import Pipeline

In [40]:
class lr_model:
    'Modeling Class, default to LinearRegression'
    def __init__(self, df, target, pipe_steps):
        self.df = df
        self.target = self.df[target]
        self.features = self.df.drop(target, axis=1)
        
        self.num_features = self.features.select_dtypes(include='number')
        self.nom_features = self.features.select_dtypes(exclude='number')
        
        self.pipe = Pipeline(pipe_steps)
        
        self.summary = pd.DataFrame({'random_state' : [], 'val_score': [], 'train_score' : [], 'test_score' : []})
        
    def test_models(self, run_time=3):
        'Run Models X amount of time with different random state'
        for i in np.random.choice(100, run_time, replace=False):
            seed = i
            X = self.features
            y = self.target

            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed) 
            self.pipe.fit(X_train, y_train)

            val_score = round(cross_val_score(self.pipe, X_train, y_train, cv=5).mean(), 2)
            test_score = round(self.pipe.score(X_test, y_test), 2)
            train_score = round(self.pipe.score(X_train, y_train), 2)

            self.summary = self.summary.append({'random_state' : i, 'val_score': val_score, 
                                 'train_score' : train_score, 'test_score' : test_score}, ignore_index=True)
        
        return self.summary
    
    def final_model(self):
        pass
    
    def predictions(self):
        pass
                
        
    def coef_score(self):
        
        X = self.features
        y = self.target
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8) 
        self.model.fit(X_train, y_train)
        
                    
        val_score = cross_val_score(self.model, X_train, y_train, cv=5).mean()
        test_score = self.model.score(X_test, y_test)
        train_score = self.model.score(X_train, y_train)
        
        
        summary = pd.DataFrame({"coefficients": np.transpose([(round(coef, 2)) for coef in self.model.coef_]), 
                             'avg_feature_value' : self.features.mean(), 
                             'avg_feature__median' : self.features.median()})
        
        summary['avg_change'] = summary['avg_feature_value'] * summary['coefficients']
        summary['count'] = self.features[self.features > 0].count()
        
        return summary.sort_values('avg_change', ascending=False)
    

In [34]:
df = pd.read_csv('../datasets/model_data.csv')

In [35]:
lr_mysteps = [
    ('sc', StandardScaler()),
    ('lr', LinearRegression()) # estimator         
    ]

In [36]:
lr_model = lr_model(df, 'saleprice', mysteps)

In [37]:
lr_model.test_models(10)

Unnamed: 0,random_state,val_score,train_score,test_score
0,73.0,0.91,0.93,0.4
1,68.0,0.76,0.93,0.87
2,92.0,0.76,0.93,0.88
3,95.0,0.89,0.93,0.91
4,64.0,0.83,0.93,0.87
5,46.0,0.87,0.93,0.9
6,65.0,0.88,0.93,0.91
7,31.0,0.65,0.93,0.91
8,17.0,0.69,0.93,0.9
9,60.0,0.67,0.93,0.91


In [41]:
ridge_mysteps = [
    ('sc', StandardScaler()),
    ('lr', Ridge()) # estimator         
    ]

In [42]:
ridge_model = lr_model(df, 'saleprice', mysteps)

In [43]:
ridge_model.test_models(10)

Unnamed: 0,random_state,val_score,train_score,test_score
0,57.0,0.88,0.93,0.91
1,30.0,0.92,0.94,0.22
2,60.0,0.67,0.93,0.91
3,15.0,0.76,0.93,0.89
4,29.0,0.88,0.93,0.91
5,43.0,0.77,0.94,0.87
6,81.0,0.89,0.93,0.89
7,48.0,0.79,0.93,0.9
8,21.0,0.83,0.93,0.89
9,62.0,0.9,0.93,0.31


In [44]:
lasso_mysteps = [
    ('sc', StandardScaler()),
    ('lr', Lasso()) # estimator         
    ]

In [45]:
lasso_model = lr_model(df, 'saleprice', mysteps)

In [46]:
lasso_model.test_models(10)

Unnamed: 0,random_state,val_score,train_score,test_score
0,80.0,0.88,0.92,0.92
1,51.0,0.9,0.94,0.88
2,84.0,0.81,0.93,0.89
3,8.0,0.79,0.93,0.89
4,49.0,0.9,0.94,0.87
5,24.0,0.88,0.93,0.92
6,16.0,0.91,0.93,0.26
7,23.0,0.74,0.92,0.92
8,68.0,0.76,0.93,0.87
9,88.0,0.71,0.92,0.92


In [47]:
el_mysteps = [
    ('sc', StandardScaler()),
    ('lr', ElasticNet()) # estimator         
    ]

In [48]:
el_model = lr_model(df, 'saleprice', el_mysteps)

In [49]:
el_model.test_models(10)

Unnamed: 0,random_state,val_score,train_score,test_score
0,25.0,0.87,0.89,0.88
1,97.0,0.86,0.89,0.86
2,32.0,0.84,0.88,0.86
3,38.0,0.84,0.89,0.84
4,22.0,0.81,0.89,0.84
5,5.0,0.86,0.89,0.86
6,91.0,0.85,0.88,0.89
7,15.0,0.81,0.88,0.87
8,92.0,0.81,0.89,0.87
9,66.0,0.86,0.88,0.9
