In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import sklearn.metrics

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet

from sklearn.pipeline import Pipeline

In [12]:
class lr_model:
    """Evaluate a scikit-learn ``Pipeline`` of regression steps on one DataFrame.

    The frame is split into a target column and feature columns once, at
    construction time. Every method then re-splits those into train/test
    parts with ``train_test_split`` and fits ``self.pipe`` on the train part.

    Parameters
    ----------
    df : pandas.DataFrame
        Full modeling table, target column included.
    target : str
        Name of the column to predict; it is dropped from the features.
    pipe_steps : list of (name, estimator) tuples
        Passed unchanged to ``Pipeline``.

    Attributes
    ----------
    summary : pandas.DataFrame
        One row per evaluated split, accumulated across ``test_models`` calls.
    """

    _SUMMARY_COLUMNS = ['random_state', 'val_score', 'train_score', 'test_score']

    def __init__(self, df, target, pipe_steps):
        self.df = df
        self.target = self.df[target]
        self.features = self.df.drop(target, axis=1)

        # Numeric / non-numeric views of the features, kept for callers.
        self.num_features = self.features.select_dtypes(include='number')
        self.nom_features = self.features.select_dtypes(exclude='number')

        self.pipe = Pipeline(pipe_steps)

        self.summary = pd.DataFrame(columns=self._SUMMARY_COLUMNS)

    def _split(self, seed):
        """Return ``(X_train, X_test, y_train, y_test)`` for the given seed."""
        return train_test_split(self.features, self.target, random_state=seed)

    def test_models(self, run_time=3, seeds=None):
        """Fit and score the pipeline on several random train/test splits.

        Parameters
        ----------
        run_time : int, default 3
            Number of distinct seeds drawn from ``range(100)`` when ``seeds``
            is not given (``np.random.choice`` raises if it exceeds 100).
        seeds : iterable of int, optional
            Explicit seeds to evaluate instead of drawing random ones, which
            makes a run reproducible.

        Returns
        -------
        pandas.DataFrame
            ``self.summary`` with one new row per seed: the seed, the mean
            5-fold cross-validation score on the train split, and the
            pipeline's train and test scores, each rounded to 2 decimals.
        """
        if seeds is None:
            seeds = np.random.choice(100, run_time, replace=False)

        rows = []
        for seed in seeds:
            seed = int(seed)
            X_train, X_test, y_train, y_test = self._split(seed)
            self.pipe.fit(X_train, y_train)

            val_score = round(cross_val_score(self.pipe, X_train, y_train, cv=5).mean(), 2)
            test_score = round(self.pipe.score(X_test, y_test), 2)
            train_score = round(self.pipe.score(X_train, y_train), 2)

            rows.append({'random_state': seed, 'val_score': val_score,
                         'train_score': train_score, 'test_score': test_score})

        # Build the new rows once: DataFrame.append no longer exists in
        # pandas >= 2.0, and growing a frame row by row is quadratic anyway.
        new_rows = pd.DataFrame(rows, columns=self._SUMMARY_COLUMNS)
        if self.summary.empty:
            # Skip concatenating with an empty frame so the numeric dtypes
            # of the new rows are kept.
            self.summary = new_rows
        else:
            self.summary = pd.concat([self.summary, new_rows], ignore_index=True)

        return self.summary

    def final_model(self, seed):
        """Fit the pipeline on the train part of the split defined by ``seed``."""
        X_train, _, y_train, _ = self._split(seed)
        self.pipe.fit(X_train, y_train)

    def predictions(self, data):
        """Return ``self.pipe.predict(data)``; the pipeline must be fitted first."""
        return self.pipe.predict(data)

    def coef_score(self, seed=8):
        """Fit the pipeline and tabulate the final step's coefficients per feature.

        Parameters
        ----------
        seed : int, default 8
            ``random_state`` of the train/test split used for the fit.

        Returns
        -------
        pandas.DataFrame
            Indexed by feature name, sorted by ``avg_change`` descending, with
            columns ``coefficients`` (rounded to 2 decimals),
            ``avg_feature_value`` (mean), ``avg_feature__median`` (median),
            ``avg_change`` (mean * coefficient) and ``count`` (number of
            strictly positive values).

        Raises
        ------
        ValueError
            If the final step exposes a different number of coefficients than
            there are feature columns.
        """
        X_train, _, y_train, _ = self._split(seed)
        self.pipe.fit(X_train, y_train)

        # The coefficients live on the last pipeline step, not on the Pipeline.
        estimator = self.pipe.steps[-1][1]
        coefficients = pd.Series(np.round(np.ravel(estimator.coef_), 2),
                                 index=self.features.columns)

        summary = pd.DataFrame({'coefficients': coefficients,
                                'avg_feature_value': self.features.mean(),
                                'avg_feature__median': self.features.median()})

        # NOTE(review): the coefficients are on whatever scale the final step
        # saw. If an earlier step rescales the features, multiplying by the
        # raw feature means mixes two scales - confirm this is intended.
        summary['avg_change'] = summary['avg_feature_value'] * summary['coefficients']
        summary['count'] = self.features[self.features > 0].count()

        return summary.sort_values('avg_change', ascending=False)
    

In [3]:
# Load the modeling table; the cells below use its 'saleprice' column as the target.
df = pd.read_csv('../datasets/model_data.csv')

In [4]:
# Baseline pipeline: standardize the features, then fit ordinary least squares.
# NOTE(review): the recorded results for this pipeline contain val_score values
# as extreme as -1e25 - presumably numerically unstable folds (e.g. collinear
# or near-constant columns); verify before trusting this baseline.
lr_mysteps = [('sc', StandardScaler()), ('lr', LinearRegression())]

In [6]:
lr_model = lr_model(df, 'saleprice', lr_mysteps)

In [7]:
lr_model.test_models(10)

Unnamed: 0,random_state,val_score,train_score,test_score
0,18.0,0.78,0.93,0.92
1,8.0,0.77,0.93,0.88
2,3.0,-1.394507e+22,0.93,0.88
3,13.0,-1.935087e+24,0.93,0.87
4,6.0,-1.120808e+25,0.93,0.89
5,33.0,0.85,0.94,0.86
6,19.0,-1.631236e+21,0.93,0.85
7,96.0,0.88,0.93,0.89
8,57.0,-2.681459e+18,0.93,-3.6761e+24
9,85.0,0.8,0.93,0.87


In [8]:
# Ridge pipeline: standardize the features, then fit Ridge with default settings.
ridge_mysteps = [('sc', StandardScaler()), ('lr', Ridge())]

In [13]:
# Wrap the ridge pipeline around `df`, predicting the 'saleprice' column.
ridge_model = lr_model(df, 'saleprice', ridge_mysteps)

In [43]:
# Score the ridge pipeline on 10 randomly seeded train/test splits; the returned summary table is the cell output.
ridge_model.test_models(10)

Unnamed: 0,random_state,val_score,train_score,test_score
0,57.0,0.88,0.93,0.91
1,30.0,0.92,0.94,0.22
2,60.0,0.67,0.93,0.91
3,15.0,0.76,0.93,0.89
4,29.0,0.88,0.93,0.91
5,43.0,0.77,0.94,0.87
6,81.0,0.89,0.93,0.89
7,48.0,0.79,0.93,0.9
8,21.0,0.83,0.93,0.89
9,62.0,0.9,0.93,0.31


In [14]:
# Lasso pipeline: standardize the features, then fit Lasso with default settings.
# NOTE(review): the recorded output of the run cell is dominated by truncated
# warning residue ("positive)") - presumably solver convergence warnings;
# consider a larger max_iter or alpha, then confirm the warnings disappear.
lasso_mysteps = [('sc', StandardScaler()), ('lr', Lasso())]

In [16]:
# Wrap the lasso pipeline around `df`, predicting the 'saleprice' column.
lasso_model = lr_model(df, 'saleprice', lasso_mysteps)

In [17]:
# Score the lasso pipeline on 10 randomly seeded train/test splits; the returned summary table is the cell output.
lasso_model.test_models(10)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Unnamed: 0,random_state,val_score,train_score,test_score
0,15.0,0.77,0.93,0.88
1,32.0,0.83,0.93,0.91
2,70.0,0.71,0.93,0.9
3,4.0,0.92,0.94,0.15
4,49.0,0.89,0.94,0.86
5,45.0,0.82,0.93,0.87
6,96.0,0.88,0.93,0.91
7,59.0,0.89,0.93,0.89
8,3.0,0.79,0.93,0.88
9,21.0,0.83,0.93,0.89


In [21]:
# Elastic-net pipeline: standardize the features, then fit ElasticNet with default settings.
el_mysteps = [('sc', StandardScaler()), ('lr', ElasticNet())]

In [19]:
# Wrap the elastic-net pipeline around `df`, predicting the 'saleprice' column.
el_model = lr_model(df, 'saleprice', el_mysteps)

In [20]:
# Score the elastic-net pipeline on 10 randomly seeded train/test splits; the returned summary table is the cell output.
el_model.test_models(10)

Unnamed: 0,random_state,val_score,train_score,test_score
0,26.0,0.76,0.88,0.9
1,41.0,0.9,0.91,0.54
2,65.0,0.85,0.88,0.89
3,72.0,0.81,0.89,0.86
4,81.0,0.86,0.89,0.87
5,51.0,0.87,0.89,0.86
6,68.0,0.8,0.89,0.86
7,9.0,0.85,0.88,0.89
8,23.0,0.77,0.88,0.89
9,91.0,0.85,0.88,0.89
