In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from Preprocessing import merge_dfs

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from itertools import combinations

In [4]:

class FeatureSelector():
    def __init__(self, model, X_train, y_train, max_combos, features, splits=5) -> None:
        self.model = model
        self.X = X_train
        self.y = y_train
        self.splits = splits
        self.max_combos = max_combos
        self.features = features # list containing names of the features you want to select from
        self.features_to_drop = None # name of the optimal combination of features
        self.best_score = 10000

    def test_performance(self, X_variant):
        """Performs K-fold to test performance of a dataframe"""
        scores = cross_val_score(self.model, X_variant, self.y, cv=self.splits, scoring='neg_mean_squared_error')
        return np.sqrt(-np.mean(scores))


    def find_combos(self):
        """
        0. create a list 
        1. loops trough n_combos
        2. append to list all combos of size of loop+1 (0,1,2)
        3. return list
        """
        combo_list = []
        for i in range(self.max_combos):
            combinations_list = list(combinations(self.features, i))
            for combination in combinations_list:
                combo_list.append(list(combination))
        return combo_list


    def fit(self) -> None:
        """
        1. runds find combos
        2. loops trough all elements in find combos list 
        3. create a copy of X
        4. remove the columns from the copy
        5. test performance on the new df
        6. if lower than best_SCORE save
        """
        combo_list = self.find_combos()
        for combination in combo_list:
            X_copy = self.X.copy()
            X_copy.drop(columns=combination, inplace=True)
            score = self.test_performance(X_copy)
            if score < self.best_score:
                self.best_score = score
                self.features_to_drop = combination
    
    def get_best(self):
        new_X = self.X.drop(columns=self.features_to_drop)
        return self.features_to_drop, new_X


In [5]:
from Preprocessing import merge_dfs    
from sklearn.model_selection import train_test_split
# Build the working DataFrame via Preprocessing.merge_dfs().
df = merge_dfs()

# Set the 2023 rows aside for later, without the target column.
data_2023 = df.loc[df['Aarstall'] == 2023].drop(columns=['Trafikkmengde'])

# Keep all other years, then discard rows whose target value is NaN.
data = df.loc[df['Aarstall'] != 2023]
data = data.dropna(subset=['Trafikkmengde'])

# Separate the target from the features.
y = data['Trafikkmengde']
X = data.drop(columns=['Trafikkmengde'])

# Unshuffled 70/30 split: row order is preserved, the test set is the tail of the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)


Unike verdier i "Felt": ['1' '2' 'Totalt i retning Danmarksplass' 'Totalt i retning Florida'
 'Totalt']
       Trafikkmengde
count   65250.000000
mean       50.380552
std        69.841572
min         0.000000
25%         5.000000
50%        25.000000
75%        64.000000
max       608.000000
Andel av værdata som inneholder "Relativ Luftfuktighet: 0.11075391168620219
       Globalstraling    Solskinstid  Lufttemperatur    Vindretning  \
count   707823.000000  707822.000000   707822.000000  707824.000000   
mean       157.308492      70.214651       49.650598     247.069506   
std        828.637784     827.357277      641.437364     659.156188   
min        -97.200000       0.000000      -11.400000       2.000000   
25%         -0.400000       0.000000        4.100000     146.000000   
50%          5.600000       0.000000        8.100000     164.000000   
75%        101.200000       0.000000       12.600000     298.000000   
max       9999.990000    9999.990000     9999.990000    9999.99

In [6]:
# Display the column labels of the training features.
X_train.columns

Index(['Solskinstid', 'Lufttemperatur', 'Vindstyrke', 'Lufttrykk', 'Vindkast',
       'Globalstraling', 'Vindretning', 'Ukedag', 'Maaned', 'Aarstall',
       'Klokkeslett', 'Rod_dag'],
      dtype='object')

In [7]:
# Candidate columns (a subset of X_train's columns) that feature selection may drop.
features = [
    'Solskinstid',
    'Lufttemperatur',
    'Vindstyrke',
    'Lufttrykk',
    'Vindkast',
    'Globalstraling',
    'Vindretning',
]

# Full set of training column labels, kept so the frame can be rebuilt after imputation.
all_features = X_train.columns

In [8]:
import pickle

# SECURITY NOTE: unpickling can execute arbitrary code contained in the file.
# Only load 'model.pkl' when it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)




https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
from sklearn.impute import KNNImputer
# KNN-based imputer created with its default settings; applied to X_train further down.
imputer = KNNImputer()

In [10]:
# Check the container type of X_train before imputation (a pandas DataFrame at this point).
type(X_train)

pandas.core.frame.DataFrame

In [11]:
# Fit the imputer on the training features and fill their missing values.
# The result is a NumPy array, so column labels are lost here, and X_train is
# rebound to it: the un-imputed frame is no longer reachable under this name.
X_train = imputer.fit_transform(X_train)

In [12]:
# Confirm what the imputer returned (a NumPy array rather than a DataFrame).
type(X_train)

numpy.ndarray

In [13]:
# Wrap the imputed array in a DataFrame again so columns can be addressed by name.
# Reusing y_train's index keeps features and target aligned by row label; without it
# the new frame would get a fresh 0..n-1 index that no longer matches y_train.
X_train = pd.DataFrame(X_train, columns=all_features, index=y_train.index)

In [14]:
# Search all drop-combinations of the candidate features (sizes 0 to 2 with max_combos=3).
# Each combination triggers a full cross-validation of the model.
feature_selector = FeatureSelector(
    model=model,
    X_train=X_train,
    y_train=y_train,
    max_combos=3,
    features=features,
)
feature_selector.fit()

In [19]:
# Show the combination of features whose removal gave the lowest cross-validated RMSE.
# NOTE(review): this cell's execution count (19) is higher than those of the cells that
# follow it (17, 18); re-run the notebook top to bottom so the saved outputs are in order.
feature_selector.features_to_drop

['Vindkast', 'Globalstraling']

In [17]:
# Retrieve the selected features-to-drop together with X_train with those columns removed.
cols, new_X = feature_selector.get_best()

In [18]:
# Display the columns that remain after dropping the selected features.
new_X.columns

Index(['Solskinstid', 'Lufttemperatur', 'Vindstyrke', 'Lufttrykk',
       'Vindretning', 'Ukedag', 'Maaned', 'Aarstall', 'Klokkeslett',
       'Rod_dag'],
      dtype='object')