**DESCRIPTION:** This model runs a regression on all of the data over the given time period, treating the identifier, market cap, the factors, etc. as independent variables.
The regression is performed with scikit-learn: the data is split into training and test sets, the model is fit on the training set, and it is then evaluated on the test set.

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from time import time
from sklearn.feature_selection import SequentialFeatureSelector
import warnings
warnings.filterwarnings('ignore')

**Calculating the Model:** The function below fits, evaluates, and plots our model for a given DataFrame and list of predictor columns.

In [None]:
def calculate_model(data, predictor_list, random_state=None):
    """Fit, evaluate and plot a linear regression of ``target`` on the predictors.

    The function
      1. indexes the rows by the ``date`` column,
      2. holds out 30% of the rows as test data and fits a
         ``LinearRegression`` on the remaining rows,
      3. prints the r^2 score on the held-out rows and a 10-fold
         cross-validated RMSE computed on the training rows,
      4. fits a ``RidgeCV`` on all rows and hands it to
         ``feature_importance`` and ``seq_selection``,
      5. plots the held-out targets against their predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column, a ``target`` column and every column
        named in ``predictor_list``. The caller's frame is not modified.
    predictor_list : list of str
        Names of the columns used as independent variables.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default (``None``) keeps
        the split random on every call.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    # Index by date. The parameter is used (not a module-level frame) and the
    # result is rebound locally so the caller's DataFrame stays untouched.
    data = data.set_index(pd.DatetimeIndex(data['date']))

    predictors = predictor_list

    # Randomly hold out 30% of the rows for testing; the rest train the model
    train, test = train_test_split(data, test_size=0.3,
                                   random_state=random_state)

    # Ordinary least squares fit on the training rows
    model = LinearRegression()
    model.fit(train[predictors], train["target"])

    # Predictions for the held-out rows, aligned with the test index
    preds = pd.Series(model.predict(test[predictors]), index=test.index)

    # r^2 between the held-out targets and their predictions
    r = r2_score(test["target"], preds)

    combined = pd.concat({"target": test["target"], "Predictions": preds},
                         axis=1)

    # Response (y) and predictors (X) over the full data set, used for the
    # ridge-based feature analysis below
    y = data['target']
    X = data[predictor_list]

    # 10-fold cross validation on the training rows
    cv = KFold(n_splits=10, random_state=100, shuffle=True)

    # Score with the (negated) mean squared error so that the square root of
    # the mean absolute score really is a root mean squared error
    scores = cross_val_score(model, train[predictors], train["target"],
                             scoring='neg_mean_squared_error',
                             cv=cv, n_jobs=-1)
    rmse = np.sqrt(np.mean(np.absolute(scores)))

    # The lower the RMSE the better
    print("r^2 is = " + str(r))
    print("root mean squared error (RMSE) = " + str(rmse))

    # Feature names in the same order as the columns of X, so that the
    # selection masks computed on X line up with the names that are reported
    features = list(predictor_list)

    ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X, y)

    # Visualizing feature importance
    feature_importance(X, y, ridge)

    # Selecting features with sequential feature selection
    seq_selection(X, y, features, ridge)

    # Plots the original vs predicted values
    combined.plot()
    plt.title("Full Model")
    plt.show()
    

**Feature Importance of Coefficients using RidgeCV:**

In [None]:
# Simple helper function to go from list of True and False to the final selected values

def selecting(lst, features):
    """Return the feature names whose entry in a boolean mask is true.

    Parameters
    ----------
    lst : sequence of bool
        Selection mask, one flag per feature (for example the result of a
        selector's ``get_support()``).
    features : sequence of str
        Feature names, in the same order as the mask.

    Returns
    -------
    list of str
        The names at the positions where the mask is true. If the two
        sequences differ in length, only the positions present in both are
        considered.
    """
    # Pair flags with names instead of assuming a fixed count of 13, so that
    # shorter or longer masks are handled without an IndexError.
    return [name for flag, name in zip(lst, features) if flag]

In [None]:
def feature_importance(X, y, ridge):
    """Plot ridge coefficient magnitudes and report SelectFromModel's choice.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor matrix the ridge model was fitted on.
    y : pandas.Series
        Response values.
    ridge : fitted estimator exposing ``coef_``
        The absolute values of its coefficients are used as importances.

    Returns
    -------
    None
        A bar chart is drawn and the selected features are printed together
        with the time the selection took.
    """
    importance = np.abs(ridge.coef_)

    # Tick labels for the bar chart (line breaks keep long names readable).
    # NOTE(review): assumes the leading columns of X appear in this order -
    # confirm against the input data.
    features = ['sector','market \n cap', 'index \n membership', 'factor_1',
            'factor_2', 'factor_3', 'factor_4', 'factor_5', 'factor_6',
            'factor_7', 'factor_8', 'factor_9', 'factor_10']

    # Pair labels with coefficients rather than indexing a fixed 13 entries,
    # so a model with fewer coefficients does not raise an IndexError.
    # Values are scaled by 1000 for display only.
    scaled = {name: coef * 1000 for name, coef in zip(features, importance)}

    fig, ax = plt.subplots(1, 1, figsize=(16, 3), sharey=True)
    ax.bar(list(scaled.keys()), list(scaled.values()))
    fig.suptitle('Feature importances via coefficients (scaled by 1000)')

    # Selecting features based on importance: the threshold sits just above
    # the third-largest absolute coefficient.
    threshold = np.sort(importance)[-3] + 0.01
    tic = time()
    sfm = SelectFromModel(ridge, threshold=threshold).fit(X, y)
    toc = time()
    print("Features selected by SelectFromModel:",
          selecting(sfm.get_support(), features))
    print(f"Done in {toc - tic:.3f}s")

**Selecting features with Sequential Feature Selection:**

In [None]:
def seq_selection(X, y, features, ridge):
    """Run forward and backward sequential feature selection and print both.

    Each direction selects two features using ``ridge`` as the estimator.
    The chosen feature names and the time each search took are printed.

    Parameters
    ----------
    X : predictor matrix passed to the selector's ``fit``.
    y : response values passed to the selector's ``fit``.
    features : sequence of str
        Names used to translate the selection masks into feature names.
    ridge : estimator handed to ``SequentialFeatureSelector``.
    """
    # Forward search, timed
    forward_start = time()
    forward = SequentialFeatureSelector(
        ridge, n_features_to_select=2, direction="forward"
    )
    forward.fit(X, y)
    forward_end = time()

    # Backward search, timed
    backward_start = time()
    backward = SequentialFeatureSelector(
        ridge, n_features_to_select=2, direction="backward"
    )
    backward.fit(X, y)
    backward_end = time()

    # Report both results once both searches have finished
    forward_names = selecting(forward.get_support(), features)
    print("Features selected by forward sequential selection: ",
          forward_names)
    print(f"Done in {forward_end - forward_start:.3f}s")

    backward_names = selecting(backward.get_support(), features)
    print("Features selected by backward sequential selection: ",
          backward_names)
    print(f"Done in {backward_end - backward_start:.3f}s")

**Note on Path to File:** Below I have specified the path to the CSV containing the data. You do not need to change the path to run this notebook as-is; if you download the code and run it elsewhere, update the path so that it points to your copy of the CSV.

In [None]:
# Location of the CSV that holds the input data
path_to_file = "data/data.csv"

# Load the CSV and one-hot encode the identifier column so that each
# identifier enters the regression as its own dummy variable
df = pd.get_dummies(
    pd.read_csv(path_to_file),
    prefix='Identifier ',
    prefix_sep='=',
    columns=['identifier'],
)

# Predictor list: every column except the response and the date
cols = df.columns.tolist()
cols.remove('target')
cols.remove('date')

# Fit, evaluate and plot the regression on the full data set
calculate_model(df, cols)

print('\n Model Complete')