In [81]:
import pandas as pd
from datetime import datetime
import seaborn as sns

import re
import numpy as np
from collections import Counter

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, precision_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, make_scorer
from sklearn.compose import TransformedTargetRegressor

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# from keras import Input, Model, optimizers, callbacks
# from keras.layers import Bidirectional, LSTM, Dense, Concatenate
# from keras import backend as K

In [93]:
df = pd.read_csv("../data/final_combind.csv")
df = df.drop(['Unnamed: 0'], axis=1)
# df = df[df['Year']>2019]
df

Unnamed: 0,Year,Quarter,Number of Workers,Number of Layoffs,revenue,costOfRevenue,grossProfit,grossProfitRatio,ResearchAndDevelopmentExpenses,GeneralAndAdministrativeExpenses,...,freeCashFlow,employee_count,percent_layoff,industry_labelled,new_cases,new_cases_smoothed,new_cases_per_million,new_deaths,new_deaths_smoothed,new_deaths_per_million
0,2005,4,18,1,1.658000e+09,9.070000e+08,7.510000e+08,0.452955,232000000.0,4.240000e+08,...,9.300000e+07,21000.0,0.000857,0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
1,2008,4,120,1,1.393000e+09,6.370000e+08,7.560000e+08,0.542714,181000000.0,4.410000e+08,...,-3.000000e+07,19600.0,0.006122,0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
2,2015,4,415,1,2.451000e+09,2.157000e+09,2.940000e+08,0.119951,15000000.0,0.000000e+00,...,-8.000000e+07,0.0,,1,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
3,2006,2,61,2,5.975000e+09,3.725000e+09,2.250000e+09,0.376569,0.0,2.449000e+09,...,6.810000e+08,118033.0,0.000517,2,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
4,2008,3,737,5,6.421000e+09,2.695000e+09,3.726000e+09,0.580283,0.0,1.633000e+09,...,-1.398000e+09,118033.0,0.006244,2,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1682,2019,1,49,1,9.462000e+08,4.465000e+08,4.997000e+08,0.528112,0.0,4.319000e+08,...,-4.600000e+06,4800.0,0.010208,576,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
1683,2020,2,461,1,4.906000e+08,3.145000e+08,1.761000e+08,0.358948,0.0,2.791000e+08,...,1.619000e+08,5100.0,0.090392,576,26741.428571,26039.981154,79.048879,1372.78022,1360.849286,4.058055
1684,2020,2,375,1,1.690300e+08,1.182500e+07,1.572050e+08,0.930042,53969000.0,2.640200e+07,...,6.512000e+06,3900.0,0.096154,577,26741.428571,26039.981154,79.048879,1372.78022,1360.849286,4.058055
1685,2010,1,172,1,8.909600e+07,6.334400e+07,2.575200e+07,0.289037,0.0,0.000000e+00,...,-1.275700e+07,1130.0,0.152212,578,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000


In [85]:
features = ['Year', 'grossProfit', 'otherExpenses', 'operatingExpenses', 'interestExpense', 'operatingIncome', 'totalOtherIncomeExpensesNet', 'incomeBeforeTax', 'incomeBeforeTaxRatio', 'incomeTaxExpense', 'netIncome', 'EPS', 'EPSDiluted', 'weightedAverageShsOut', 'weightedAverageShsOutDil', 'shortTermInvestments', 'cashAndShortTermInvestments', 'intangibleAssets', 'longTermInvestments', 'taxAssets', 'otherNonCurrentAssets', 'totalNonCurrentAssets', 'otherAssets', 'shortTermDebt', 'taxPayables', 'deferredRevenue', 'deferredRevenueNonCurrent', 'deferrredTaxLiabilitiesNonCurrent', 'otherNonCurrentLiabilities', 'totalNonCurrentLiabilities', 'commonStock', 'retainedEarnings', 'accumulatedOtherComprehensiveIncomeLoss', 'totalStockholdersEquity', 'totalInvestments', 'netDebt', 'netIncome_cash-flow-statement', 'depreciationAndAmortization_cash-flow-statement', 'deferredIncomeTax', 'stockBasedCompensation', 'changeInWorkingCapital', 'inventory_cash-flow-statement', 'otherNonCashItems', 'investmentsInPropertyPlantAndEquipment', 'acquisitionsNet', 'purchasesOfInvestments', 'salesMaturitiesOfInvestments', 'otherInvestingActivites', 'netCashUsedForInvestingActivites', 'debtRepayment', 'commonStockIssued', 'dividendsPaid', 'netCashUsedProvidedByFinancingActivities', 'effectOfForexChangesOnCash', 'netChangeInCash', 'capitalExpenditure', 'freeCashFlow']
X = df[features]
y = df['Number of Workers']

In [86]:
def do_regression_grid_search(X, y, model, param_grid, scoring, refit):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.2)
    wrapped_model = TransformedTargetRegressor(regressor=model, transformer=StandardScaler())

    gs = GridSearchCV(
        wrapped_model,
        param_grid=param_grid,
        scoring=scoring,
        refit=refit,
        n_jobs=-1,
        return_train_score=True,
        cv=5
    )
    gs.fit(X_train, y_train)

    y_test_hat = gs.predict(X_test)
    # The mean squared error
    print("MSE: %.2f" % mean_squared_error(y_test, y_test_hat))
    # The coefficient of determination: 1 is perfect prediction
    print("R2: %.2f" % r2_score(y_test, y_test_hat))
    print("MAPE: %.2f" % mean_absolute_percentage_error(y_test, y_test_hat))

In [87]:
scoring = {"mse": make_scorer(mean_squared_error, greater_is_better=False), "r2": make_scorer(r2_score)}
refit = "r2"

In [88]:
dtree_regr = DecisionTreeRegressor(random_state=42)

param_grid = {
    "regressor__regressor__min_samples_split": range(2, 101, 20),
    "regressor__regressor__max_depth": [5, 10, 15],
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', dtree_regr)
])

do_regression_grid_search(X, y, pipeline, param_grid=param_grid, scoring=scoring, refit=refit)

MSE: 264545.29
R2: -0.47
MAPE: 5.47


In [92]:
rf_regr = RandomForestRegressor(random_state=42)

param_grid = {
    "regressor__regressor__n_estimators": range(2, 101, 20),
    "regressor__regressor__max_depth": [5, 10, 15],
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', rf_regr)
])

do_regression_grid_search(X, y, pipeline, param_grid=param_grid, scoring=scoring, refit=refit)

MSE: 293525.90
R2: -0.67
MAPE: 9.17
