In [1]:
# data saving imports
import pickle
import os

# time tracking
from datetime import datetime as dt

# multiprocessing
from multiprocessing import Pool

import statsmodels.formula.api as smf
import statsmodels.tools.sm_exceptions as smExceptions

# custom imports
from regression_class import RedditRegression as RR
from regression_class import TimestampClass as tc

import warnings
from numpy.linalg import LinAlgError

In [2]:
TEST_INFILE = "test_data_4_days.p"
test_data = pickle.load(open(TEST_INFILE, 'rb'))
regression_df = test_data["regression_data"]
thread_df = test_data["all_data"]

In [3]:
X_COLS = [
    "sentiment_sign",
    "sentiment_magnitude",
    "time_in_secs",
    "num_dayofweek",
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "log_author_all_activity_count",
]
quantiles = [0.25, 0.5, 0.75]
thresholds2 = {
    "author_all_activity_count": 2,
    "thread_size": 2,
}
thresholds1 = {
    "author_all_activity_count": 2,
}

regression_params = {
    'regression_data': regression_df['books'],
    'thread_data': thread_df['books'],
    'collection_window': 1,
    'model_window': 2,
    'validation_window': 1,
    'FSS': True,
    'x_cols': X_COLS,
    'scale': True,
}

In [4]:
regression_types = ['logistic', 'linear', 'mnlogit']
to_vary = {
    'name': [f'books_{x}' for x in regression_types],
    'regression_type': regression_types,
    'y_col': ['success', 'thread_size', 'thread_size'],
    'metrics': [['auc'], ['r2'], ['mnlogit_accuracy', 'mnlogit_aucs', "mnlogit_mean_auc"]],
    'thresholds': [thresholds1, thresholds2, thresholds2],
    'quantiles': [[], [], quantiles],
}

regressions = {}
for i, regression_type in enumerate(regression_types):
    in_params = regression_params.copy()
    for key in to_vary:
        in_params[key] = to_vary[key][i]
    regressions[regression_type] = RR(in_params)

In [5]:
testreg = regressions['logistic']

In [6]:
testreg.main()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data.drop(labels=to_drop, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data.drop(labels=to_drop, axis=1, inplace=True)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Running FSS
Model 1
Optimization terminated successfully.
         Current function value: 0.585670
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.585670
         Iterations 5
Model 2
         Current function value: 0.440824
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.440824
         Iterations: 27
         Function evaluations: 28
         Gradient evaluations: 28
Optimization terminated successfully.
         Current function value: 0.440824
         Iterations: 27
         Function evaluations: 28
         Gradient evaluations: 28
Model 3
         Current function value: 0.401594
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.401594
         Iterations: 32
         Function evaluations: 33
         Gradient evaluations: 33
Optimization terminated successfully.
         Current function value: 0.401594
         Iterations: 32
        

In [7]:
testreg.get_cal_val_data()
testreg.sm_modstrings = testreg.run_FSS()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data.drop(labels=to_drop, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data.drop(labels=to_drop, axis=1, inplace=True)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


In [8]:
model_results = {}
param_dict = {}
testreg.smf_models = {}

In [9]:
smf_model = (
            getattr(smf, testreg.SMF_FUNCTIONS[testreg.regression_params["regression_type"]])(
                testreg.sm_modstrings[8], data=testreg.__model_data__["cal"]
            )
        )

In [9]:
smf_model = testreg.run_regression(5)

         Current function value: 0.212076
         Iterations: 35
         Current function value: 0.212076
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36
Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 37
         Function evaluations: 38
         Gradient evaluations: 38
         Current function value: 0.212223
         Iterations: 100
         Function evaluations: 316
         Gradient evaluations: 316
         Current function value: 0.212113
         Iterations: 150
         Function evaluations: 483
         Gradient evaluations: 483
Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 167
         Function evaluations: 533
         Gradient evaluations: 533
Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 167
         Function evaluations: 533
         Gradient evaluations: 533


In [10]:
smf_model.cov_params()

Unnamed: 0,Intercept,sentiment_sign,sentiment_magnitude,time_in_secs,num_dayofweek,mean_author_sentiment_magnitude
Intercept,1688048000000000.0,-908940600000000.0,4136735000000000.0,1.364283,50241100.0,51556810.0
sentiment_sign,-908940600000000.0,489425200000000.0,-2227453000000000.0,0.738022,33299760.0,34244050.0
sentiment_magnitude,4136735000000000.0,-2227453000000000.0,1.01375e+16,3.923707,143409500.0,147036900.0
time_in_secs,1.364283,0.7380225,3.923707,0.781838,3.035245,2.537926
num_dayofweek,50241100.0,33299760.0,143409500.0,3.035245,95345480.0,97936850.0
mean_author_sentiment_magnitude,51556810.0,34244050.0,147036900.0,2.537926,97936850.0,100675600.0


In [11]:
testreg.get_model_metrics_from_smf_mod(smf_model)

Unnamed: 0,param,stderr,pvalue
Intercept,-6.16667,41085860.0,1.0
sentiment_sign,58.070446,22122960.0,0.999998
sentiment_magnitude,14.111301,100685100.0,1.0
time_in_secs,-0.036586,0.8842163,0.966995
num_dayofweek,77.337833,9764.501,0.993681
mean_author_sentiment_magnitude,73.723462,10033.73,0.994138


In [10]:
smf_model = testreg.fit_smf_model(smf_model)

         Current function value: inf
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 32
         Function evaluations: 33
         Gradient evaluations: 33
         Current function value: 0.212093
         Iterations: 35
         Function evaluations: 98
         Gradient evaluations: 98
         Current function value: 0.212093
         Iterations: 35
         Function evaluations: 98
         Gradient evaluations: 98
         Current function value: 0.212093
         Iterations: 35
         Function evaluations: 98
         Gradient evaluations: 98
         Current function value: 0.212093
         Iterations: 35
         Function evaluations: 98
         Gradient evaluations: 98
         Current function value: 0.212093
         Iterations: 35
         Function evaluations: 98
         Gradient evaluations: 98
         Current function value: 0.212093
         Iterations: 35
         Function evaluations: 



In [11]:
model_results = {}
model_results["model"] = testreg.sm_modstrings[5]


In [16]:
run_again = True
run_counter = 0
max_runs = 10
kwargs_dict = {}

while run_again==True and run_counter<max_runs:
    run_again, kwargs_dict = testreg.manage_convergence_warnings(smf_model.fit, **kwargs_dict)
    print(run_again, kwargs_dict)
    run_counter += 1

         Current function value: 0.212076
         Iterations: 35
True {'method': 'bfgs'}
         Current function value: 0.212076
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36
True {'method': 'bfgs', 'maxiter': 100}
Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 37
         Function evaluations: 38
         Gradient evaluations: 38


TypeError: cannot unpack non-iterable NoneType object

In [11]:
smf_model = smf.logit(testreg.sm_modstrings[8], data = testreg.__model_data__['cal'])

In [24]:
testreg.manage_convergence_warnings(smf_model.fit, method='cg')

         Current function value: 0.212093
         Iterations: 35
         Function evaluations: 98
         Gradient evaluations: 98


(True, {'method': 'cg'})

In [26]:
smf_model.fit(method='bfgs', maxiter=100)

Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 32
         Function evaluations: 33
         Gradient evaluations: 33




<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f968af18df0>

In [21]:
reg_params = smf_model.fit_regularized().params

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.2120758210671058
            Iterations: 48
            Function evaluations: 49
            Gradient evaluations: 48


In [None]:
 start_params=reg_params

In [107]:
with warnings.catch_warnings(record=True) as w:
    smf_model.fit(maxiter=55)

         Current function value: 0.212076
         Iterations: 55


In [160]:
smf_model.fit(maxiter=55)

         Current function value: 0.212076
         Iterations: 55


  return 1/(1+np.exp(-X))


<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f0a3d7f7940>

In [117]:
maxiter = 35

In [157]:
def model_to_run(fit_function, **kwargs):
    run_again=False
    with warnings.catch_warnings(record=True) as w:
        try:
            fit_function(**kwargs)
        except LinAlgError:
            kwargs['method'] = 'bfgs'
            run_again=True
            return run_again, kwargs
    if len(w)>0:
        for w_i in w:
            if w_i.category == RuntimeWarning:
                kwargs['method'] = 'bfgs'
                return run_again, kwargs
            elif w_i.category == smExceptions.ConvergenceWarning:
                if 'maxiter' in kwargs:
                    kwargs['maxiter']*=2
                else:
                    kwargs['maxiter'] = 100
                return run_again, kwargs

    
    

In [158]:
run_again = True
kwargs_dict = {}
while run_again:
    run_again, kwargs_dict = model_to_run(smf_model.fit, **kwargs_dict)

smf_model.fit(**kwargs_dict)

         Current function value: 0.212076
         Iterations: 35
         Current function value: 0.212076
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36
Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 37
         Function evaluations: 38
         Gradient evaluations: 38




<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f0a3e497ee0>

In [155]:
smf_model.fit(maxiter=100, method='bfgs')

Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 37
         Function evaluations: 38
         Gradient evaluations: 38




<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f0a3d7f7c70>

In [154]:
kwargs_dict

{'maxiter': 100, 'method': 'bfgs'}

In [133]:
model_to_run(smf_model.fit, maxiter=60, method='bfgs')

Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 37
         Function evaluations: 38
         Gradient evaluations: 38


In [113]:
for w_i in w:
    print(w_i.category)
    print(w_i.category == RuntimeWarning)
    print(w_i.category == smExceptions.ConvergenceWarning)

True
False
False
True


In [98]:

try:
    modfit = smf_model.fit()
except LinAlgError:
    modfit = smf_model.fit(maxiter=55)
except RuntimeWarning:
    modfit = smf_model

         Current function value: 0.212076
         Iterations: 35
         Current function value: 0.212076
         Iterations: 55


  return 1/(1+np.exp(-X))


In [101]:
smf_model.fit(method='bfgs', maxiter=55)

Optimization terminated successfully.
         Current function value: 0.212076
         Iterations: 37
         Function evaluations: 38
         Gradient evaluations: 38




<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f0a3d7f43a0>

In [71]:
mod_key = 5
testreg.smf_models[mod_key], model_results[mod_key] = testreg.run_regression(
                mod_key
            )

         Current function value: 0.212076
         Iterations: 35


  return 1/(1+np.exp(-X))


LinAlgError: Singular matrix

In [20]:
testreg.run_models()

Model 1
Optimization terminated successfully.
         Current function value: 0.585670
         Iterations 5
Model 2
         Current function value: 0.440824
         Iterations: 35
Model 3
         Current function value: 0.407509
         Iterations: 35
Model 4
         Current function value: 0.407507
         Iterations: 35
Model 5
         Current function value: inf
         Iterations: 35


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


LinAlgError: Singular matrix

In [37]:
testreg.calc_author_thread_counts()
testreg.get_regression_model_data()
testreg.get_regression_model_data(calval='val')

In [39]:
testreg.perform_scaling()

In [23]:
# start running
for key in regressions:
    print(f"Running {key}")
    regressions[key].main()

Running logistic


TypeError: The DType <class 'numpy.dtype[datetime64]'> could not be promoted by <class 'numpy.dtype[float64]'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>)

In [25]:
regressions['logistic']

<regression_class.RedditRegression at 0x7f9e1e8b4520>

In [3]:
# outfiles
OUTDIR = "regression_test_outputs"
METRICS_OUTFILE = "regression_metrics"

# subreddits to look at
subreddits = ["books", "crypto", "conspiracy", "politics"]

# regression types to run
regression_types = ["logistic", "linear", "mnlogit"]

# get outdir names for all regression types
out_subdirs = {}
for regtype in regression_types:
    out_subdirs[regtype] = f"{OUTDIR}/{regtype}"

# make out params dict to save spreadsheets
out_params_dict = {}

In [8]:
def run_regression_type(regression_type):
    print(dt.now())
    print(f"\n    ## Regression type: {regression_type}##")
    input_params = regression_params[regression_type].copy()

    # place to store logregs
    subreddit_logregs = {}
    try:
        for subreddit in subreddits:
            print(f"#{subreddit}#")
            input_params["name"] = subreddit
            input_params["regression_data"] = regression_df[subreddit]
            input_params["thread_data"] = thread_df[subreddit]

            subreddit_logregs[subreddit] = RR(regression_params=input_params)

            subreddit_logregs[subreddit].main()
    except TypeError:
        return subreddit_logregs

    # dump pickle results
    outstring = f"{out_subdirs[regression_type]}/{METRICS_OUTFILE}.p"
    print(f"\n\n\n   DUMPING RESULTS TO \n{outstring}\n\n\n")
    pickle.dump(
        subreddit_logregs, open(outstring, "wb",),
    )

In [5]:
# make out directories
for outdirname in ([OUTDIR] + list(out_subdirs.values())):
    if not os.path.isdir(outdirname):
        os.mkdir(outdirname)

# read in files
#print(dt.now())
#print("reading in input files")
# TESTING
#regression_df = pickle.load(open(REGRESSION_INFILE, "rb"))
#thread_df = pickle.load(open(THREAD_INFILE, "rb"))

print("Creating parameter dictionaries")
# fixed regression params
X_COLS = [
    "sentiment_sign",
    "sentiment_magnitude",
    "hour",
    "time_in_secs",
    "num_dayofweek",
    "activity_ratio",
    "mean_author_sentiment_sign",
    "mean_author_sentiment_magnitude",
    "author_all_activity_count",
]

fixed_regression_params = {
    "collection_window": 7,
    "model_window": 14,
    "validation_window": 7,
    "FSS": True,
    "x_cols": X_COLS,
    "scale": True,
}

# variable regression params
quantiles = [0.25, 0.5, 0.75]
thresholds2 = {
    "author_all_activity_count": 2,
    "thread_size": 2,
}
thresholds1 = {
    "author_all_activity_count": 2,
}

to_vary = {
    "regression_type": regression_types,
    "y_col": ["success", "thread_size", "thread_size"],
    "metrics": [
        ["auc"],
        ["r2"],
        ["mnlogit_accuracy", "mnlogit_aucs", "mnlogit_mean_auc"],
    ],
    "thresholds": [thresholds1, thresholds2, thresholds2],
    "quantiles": [[], [], quantiles],
}

regression_params = {}
for i, regtype in enumerate(regression_types):
    regression_params[regtype] = fixed_regression_params.copy()
    for key in to_vary:
        regression_params[regtype][key] = to_vary[key][i]

Creating parameter dictionaries


In [13]:
subreddit_logregs = {}
for key in regression_types:
    subreddit_logregs[key] = run_regression_type(key)

2024-03-11 14:00:19.658496

    ## Regression type: logistic##
#books#
2024-03-11 14:00:20.910123

    ## Regression type: linear##
#books#
2024-03-11 14:00:22.071844

    ## Regression type: mnlogit##
#books#


In [18]:
model_data = subreddit_logregs['logistic']['books'].model_data

In [20]:
model_data

{'cal': Empty DataFrame
 Columns: [thread_id, thread_size, authors, timestamp, author, score, subject_sentiment_score, sentiment_sign, sentiment_magnitude, success, author_all_activity_count, activity_ratio, mean_author_sentiment_sign, mean_author_sentiment_magnitude, time_in_secs, num_dayofweek, hour]
 Index: [],
 'val': Empty DataFrame
 Columns: [thread_id, thread_size, authors, timestamp, author, score, subject_sentiment_score, sentiment_sign, sentiment_magnitude, success, author_all_activity_count, activity_ratio, mean_author_sentiment_sign, mean_author_sentiment_magnitude, time_in_secs, num_dayofweek, hour]
 Index: []}

In [None]:
x_data = self.model_data[calval][self.regression_params["x_cols"]]