## Imports

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble

import pandas as pd
import numpy as np

import re, random

from datetime import datetime, timedelta
from scipy import stats

from dateutil.parser import parse 
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm

## Read data

In [None]:
# Load the regression dataset; the first CSV column is used as the row index.
data = pd.read_csv('Data/dataset_for_regressions_sample.csv', index_col=0)

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

## Regressions

In [None]:
# Column names used to assemble the regression formulas.
# They are kept as lists because the formula builder concatenates them with `+`.
dimensions = [
    'Politics',
    'Health',
    'Personal',
    'Travel',
    'Economic',
]
controls = [
    'CharacterCount',
    'PunctuationCount',
    'UpperCaseWordCount',
]
author = [
    'HealthExpert',
    'Influencer',
]

In [None]:
def get_regressions(novelty, response='RetweetCount'):
    """Build the four regression formula strings used in this notebook.

    The formulas are assembled from the module-level lists ``dimensions``,
    ``controls`` and ``author``.

    Parameters
    ----------
    novelty : str
        Name of the novelty column used in formulas 3 and 4.
    response : str, optional
        Name of the response column placed on the left-hand side of every
        formula. Defaults to ``'RetweetCount'``, so existing calls produce
        the same strings as before; passing another column avoids patching
        the result with ``str.replace``.

    Returns
    -------
    tuple of str
        ``(reg1, reg2, reg3, reg4)``:
        reg1 -- dimensions + controls;
        reg2 -- dimensions + author + controls + dimension:author interactions;
        reg3 -- novelty + controls + dimension:novelty interactions, with ``- 1``;
        reg4 -- author + novelty + author:novelty interactions + controls,
        with ``- 1``.
    """
    def build(*term_groups, intercept=True):
        # Flatten the groups first so an empty group cannot leave a
        # dangling " + " in the formula.
        rhs = ' + '.join(term for group in term_groups for term in group)
        suffix = '' if intercept else ' - 1'
        return f'{response} ~ {rhs}{suffix}'

    def by_dimension(variable):
        # One "<dimension>:<variable>" interaction term per dimension.
        return [f'{dimension}:{variable}' for dimension in dimensions]

    reg1 = build(dimensions, controls)
    reg2 = build(
        dimensions,
        author,
        controls,
        by_dimension('HealthExpert'),
        by_dimension('Influencer'),
    )
    reg3 = build([novelty], controls, by_dimension(novelty), intercept=False)
    reg4 = build(
        author,
        [novelty, f'Influencer:{novelty}', f'HealthExpert:{novelty}'],
        controls,
        intercept=False,
    )
    return reg1, reg2, reg3, reg4

novelty = 'Novelty'
reg1, reg2, reg3, reg4 = get_regressions(novelty)

# Registry of the four formulas, keyed 'reg1'..'reg4'. Each entry stores the
# RetweetCount formula ('equation') and the same formula with FavoriteCount
# as the response ('equation2').
regressions = {
    f'reg{position}': {
        'number': str(position),
        'equation': formula,
        'equation2': formula.replace('RetweetCount', 'FavoriteCount'),
    }
    for position, formula in enumerate((reg1, reg2, reg3, reg4), start=1)
}

In [None]:
# Settings for the baseline runs, plus an empty frame with the coefficient
# table layout (not filled anywhere in the visible cells).
novelty, response = 'Novelty', 'RetweetCount'
retweet_coefs = pd.DataFrame(
    columns=['index', 'lower', 'coef', 'upper', 'Regression', 'Novelty', 'y'],
)

In [None]:
regression = 'reg1'

# Fit the selected formula as a GLM with a Poisson family on the full dataset.
model = sm.GLM.from_formula(
    formula=regressions[regression]['equation'],
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

In [None]:
regression = 'reg2'

# Fit the selected formula as a GLM with a Poisson family on the full dataset.
model = sm.GLM.from_formula(
    formula=regressions[regression]['equation'],
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

In [None]:
regression = 'reg3'

# Fit the selected formula as a GLM with a Poisson family on the full dataset.
model = sm.GLM.from_formula(
    formula=regressions[regression]['equation'],
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

In [None]:
regression = 'reg4'

# Fit the selected formula as a GLM with a Poisson family on the full dataset.
model = sm.GLM.from_formula(
    formula=regressions[regression]['equation'],
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

### 3-day Novelty Regressions

In [None]:
# Switch to the 3-day novelty column and reset the (empty) coefficient table.
novelty = 'Novelty_3day'
retweet_coefs = pd.DataFrame(
    columns=['index', 'lower', 'coef', 'upper', 'Regression', 'Novelty', 'y'],
)

In [None]:
regression = 'reg3'

# Substitute the current novelty column into the stored formula. This is a
# plain substring replace, so it relies on `regressions` having been built
# with the bare 'Novelty' term.
equation = regressions[regression]['equation'].replace('Novelty', novelty)

model = sm.GLM.from_formula(
    formula=equation,
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

In [None]:
regression = 'reg4'

# Substitute the current novelty column into the stored formula. This is a
# plain substring replace, so it relies on `regressions` having been built
# with the bare 'Novelty' term.
equation = regressions[regression]['equation'].replace('Novelty', novelty)

model = sm.GLM.from_formula(
    formula=equation,
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

### 7-day Novelty Regressions

In [None]:
# Switch to the 7-day novelty column and reset the (empty) coefficient table.
novelty = 'Novelty_7day'
retweet_coefs = pd.DataFrame(
    columns=['index', 'lower', 'coef', 'upper', 'Regression', 'Novelty', 'y'],
)

In [None]:
regression = 'reg3'

# Substitute the current novelty column into the stored formula. This is a
# plain substring replace, so it relies on `regressions` having been built
# with the bare 'Novelty' term.
equation = regressions[regression]['equation'].replace('Novelty', novelty)

model = sm.GLM.from_formula(
    formula=equation,
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

In [None]:
regression = 'reg4'

# Substitute the current novelty column into the stored formula. This is a
# plain substring replace, so it relies on `regressions` having been built
# with the bare 'Novelty' term.
equation = regressions[regression]['equation'].replace('Novelty', novelty)

model = sm.GLM.from_formula(
    formula=equation,
    data=data,
    family=sm.families.Poisson(),
).fit()

model.summary()

## Regressions with RetweetCount as binary

In [None]:
# Binary response: 1 where RetweetCount is positive, 0 otherwise.
# A vectorised comparison replaces the per-row Python loop. Rows where the
# comparison is False (including missing counts) still map to 0.
data['is_retweeted'] = (data['RetweetCount'] > 0).astype(int)

In [None]:
novelty = 'Novelty'
response = 'is_retweeted'
# Refit every stored formula with the binary indicator as the response.
# NOTE(review): the 0/1 response is still fitted with a Poisson family;
# confirm this is intended.
for regression in regressions:
    binary_equation = regressions[regression]['equation'].replace('RetweetCount', response)
    model = sm.GLM.from_formula(
        formula=binary_equation,
        data=data,
        family=sm.families.Poisson(),
    ).fit()
    print(regression)
    print(model.summary())

## Quadratic relationship

### Regressions
* RetweetCount ~ Novelty
* RetweetCount ~ Novelty2 (where Novelty2 is Novelty squared)
* RetweetCount ~ Novelty + Novelty2

In [None]:
# Rescale Novelty in place to the range [-1, 1].
# feature_range is given as a tuple: scikit-learn documents it as
# `tuple (min, max)`, and releases that validate estimator parameters
# reject a list when the scaler is fitted.
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
data['Novelty'] = scaler.fit_transform(data[['Novelty']])

In [None]:
# Squared novelty term for the quadratic specifications.
data['Novelty2'] = data['Novelty'].mul(data['Novelty'])

In [None]:
# Poisson-family GLM with the linear novelty term only.
model = sm.GLM.from_formula(
    formula='RetweetCount ~ Novelty',
    data=data,
    family=sm.families.Poisson(),
).fit()
model.summary()

In [None]:
# Poisson-family GLM with the squared novelty term only.
model = sm.GLM.from_formula(
    formula='RetweetCount ~ Novelty2',
    data=data,
    family=sm.families.Poisson(),
).fit()
model.summary()

In [None]:
# Poisson-family GLM with both the linear and the squared novelty terms.
model = sm.GLM.from_formula(
    formula='RetweetCount ~ Novelty + Novelty2',
    data=data,
    family=sm.families.Poisson(),
).fit()
model.summary()

## Regressions with categorical Novelty

In [None]:
# Keep only rows with no missing value in any column.
data = data.dropna(axis=0, how='any')

In [None]:
# Keep rows whose Novelty lies between its 1st and 99th percentiles
# (both bounds inclusive).
novelty_low, novelty_high = np.quantile(data.Novelty, [0.01, 0.99])
data = data[data.Novelty.between(novelty_low, novelty_high)]

In [None]:
# Rescale Novelty with MinMaxScaler's default settings, then split the
# rescaled values into five equal-width bins with ordered labels.
minmax = preprocessing.MinMaxScaler()
data['Novelty'] = minmax.fit_transform(data[['Novelty']])

data['Novelty_cut'] = pd.cut(
    data['Novelty'],
    bins=5,
    labels=['Very low', 'Low', 'Average', 'High', 'Very high'],
)

In [None]:
# Summary statistics of RetweetCount for each novelty bin.
# `observed=False` is set explicitly. Grouping by a categorical column
# otherwise depends on a pandas default that has been deprecated and then
# changed, and with `observed=True` any empty bin would drop out of the table.
data.groupby('Novelty_cut', observed=False)[['RetweetCount']].describe()

In [None]:
def get_regressions(novelty, response='RetweetCount'):
    """Build the four regression formula strings used in this notebook.

    The formulas are assembled from the module-level lists ``dimensions``,
    ``controls`` and ``author``.

    Parameters
    ----------
    novelty : str
        Name of the novelty column used in formulas 3 and 4.
    response : str, optional
        Name of the response column placed on the left-hand side of every
        formula. Defaults to ``'RetweetCount'``, so existing calls produce
        the same strings as before; passing another column avoids patching
        the result with ``str.replace``.

    Returns
    -------
    tuple of str
        ``(reg1, reg2, reg3, reg4)``:
        reg1 -- dimensions + controls;
        reg2 -- dimensions + author + controls + dimension:author interactions;
        reg3 -- novelty + controls + dimension:novelty interactions, with ``- 1``;
        reg4 -- author + novelty + author:novelty interactions + controls,
        with ``- 1``.
    """
    def build(*term_groups, intercept=True):
        # Flatten the groups first so an empty group cannot leave a
        # dangling " + " in the formula.
        rhs = ' + '.join(term for group in term_groups for term in group)
        suffix = '' if intercept else ' - 1'
        return f'{response} ~ {rhs}{suffix}'

    def by_dimension(variable):
        # One "<dimension>:<variable>" interaction term per dimension.
        return [f'{dimension}:{variable}' for dimension in dimensions]

    reg1 = build(dimensions, controls)
    reg2 = build(
        dimensions,
        author,
        controls,
        by_dimension('HealthExpert'),
        by_dimension('Influencer'),
    )
    reg3 = build([novelty], controls, by_dimension(novelty), intercept=False)
    reg4 = build(
        author,
        [novelty, f'Influencer:{novelty}', f'HealthExpert:{novelty}'],
        controls,
        intercept=False,
    )
    return reg1, reg2, reg3, reg4

novelty = 'Novelty_cut'
reg1, reg2, reg3, reg4 = get_regressions(novelty)

# Rebuild the formula registry with the binned novelty column. Each entry
# stores the RetweetCount formula ('equation') and the same formula with
# FavoriteCount as the response ('equation2').
regressions = {
    f'reg{position}': {
        'number': str(position),
        'equation': formula,
        'equation2': formula.replace('RetweetCount', 'FavoriteCount'),
    }
    for position, formula in enumerate((reg1, reg2, reg3, reg4), start=1)
}

In [None]:
novelty = 'Novelty_cut'
response = 'RetweetCount'
# Only formulas 3 and 4 contain the novelty term, so only those are refitted.
for regression in ('reg3', 'reg4'):
    model = sm.GLM.from_formula(
        formula=regressions[regression]['equation'],
        data=data,
        family=sm.families.Poisson(),
    ).fit()
    print(model.summary())

In [None]:
novelty = 'Novelty_cut'
response = 'is_retweeted'
# Same two formulas as the RetweetCount run, with the binary indicator
# swapped in as the response.
for regression in ('reg3', 'reg4'):
    binary_equation = regressions[regression]['equation'].replace('RetweetCount', response)
    model = sm.GLM.from_formula(
        formula=binary_equation,
        data=data,
        family=sm.families.Poisson(),
    ).fit()
    print(model.summary())