In [171]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [131]:
#First step, importing all the data
gucci = pd.read_csv('GucciData.csv')
patek = pd.read_csv('PatekData.csv')
versace = pd.read_csv('VersaceData.csv')
belanciaga = pd.read_csv('BelanciagaData.csv')
saintLaurent = pd.read_csv('SaintLaurentData.csv')
google = pd.read_csv('ResidualGoogleSet.csv')

In [132]:
frames = [gucci, patek, versace, belanciaga, saintLaurent]

In [133]:
idx = pd.date_range('2017-01-01', '2018-01-09')
google = google.set_index(pd.DatetimeIndex(google['Date']))
patek.index.name = 'Date'
google = google.drop(['Date'], axis = 1)
google = google.reindex(idx)
google.head()

Unnamed: 0,gucci,patek,versace,balenciaga,saint laurent
2017-01-01,,,,,
2017-01-02,,,,,
2017-01-03,-0.073965,0.095849,-0.012702,0.011997,-0.152143
2017-01-04,-0.017412,-0.139901,0.012027,-0.032402,-0.053563
2017-01-05,0.049534,0.115266,-0.002468,0.025039,-0.032566


We want to finalize our independent and dependent variables as follows:

Dependent Variable: Residuals from Arima Modeling

Independent Variables:
1. 2-Lags of the Arima Model (Generated from auto and partial-auto correlation function)
2. Total_Streams - lagged
3. %Streams - lagged
4. Trailing 5-day Rank Momentum - Not Lagged
5. Top Categorical Variables (LDA) - lagged

We will run the model all together at first. If we find that the statistical interpretations are not there, we will then attempt to perform a pseudo-granger causality maneuver

In [134]:
gucci = gucci.set_index(pd.DatetimeIndex(gucci['Unnamed: 0']))
gucci.index.name = 'Date'
patek = patek.set_index(pd.DatetimeIndex(patek['Unnamed: 0']))
patek.index.name = 'Date'
belanciaga = belanciaga.set_index(pd.DatetimeIndex(belanciaga['Unnamed: 0']))
belanciaga.index.name = 'Date'
saintLaurent = saintLaurent.set_index(pd.DatetimeIndex(saintLaurent['Unnamed: 0']))
saintLaurent.index.name = 'Date'
versace = versace.set_index(pd.DatetimeIndex(versace['Unnamed: 0']))
versace.index.name = 'Date'
gucci = gucci.drop(['Best_Rank','Best_Rank_Moment','Unnamed: 0'], axis = 1)
patek = patek.drop(['Best_Rank','Best_Rank_Moment','Unnamed: 0'], axis = 1)
versace = versace.drop(['Best_Rank','Best_Rank_Moment','Unnamed: 0'], axis = 1)
belanciaga = belanciaga.drop(['Best_Rank','Best_Rank_Moment','Unnamed: 0'], axis = 1)
saintLaurent = saintLaurent.drop(['Best_Rank','Best_Rank_Moment','Unnamed: 0'], axis = 1)

In [135]:
idx = pd.date_range('2017-01-01', '2018-01-09')
gucci = gucci.reindex(idx, fill_value = -1)
patek = patek.reindex(idx, fill_value = -1)
versace = versace.reindex(idx, fill_value = -1)
balenciaga = belanciaga.reindex(idx, fill_value = -1)
saintLaurent = saintLaurent.reindex(idx, fill_value = -1)

In [136]:
#Easiest to Append Dependent Collumn to end of current dataframe selection
gucci['dependent'] = google['gucci']
patek['dependent'] = google['patek']
versace['dependent'] = google['versace']
balenciaga['dependent'] = google['balenciaga']
saintLaurent['dependent'] = google['saint laurent']

In [137]:
#dropna
gucci = gucci.dropna()
patek = patek.dropna()
versace = versace.dropna()
balenciaga = balenciaga.dropna()
saintLaurent = saintLaurent.dropna()

In [138]:
#still will have -1 values where no songs are present
#can be easily identified by Top_Category, where -1
#only need to replace Tot_Streams with 0 for these rows

def replace_Tot_Streams(df):
    df.loc[df['Tot_Streams'] < 0, 'Tot_Streams'] = 0
    return df
gucci = replace_Tot_Streams(gucci)
patek = replace_Tot_Streams(patek)
versace = replace_Tot_Streams(versace)
balenciaga = replace_Tot_Streams(balenciaga)
saintLaurent = replace_Tot_Streams(saintLaurent)

In [139]:
#Cannot Forget about the -1 fill_value (this is important)
#Must drop NaN's
#Need to create Categorical variable DF

def add_categoricals(df):
    for i in df.index:
        cat = int(df.loc[i]['Top_Category'])
        df.at[i,'Cat_' + str(cat)] = 1
    return df

In [140]:
gucci = add_categoricals(gucci).drop(['Top_Category','Cat_-1'], axis = 1)
patek = add_categoricals(patek).drop(['Top_Category','Cat_-1'], axis = 1)
versace = add_categoricals(versace).drop(['Top_Category','Cat_-1'], axis = 1)
balenciaga = add_categoricals(balenciaga).drop(['Top_Category','Cat_-1'], axis = 1)
saintLaurent = add_categoricals(saintLaurent).drop(['Top_Category','Cat_-1'], axis = 1)

In [151]:
#Since only the categorical columns have NA now
#Filling
gucci.fillna(0, inplace = True)
patek.fillna(0, inplace = True)
versace.fillna(0, inplace = True)
balenciaga.fillna(0, inplace = True)
saintLaurent.fillna(0, inplace = True)

In [188]:
gucci.corr()

Unnamed: 0,Tot_Streams,%Stream,Best_Rank_Moment_5_Day,dependent,Cat_0,Cat_2
Tot_Streams,1.0,0.430024,0.014233,0.038934,-0.014888,0.157411
%Stream,0.430024,1.0,0.310077,0.014683,0.386592,0.028745
Best_Rank_Moment_5_Day,0.014233,0.310077,1.0,0.024551,-0.015482,0.165541
dependent,0.038934,0.014683,0.024551,1.0,0.025349,-0.021805
Cat_0,-0.014888,0.386592,-0.015482,0.025349,1.0,-0.909027
Cat_2,0.157411,0.028745,0.165541,-0.021805,-0.909027,1.0


In [179]:
#Time to fill Categoricals with 0's
#Let's do these Regressions!
X1 = gucci.iloc[0:-1,:]
X2 = patek.iloc[0:-1,:]
X3 = versace.iloc[0:-1,:]
X4 = balenciaga.iloc[0:-1,:]
X5 = saintLaurent.iloc[0:-1,:]
Y1 = gucci['dependent'].shift(-1)[:-1]
Y2 = patek['dependent'].shift(-1)[:-1]
Y3 = versace['dependent'].shift(-1)[:-1]
Y4 = balenciaga['dependent'].shift(-1)[:-1]
Y5 = saintLaurent['dependent'].shift(-1)[:-1]

In [198]:
models = {}
X = [(X1,Y1),(X2,Y2),(X3,Y3),(X4,Y4),(X5,Y5)]
Names = ['gucci','patek','versace','balenciaga','saintLaurent']
i = 0
for key in X:
    models[Names[i]] = sm.OLS(key[1], key[0]).fit()
    i+=1

In [211]:
for key in models.keys():
    print(key)
    print(models[key].summary())

gucci
                            OLS Regression Results                            
Dep. Variable:              dependent   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.014
Method:                 Least Squares   F-statistic:                    0.1584
Date:                Wed, 15 May 2019   Prob (F-statistic):              0.987
Time:                        05:02:03   Log-Likelihood:                 230.81
No. Observations:                 371   AIC:                            -449.6
Df Residuals:                     365   BIC:                            -426.1
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Tot_Streams           

In [182]:
from sklearn import linear_model

In [184]:
lm = linear_model.LinearRegression()
model = lm.fit(X1,Y1)

In [186]:
model.score()

TypeError: score() missing 2 required positional arguments: 'X' and 'y'