In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
%%time

# Third-party imports
import os.path as op
import pandas as pd
import great_expectations as ge
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import math

import matplotlib.pyplot as plt


# Project imports
from ta_lib.core.api import display_as_tabs, initialize_environment

# Initialization
initialize_environment(debug=False, hide_warnings=True)

CPU times: total: 31.2 ms
Wall time: 81.9 ms


In [5]:
from ta_lib.core.api import create_context, list_datasets, load_dataset

In [6]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)

In [7]:
list_datasets(context)

['/raw/google_search_data',
 '/raw/product_manufacturer_list',
 '/raw/sales_data',
 '/raw/social_media_data',
 '/raw/Theme_list',
 '/raw/Theme_product_list']

In [8]:
# load datasets
google_search_data = load_dataset(context, 'raw/google_search_data')
product_manufacturer_list = load_dataset(context, 'raw/product_manufacturer_list')
sales_data = load_dataset(context, 'raw/sales_data')
social_media_data = load_dataset(context, 'raw/social_media_data')
Theme_list = load_dataset(context, 'raw/Theme_list')
Theme_product_list = load_dataset(context, 'raw/Theme_product_list')

In [9]:
dataset_name = ['google_search_data', 'product_manufacturer_list', 'sales_data', 'social_media_data', 'Theme_list', 'Theme_product_list']
dataset = [google_search_data, product_manufacturer_list, sales_data, social_media_data, Theme_list, Theme_product_list]

In [10]:
social_media_data['published_date']  = social_media_data['published_date'].str[-4:]
sales_data['system_calendar_key_N'] = sales_data['system_calendar_key_N'].apply(str)
sales_data['system_calendar_key_N'] = sales_data['system_calendar_key_N'].str[0:4]

In [11]:
social_media_data

Unnamed: 0,Theme Id,published_date,total_post
0,148.0,2015,76
1,148.0,2015,31
2,148.0,2015,65
3,148.0,2015,88
4,148.0,2015,85
...,...,...,...
533385,876.0,2019,4658
533386,876.0,2019,3731
533387,876.0,2019,2336
533388,876.0,2019,1374


In [12]:
google_search_data

Unnamed: 0,date,platform,searchVolume,Claim_ID,week_number,year_new
0,05-01-2014,google,349,916,1,2014
1,06-01-2014,google,349,916,2,2014
2,07-01-2014,google,697,916,2,2014
3,10-01-2014,google,349,916,2,2014
4,20-01-2014,google,697,916,4,2014
...,...,...,...,...,...,...
181560,05-09-2019,walmart,125,980,36,2019
181561,22-09-2019,walmart,84,980,38,2019
181562,26-09-2019,walmart,42,980,39,2019
181563,15-07-2019,walmart,42,622,29,2019


In [13]:
sales_data

Unnamed: 0,system_calendar_key_N,product_id,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016,1,13927.0,934,18680
1,2016,3,10289.0,1592,28646
2,2016,4,357.0,22,440
3,2016,6,23113.0,2027,81088
4,2016,7,23177.0,3231,58164
...,...,...,...,...,...
4526177,2018,47536,8.0,2,3
4526178,2018,47539,391.0,39,68
4526179,2018,47543,105.0,59,48
4526180,2018,47544,3720.0,1246,4361


In [14]:
len(sales_data['product_id'].unique())

42616

In [15]:
m1 = social_media_data.groupby(['Theme Id','published_date'])['total_post'].sum().reset_index()
print(m1)

     Theme Id published_date  total_post
0         8.0           2015       49315
1         8.0           2016       91243
2         8.0           2017      120414
3         8.0           2018      168008
4         8.0           2019      106923
..        ...            ...         ...
960     999.0           2015        4223
961     999.0           2016        8759
962     999.0           2017       11782
963     999.0           2018       20926
964     999.0           2019       13873

[965 rows x 3 columns]


In [16]:
m2 = google_search_data.groupby(['Claim_ID','year_new'])['searchVolume'].sum().reset_index()
print(m2)

     Claim_ID  year_new  searchVolume
0           8      2014       2147362
1           8      2015       2390107
2           8      2016       2435585
3           8      2017       1962169
4           8      2018       3431254
..        ...       ...           ...
827       999      2015         63149
828       999      2016         69397
829       999      2017        103438
830       999      2018        102876
831       999      2019        303681

[832 rows x 3 columns]


In [17]:
m3 = sales_data.groupby(['product_id', 'system_calendar_key_N'])['sales_dollars_value'].sum().reset_index()
print(m3)

        product_id system_calendar_key_N  sales_dollars_value
0                1                  2016             615747.0
1                1                  2017             681909.0
2                1                  2018             810009.0
3                1                  2019             295866.0
4                2                  2016               3825.0
...            ...                   ...                  ...
126687       57313                  2019               7797.0
126688       57314                  2019              13305.0
126689       57315                  2019               1492.0
126690       57316                  2019               3466.0
126691       57317                  2019               8661.0

[126692 rows x 3 columns]


In [18]:
m4 = pd.merge(m3,Theme_product_list, how = 'inner', left_on='product_id', right_on='PRODUCT_ID') 
m4

Unnamed: 0,product_id,system_calendar_key_N,sales_dollars_value,PRODUCT_ID,CLAIM_ID
0,1,2016,615747.0,1,0
1,1,2017,681909.0,1,0
2,1,2018,810009.0,1,0
3,1,2019,295866.0,1,0
4,2,2016,3825.0,2,0
...,...,...,...,...,...
213507,57314,2019,13305.0,57314,40
213508,57314,2019,13305.0,57314,8
213509,57315,2019,1492.0,57315,8
213510,57316,2019,3466.0,57316,8


In [19]:
m4 = m4.groupby(['CLAIM_ID', 'system_calendar_key_N'])['sales_dollars_value'].sum().reset_index()
print(m4)

     CLAIM_ID system_calendar_key_N  sales_dollars_value
0           0                  2016         1.585879e+10
1           0                  2017         1.548783e+10
2           0                  2018         1.534391e+10
3           0                  2019         1.207660e+10
4           8                  2016         4.263884e+09
..        ...                   ...                  ...
183       437                  2019         4.171012e+07
184       438                  2016         7.572860e+08
185       438                  2017         7.520031e+08
186       438                  2018         7.363435e+08
187       438                  2019         5.605742e+08

[188 rows x 3 columns]


In [20]:
m2['year_new'] = m2['year_new'].apply(str)

In [21]:
final_m = pd.merge(m1,m2, how = 'inner', left_on=['Theme Id','published_date'], right_on=['Claim_ID','year_new']) 
print(final_m)

     Theme Id published_date  total_post  Claim_ID year_new  searchVolume
0         8.0           2015       49315         8     2015       2390107
1         8.0           2016       91243         8     2016       2435585
2         8.0           2017      120414         8     2017       1962169
3         8.0           2018      168008         8     2018       3431254
4         8.0           2019      106923         8     2019       2544527
..        ...            ...         ...       ...      ...           ...
705     999.0           2015        4223       999     2015         63149
706     999.0           2016        8759       999     2016         69397
707     999.0           2017       11782       999     2017        103438
708     999.0           2018       20926       999     2018        102876
709     999.0           2019       13873       999     2019        303681

[710 rows x 6 columns]


In [22]:
merged_data = pd.merge(final_m, m4, how='inner', left_on=['Theme Id', 'published_date'], right_on=['CLAIM_ID', 'system_calendar_key_N'])
print(merged_data)

     Theme Id published_date  total_post  Claim_ID year_new  searchVolume  \
0         8.0           2016       91243         8     2016       2435585   
1         8.0           2017      120414         8     2017       1962169   
2         8.0           2018      168008         8     2018       3431254   
3         8.0           2019      106923         8     2019       2544527   
4        39.0           2016        1221        39     2016         19544   
..        ...            ...         ...       ...      ...           ...   
97      437.0           2019       63124       437     2019        939608   
98      438.0           2016       92474       438     2016       4300900   
99      438.0           2017      157611       438     2017       3847661   
100     438.0           2018      121179       438     2018       4474757   
101     438.0           2019      119876       438     2019       3758485   

     CLAIM_ID system_calendar_key_N  sales_dollars_value  
0           8   

In [23]:
merged_data

Unnamed: 0,Theme Id,published_date,total_post,Claim_ID,year_new,searchVolume,CLAIM_ID,system_calendar_key_N,sales_dollars_value
0,8.0,2016,91243,8,2016,2435585,8,2016,4.263884e+09
1,8.0,2017,120414,8,2017,1962169,8,2017,4.839030e+09
2,8.0,2018,168008,8,2018,3431254,8,2018,5.343067e+09
3,8.0,2019,106923,8,2019,2544527,8,2019,4.179676e+09
4,39.0,2016,1221,39,2016,19544,39,2016,4.005493e+07
...,...,...,...,...,...,...,...,...,...
97,437.0,2019,63124,437,2019,939608,437,2019,4.171012e+07
98,438.0,2016,92474,438,2016,4300900,438,2016,7.572860e+08
99,438.0,2017,157611,438,2017,3847661,438,2017,7.520031e+08
100,438.0,2018,121179,438,2018,4474757,438,2018,7.363435e+08


In [24]:
train = merged_data[merged_data['year_new'] != '2019']
test = merged_data[merged_data['year_new'] == '2019']

# train = qf[qf['date'] != '2019']
# test = qf[qf['date'] == '2019']
# X_train, X_test, y_train, y_test
print(train)
print('--------------------')
print(test)

y_train = train['sales_dollars_value']
y_test = test['sales_dollars_value']

train.drop(['Claim_ID', 'published_date', 'CLAIM_ID', 'system_calendar_key_N', 'sales_dollars_value'], axis=1, inplace=True)
test.drop(['Claim_ID', 'published_date', 'CLAIM_ID', 'system_calendar_key_N', 'sales_dollars_value'], axis=1, inplace=True)

vv = pd.get_dummies(train['Theme Id'], drop_first=True)
train = pd.concat([train, vv], axis=1)

vv = pd.get_dummies(test['Theme Id'], drop_first=True)
test = pd.concat([test, vv], axis=1)

X_train = train.iloc[:, 1:]


X_test = test.iloc[:, 1:]

X_train.drop(148, axis=1, inplace=True)
print(X_train.columns)
print(X_test.columns)


     Theme Id published_date  total_post  Claim_ID year_new  searchVolume  \
0         8.0           2016       91243         8     2016       2435585   
1         8.0           2017      120414         8     2017       1962169   
2         8.0           2018      168008         8     2018       3431254   
4        39.0           2016        1221        39     2016         19544   
5        39.0           2017        1748        39     2017         12717   
..        ...            ...         ...       ...      ...           ...   
95      437.0           2017      123692       437     2017       1000381   
96      437.0           2018       95180       437     2018       1121144   
98      438.0           2016       92474       438     2016       4300900   
99      438.0           2017      157611       438     2017       3847661   
100     438.0           2018      121179       438     2018       4474757   

     CLAIM_ID system_calendar_key_N  sales_dollars_value  
0           8   

In [25]:
X_train.drop('year_new', axis=1, inplace=True)
X_test.drop('year_new', axis=1, inplace=True)


In [26]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

Feature names only support names that are all strings. Got feature names with dtypes: ['float', 'str']. An error will be raised in 1.2.



LinearRegression()

In [27]:
y_pred = regressor.predict(X_test)
print(y_pred)

Feature names only support names that are all strings. Got feature names with dtypes: ['float', 'str']. An error will be raised in 1.2.



[ 4.01310751e+09  5.89615527e+07  3.82358233e+09 -8.01130468e+07
 -4.89648959e+07  9.01710899e+08  1.62330327e+08  1.04860753e+06
  2.07093983e+08  1.41148700e+08 -1.34269861e+08  4.79362529e+08
 -1.92400553e+08 -1.35902884e+09  4.44558581e+07  7.82843249e+08
  8.92704517e+08  1.33228237e+07  2.66770484e+07 -9.47132874e+06
 -2.12538998e+06  7.15811612e+08 -5.30088513e+07 -3.71921343e+07
  1.05030190e+08  7.32719730e+08 -2.22675587e+08  4.57725003e+08]


In [28]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score
score=r2_score(y_test,y_pred)
print('r2 socre is ',score)
print('mean_sqrd_error is==',mean_squared_error(y_test,y_pred))
print('root_mean_squared error of is==',np.sqrt(mean_squared_error(y_test,y_pred)))

r2 socre is  0.8090786929370665
mean_sqrd_error is== 1.6382411437502576e+17
root_mean_squared error of is== 404751917.0739353


In [29]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_test, y_pred)

88.94015918518447

In [30]:
X_train

Unnamed: 0,total_post,searchVolume,39.0,40.0,65.0,75.0,81.0,100.0,101.0,120.0,...,193.0,207.0,211.0,227.0,228.0,384.0,393.0,430.0,437.0,438.0
0,91243,2435585,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,120414,1962169,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,168008,3431254,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1221,19544,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1748,12717,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,123692,1000381,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
96,95180,1121144,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
98,92474,4300900,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
99,157611,3847661,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [31]:
u1 = sales_data.groupby(['product_id', 'system_calendar_key_N'])['sales_dollars_value'].sum().reset_index()
u1

Unnamed: 0,product_id,system_calendar_key_N,sales_dollars_value
0,1,2016,615747.0
1,1,2017,681909.0
2,1,2018,810009.0
3,1,2019,295866.0
4,2,2016,3825.0
...,...,...,...
126687,57313,2019,7797.0
126688,57314,2019,13305.0
126689,57315,2019,1492.0
126690,57316,2019,3466.0


In [32]:
u1 = pd.merge(u1, Theme_product_list, left_on='product_id', right_on=['PRODUCT_ID'], how='inner')
u1

Unnamed: 0,product_id,system_calendar_key_N,sales_dollars_value,PRODUCT_ID,CLAIM_ID
0,1,2016,615747.0,1,0
1,1,2017,681909.0,1,0
2,1,2018,810009.0,1,0
3,1,2019,295866.0,1,0
4,2,2016,3825.0,2,0
...,...,...,...,...,...
213507,57314,2019,13305.0,57314,40
213508,57314,2019,13305.0,57314,8
213509,57315,2019,1492.0,57315,8
213510,57316,2019,3466.0,57316,8


In [33]:
u1.drop(['product_id', 'PRODUCT_ID'], axis=1, inplace = True)
u1

Unnamed: 0,system_calendar_key_N,sales_dollars_value,CLAIM_ID
0,2016,615747.0,0
1,2017,681909.0,0
2,2018,810009.0,0
3,2019,295866.0,0
4,2016,3825.0,0
...,...,...,...
213507,2019,13305.0,40
213508,2019,13305.0,8
213509,2019,1492.0,8
213510,2019,3466.0,8


In [34]:
u1 = u1.groupby(['CLAIM_ID', 'system_calendar_key_N'])['sales_dollars_value'].sum().reset_index()
u1

Unnamed: 0,CLAIM_ID,system_calendar_key_N,sales_dollars_value
0,0,2016,1.585879e+10
1,0,2017,1.548783e+10
2,0,2018,1.534391e+10
3,0,2019,1.207660e+10
4,8,2016,4.263884e+09
...,...,...,...
183,437,2019,4.171012e+07
184,438,2016,7.572860e+08
185,438,2017,7.520031e+08
186,438,2018,7.363435e+08


In [35]:
len(u1[u1['CLAIM_ID'] == 8]['sales_dollars_value'].unique())

4

In [36]:
u2 = google_search_data.groupby(['Claim_ID', 'date'])['searchVolume'].sum().reset_index()
u2

Unnamed: 0,Claim_ID,date,searchVolume
0,8,01-01-2014,6613
1,8,01-01-2015,5977
2,8,01-01-2016,7248
3,8,01-01-2017,3510
4,8,01-01-2018,10334
...,...,...,...
138482,999,31-08-2014,417
138483,999,31-08-2015,417
138484,999,31-08-2019,1790
138485,999,31-10-2018,556


In [37]:
u3 = social_media_data.groupby(['Theme Id', 'published_date'])['total_post'].sum().reset_index()
u3

Unnamed: 0,Theme Id,published_date,total_post
0,8.0,2015,49315
1,8.0,2016,91243
2,8.0,2017,120414
3,8.0,2018,168008
4,8.0,2019,106923
...,...,...,...
960,999.0,2015,4223
961,999.0,2016,8759
962,999.0,2017,11782
963,999.0,2018,20926


In [38]:
u_f = pd.merge(u1, u2, left_on=['CLAIM_ID', 'system_calendar_key_N'] , right_on=['Claim_ID', 'date'], how='inner')
u_f

Unnamed: 0,CLAIM_ID,system_calendar_key_N,sales_dollars_value,Claim_ID,date,searchVolume


In [39]:
u_f = pd.merge(u_f, u3, left_on=['CLAIM_ID', 'system_calendar_key_N'] , right_on=['Theme Id', 'published_date'], how='inner')
u_f

Unnamed: 0,CLAIM_ID,system_calendar_key_N,sales_dollars_value,Claim_ID,date,searchVolume,Theme Id,published_date,total_post


In [40]:
u_f.drop(['Claim_ID', 'system_calendar_key_N', 'Theme Id', 'published_date'], axis=1, inplace = True)
u_f

Unnamed: 0,CLAIM_ID,sales_dollars_value,date,searchVolume,total_post


In [41]:
u_f.corr()

Unnamed: 0,CLAIM_ID,sales_dollars_value,searchVolume,total_post
CLAIM_ID,,,,
sales_dollars_value,,,,
searchVolume,,,,
total_post,,,,


In [42]:
X = u_f[['CLAIM_ID', 'searchVolume_category', 'total_post_category', 'date']]
y = u_f['sales_dollars_value']

KeyError: "['searchVolume_category', 'total_post_category'] not in index"

In [None]:
def ff(x):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = x)

    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    
    y_pred = regressor.predict(X_test)
    # print(y_pred)
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_squared_error
    # predicting the accuracy score
    score=r2_score(y_test,y_pred)
    print('r2 socre is ',score)
    print('mean_sqrd_error is==',mean_squared_error(y_test,y_pred))
    print('root_mean_squared error of is==',np.sqrt(mean_squared_error(y_test,y_pred)))
    

In [None]:
for i in range(10):
    print(i)
    ff(i)
    print('---------------')

In [None]:
X = u_f[u_f['CLAIM_ID']==39]
X

In [None]:
abs(y_pred[0]-y_test)/y_test

In [None]:
def fun(X):
    print(X)
    t1 = X[0:3]
    t2 = X[3:]
    # print(t1, t2)
    X_train = t1[['searchVolume', 'total_post']]
    y_train = t1['sales_dollars_value']

    X_test = t2[['searchVolume', 'total_post']]
    y_test = t2['sales_dollars_value']
    
    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    
    y_pred = regressor.predict(X_test)
    print('Prediction : ', y_pred)
    print('Error')
    print(abs(y_pred[0]-y_test)/y_test)
    print('---------------------------------')
# print(X_train, X_test, y_train, y_test)
# X_test = test.iloc[:, 2:]
# y_test = test['sales_dollars_value']

In [None]:
len(u_f[u_f['CLAIM_ID']==65]['date'].unique())

In [None]:
themes = np.array(qf['CLAIM_ID'].unique())
pf = pd.DataFrame()
for x in themes:
    if(len(qf[qf['CLAIM_ID']==x]['date'].unique()) == 4):
        fun(qf[qf['CLAIM_ID']==x])
        pf = pd.concat([pf, qf[qf['CLAIM_ID']==x]], axis=0)
pf

In [None]:
right = pf['sales_dollars_value'].mean() + .000001*pf['sales_dollars_value'].std()
left = pf['sales_dollars_value'].mean() - .000001*pf['sales_dollars_value'].std()
print(right, left)

In [None]:
plt.figure(figsize=(16,8))
plt.subplot(2,2,1)
sns.distplot(pf['sales_dollars_value'])
plt.subplot(2,2,2)
sns.boxplot(pf['sales_dollars_value'])
plt.subplot(2,2,3)
plt.show()

In [None]:
qf = pf[pf['sales_dollars_value'] < 1.04e9]

In [None]:
h = pf[(pf['sales_dollars_value'] < right) | (pf['sales_dollars_value'] > left)]
print(h)
h.describe()

In [None]:
# Box Plot
import seaborn as sns
sns.boxplot(pf['sales_dollars_value'])
pf

In [None]:
qf[(qf['date']=='2019') & (qf['CLAIM_ID'].isin([75, 100]))]

In [None]:
# X =df2[df2['CLAIM_ID']==8]
# print(X)
train = qf[(qf['date'] != '2019') & (qf['CLAIM_ID'].isin([39, 75, 100]))]
test = qf[(qf['date'] == '2019') & (qf['CLAIM_ID'].isin([39, 75, 100]))]
# X_train, X_test, y_train, y_test
vv = pd.get_dummies(train['CLAIM_ID'], drop_first=True)
train = pd.concat([train, vv], axis=1)

vv = pd.get_dummies(test['CLAIM_ID'], drop_first=True)
test = pd.concat([test, vv], axis=1)

X_train = train.iloc[:, 3:]
y_train = train['sales_dollars_value']

X_test = test.iloc[:, 3:]
y_test = test['sales_dollars_value']

In [None]:
X_train

In [None]:
# vv = pd.get_dummies(train['CLAIM_ID'], drop_first=True)
# print(vv)
# df2 = pd.concat([train, vv], axis=1)
# df2

In [None]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
import ta_lib.core.api as dataset
import ta_lib.eda.api as ta_analysis
from ta_lib.reports.api import create_report
import ta_lib.reports.api as health
from sklearn.ensemble import RandomForestClassifier
from ta_lib.classification.api import ClassificationComparison, ClassificationReport, confusion_matrix_by_feature, SKLStatsmodelLogit
from ta_lib.data_processing.api import Outlier, WoeBinningTransformer


In [None]:
# # X_train.info()
# X_train[5:] = X_train[5:].astype('int64')
# X_train.info()

In [None]:
# target_corr = ta_analysis.get_target_correlation(X_train, y_train)
# display_as_tabs([(k, v) for k,v in target_corr.items()]) 

In [None]:
# X_train.columns
y_test

In [None]:
y_pred = regressor.predict(X_test)
print(str(y_pred[1]))

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score
score=r2_score(y_test,y_pred)
print('r2 socre is ',score)
print('mean_sqrd_error is==',mean_squared_error(y_test,y_pred))
print('root_mean_squared error of is==',np.sqrt(mean_squared_error(y_test,y_pred)))

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm

In [None]:
qf

In [None]:
mean_absolute_percentage_error(y_test,y_pred)

In [None]:
def performance(X_train,y_train, X_test,y_test):
    lin_reg = sm.OLS(y_train,sm.add_constant(X_train)).fit()
    y_train_pred =  lin_reg.predict(sm.add_constant(X_train))
    y_test_pred =  lin_reg.predict(sm.add_constant(X_test))
    print('Train R2',r2_score(y_train,y_train_pred))
    print('Test R2',r2_score(y_test,y_test_pred))
    print('-'*50)
    print('Train MAPE:', mean_absolute_percentage_error(y_train,y_train_pred))
    print('Test MAPE:', mean_absolute_percentage_error(y_test,y_test_pred))
    print('Cross Val Score of MAPE:')
    scores = -1*cross_val_score(LinearRegression(),X_train,y_train,cv=5,
                scoring='neg_mean_absolute_percentage_error')
    bias  = np.mean(scores)
    variance = np.std(scores,ddof=1)
    print('CV_scores:',scores)
    print('Bias :',bias)
    print('Variance:',variance)

In [None]:
performance(X_train,y_train, X_test,y_test)

In [None]:
len(u_f['date'].unique())

In [None]:
cop = u_f
cop['searchVolume_category'] = cop['searchVolume'].apply(lambda x: round(math.log10(x)))
cop['total_post_category'] = cop['total_post'].apply(lambda x: round(math.log10(x)))
cop

In [None]:
u_f['searchVolume_category'] = u_f['searchVolume'].apply(lambda x: round(math.log10(x)))
u_f['total_post_category'] = u_f['total_post'].apply(lambda x: round(math.log10(x)))

In [None]:
u_f.drop(['searchVolume_category', 'total_post_category'],axis=1, inplace=True)

In [None]:
Vendor_dummies = pd.get_dummies(u_f['CLAIM_ID'], drop_first=True)
df2 = pd.concat([u_f, Vendor_dummies], axis=1)
df2

In [None]:
df2 = df2.sample(frac = 1)

In [None]:
X = df2.iloc[:, 2:]
y = df2['sales_dollars_value']

In [None]:
X[X['date'] == '2019']

In [None]:
X, df2

In [None]:
df2.corr()

In [None]:
plt.figure(figsize=(200,200))
corr=df2.corr()
sns.heatmap(corr[corr>0.8], annot=True, fmt='.0%')

In [None]:
import seaborn as sns
