In [139]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LinearRegression  

In [140]:
# Read the trend data from disk, then build one two-column frame per top:
# the shared 'Week' column paired with that top's own series.
df = pd.read_csv('data/trends1.csv')
Top_1, Top_2, Top_3 = (
    df[['Week', series]] for series in ('Top 1', 'Top 2', 'Top 3')
)

In [141]:
# Separate each top's frame into attributes (x) and labels (y).
#   x: every column except the last, kept as a 2-D array
#   y: the column at position 1, as a 1-D array
Top_1_x, Top_1_y = Top_1.iloc[:, :-1].values, Top_1.iloc[:, 1].values
Top_2_x, Top_2_y = Top_2.iloc[:, :-1].values, Top_2.iloc[:, 1].values
Top_3_x, Top_3_y = Top_3.iloc[:, :-1].values, Top_3.iloc[:, 1].values

In [142]:
# Partition every top into training and test sets, holding out 20% for testing.
# Each call is given the same random_state (0).
T1X_train, T1X_test, T1y_train, T1y_test = train_test_split(
    Top_1_x, Top_1_y, test_size=0.2, random_state=0
)
T2X_train, T2X_test, T2y_train, T2y_test = train_test_split(
    Top_2_x, Top_2_y, test_size=0.2, random_state=0
)
T3X_train, T3X_test, T3y_train, T3y_test = train_test_split(
    Top_3_x, Top_3_y, test_size=0.2, random_state=0
)


In [145]:
# Fit a separate linear regression for each top on its own training split.
# fit() hands back the estimator it was called on, so construction and
# training are chained into one expression per model.
regressor1 = LinearRegression().fit(T1X_train, T1y_train)
regressor2 = LinearRegression().fit(T2X_train, T2y_train)
regressor3 = LinearRegression().fit(T3X_train, T3y_train)

# Bare expression so the cell still displays the last fitted estimator.
regressor3

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [146]:
# Collect the fitted trendline parameters for all three tops into one table:
# first row is the y-intercept, second row is the gradient (first coefficient).
d = {'Component': ['y-intercept', 'gradient']}
d.update(
    (label, [model.intercept_, model.coef_[0]])
    for label, model in (
        ('Top 1', regressor1),
        ('Top 2', regressor2),
        ('Top 3', regressor3),
    )
)
modelcoefficient = pd.DataFrame(data=d)

# Displayed with 'Component' as the index; the re-indexed frame is not stored.
modelcoefficient.set_index('Component')

Unnamed: 0_level_0,Top 1,Top 2,Top 3
Component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y-intercept,-0.177028,-1.665524,11.48373
gradient,0.092361,0.294847,0.092494


In [147]:
# Run each fitted model on its own held-out test attributes to get predictions.
T1y_pred, T2y_pred, T3y_pred = (
    model.predict(held_out)
    for model, held_out in (
        (regressor1, T1X_test),
        (regressor2, T2X_test),
        (regressor3, T3X_test),
    )
)

In [148]:
# Put actual and predicted test-set values side by side for all three tops,
# then show the first few rows.
compare = pd.DataFrame({
    'T1 Actual': T1y_test,
    'T1 Predicted': T1y_pred,
    'T2 Actual': T2y_test,
    'T2 Predicted': T2y_pred,
    'T3 Actual': T3y_test,
    'T3 Predicted': T3y_pred,
})
compare.head()

Unnamed: 0,T1 Actual,T1 Predicted,T2 Actual,T2 Predicted,T3 Actual,T3 Predicted
0,18,15.154953,25,47.279101,17,26.83778
1,5,9.613273,23,29.588272,19,21.288123
2,19,21.89733,80,68.802942,36,33.589862
3,5,6.842433,22,20.742858,16,18.513295
4,27,20.142464,57,63.200846,24,31.832471


In [126]:
#Calculate and display the metrics between our own predictions and actual values of the test sets for all three tops
def _error_summary(actual, predicted):
    """Return ``[MAE, MSE, RMSE]`` for ``predicted`` measured against ``actual``.

    The mean squared error is computed once and its square root is reused
    as the RMSE, instead of calling ``mean_squared_error`` a second time.
    The order of the returned list matches the row labels in the
    'Metrics' column built below.
    """
    mse = metrics.mean_squared_error(actual, predicted)
    return [metrics.mean_absolute_error(actual, predicted), mse, np.sqrt(mse)]


# One column of row labels, then one column of error metrics per top.
d = {'Metrics': ['Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error']}
d.update(
    (label, _error_summary(actual, predicted))
    for label, actual, predicted in (
        ('Top 1', T1y_test, T1y_pred),
        ('Top 2', T2y_test, T2y_pred),
        ('Top 3', T3y_test, T3y_pred),
    )
)
metricval = pd.DataFrame(data=d)

# Displayed with 'Metrics' as the index; the re-indexed frame is not stored.
metricval.set_index('Metrics')

Unnamed: 0_level_0,Top 1,Top 2,Top 3
Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mean Absolute Error,4.368719,11.35749,3.039455
Mean Squared Error,29.044991,184.948055,13.244954
Root Mean Squared Error,5.389341,13.599561,3.639362
