># LECTURE - 01
>### INTRODUCTION TO LINEAR REGRESSION: <br><br>

In [1]:
import quandl, math
import numpy as np
import pandas as pd
from sklearn import preprocessing, model_selection, svm
from sklearn.linear_model import LinearRegression

In [2]:
df = quandl.get('WIKI/GOOGL')
print(df.head())

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [3]:
df = df[ ['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume'] ]
print(df.head)

<bound method NDFrame.head of               Adj. Open    Adj. High     Adj. Low   Adj. Close  Adj. Volume
Date                                                                       
2004-08-19    50.159839    52.191109    48.128568    50.322842   44659000.0
2004-08-20    50.661387    54.708881    50.405597    54.322689   22834300.0
2004-08-23    55.551482    56.915693    54.693835    54.869377   18256100.0
2004-08-24    55.792225    55.972783    51.945350    52.597363   15247300.0
2004-08-25    52.542193    54.167209    52.100830    53.164113    9188600.0
...                 ...          ...          ...          ...          ...
2018-03-21  1092.570000  1108.700000  1087.210000  1094.000000    1990515.0
2018-03-22  1080.010000  1083.920000  1049.640000  1053.150000    3418154.0
2018-03-23  1051.370000  1066.780000  1024.870000  1026.550000    2413517.0
2018-03-26  1050.600000  1059.270000  1010.580000  1054.090000    3272409.0
2018-03-27  1063.900000  1064.540000   997.620000  1006.94

In [4]:
# Percentage change in High to Low stock price: 
df['H2L_PerCng'] = ( df['Adj. High'] - df['Adj. Close'] ) / df['Adj. Close'] * 100.0


In [5]:
# (Features)
# Daily Percentage change in Close to Open stock price: 
df['Per_Cng'] = ( df["Adj. Close"] - df['Adj. Open'] ) / df['Adj. Open'] * 100.0


In [6]:
# (Features)
# creating the dataframe we wnat to work with:
df = df[ ['Adj. Close', 'H2L_PerCng', 'Per_Cng', 'Adj. Volume'] ]

print(df.head())

            Adj. Close  H2L_PerCng   Per_Cng  Adj. Volume
Date                                                     
2004-08-19   50.322842    3.712563  0.324968   44659000.0
2004-08-20   54.322689    0.710922  7.227007   22834300.0
2004-08-23   54.869377    3.729433 -1.227880   18256100.0
2004-08-24   52.597363    6.417469 -5.726357   15247300.0
2004-08-25   53.164113    1.886792  1.183658    9188600.0


># LECTURE - 02
>### Features and Labels: 
> Usually we define features by capital X and labels by small y.<br><br>

In [7]:
# It is just a variable created to work with the stock prices:
forecast_col = 'Adj. Close'

# In case of missing data and we can't get rid of it, so we fill that column with some specific values:
df.fillna(-9999, inplace=True)

# Actually we are trying to predict the 10% of the dataframe to predict the forecast out:
forecast_out = int(math.ceil(0.01*len(df)))

# Creating label:
# Since we have forecast_out, so we can create labels
# Here we are using forecast_out as minus that's why we made it as int data type
# So, basically we are shifting the columns negativelly upwards
df['label'] = df[forecast_col].shift(-forecast_out)

# So, this will just print first five rows of the dataframes
print(df.head())
print("\n\n")
df.dropna(inplace=True)
print(df.tail())


            Adj. Close  H2L_PerCng   Per_Cng  Adj. Volume      label
Date                                                                
2004-08-19   50.322842    3.712563  0.324968   44659000.0  69.078238
2004-08-20   54.322689    0.710922  7.227007   22834300.0  67.839414
2004-08-23   54.869377    3.729433 -1.227880   18256100.0  68.912727
2004-08-24   52.597363    6.417469 -5.726357   15247300.0  70.668146
2004-08-25   53.164113    1.886792  1.183658    9188600.0  71.219849



            Adj. Close  H2L_PerCng   Per_Cng  Adj. Volume    label
Date                                                              
2018-01-30     1177.37    0.896914 -0.029718    1792602.0  1094.00
2018-01-31     1182.22    0.346805 -0.134312    1643877.0  1053.15
2018-02-01     1181.59    0.495942  0.476195    2774967.0  1026.55
2018-02-02     1119.20    1.081129 -0.729098    5798880.0  1054.09
2018-02-05     1068.76    4.325574 -2.893850    3742469.0  1006.94


># LECTURE - 03
>### Training and Testing: <br><br>

In [None]:
X = np.array( df.drop(['label'], axis = 1) )
# Creating training and testing set of data(hopefully here we are using 20% of total data)
# X_test and y_test we used to fit our classifier.
X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2 )

X = X[:-forecast_out]

# Stuff that is actually goinig to predict again:
X_lately = X[-forecast_out:]

X = preprocessing.scale(X)

# Dropping the missing data:
df.dropna(inplace=True)

# Creating the labels:
y = np.array(df['label'])
y = np.array(df['label'])


# Now we need to define the classifier
clf = LinearRegression()

# Now to fit or train our classifier, we just fit our classifier
# fit is synonymous with train 
clf.fit( X_train, y_train )

# Now we need to see out what the accuracy is
# score is synonymous with test
accuracy = clf.score( X_test, y_test )

print(accuracy)


># LECTURE - 04
>### Forecasting and Predicting: <br><br>