In [33]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing, svm
from sklearn.model_selection import cross_validate, train_test_split
import math

In [3]:
train_files = "./data/GOOG.csv"

In [4]:
data = pd.read_csv(train_files)

In [5]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-07-07,1490.0,1516.800049,1483.550049,1485.180054,1485.180054,1458200
1,2020-07-08,1494.319946,1505.880005,1485.630005,1496.0,1496.0,1249700
2,2020-07-09,1506.449951,1522.719971,1488.084961,1510.98999,1510.98999,1423300
3,2020-07-10,1506.150024,1543.829956,1496.540039,1541.73999,1541.73999,1856300
4,2020-07-13,1550.0,1577.131958,1505.243042,1511.339966,1511.339966,1846400


In [6]:
features = ['Open',  'High',  'Low',  'Adj Close', 'Volume']

In [7]:
df = data[features]
df

Unnamed: 0,Open,High,Low,Adj Close,Volume
0,1490.000000,1516.800049,1483.550049,1485.180054,1458200
1,1494.319946,1505.880005,1485.630005,1496.000000,1249700
2,1506.449951,1522.719971,1488.084961,1510.989990,1423300
3,1506.150024,1543.829956,1496.540039,1541.739990,1856300
4,1550.000000,1577.131958,1505.243042,1511.339966,1846400
...,...,...,...,...,...
247,2535.449951,2540.000000,2508.739990,2520.370117,1047500
248,2513.071045,2516.000000,2495.149902,2506.320068,1200300
249,2496.995117,2529.250000,2496.995117,2527.370117,856000
250,2536.790039,2576.959961,2535.379883,2574.379883,1058000


### Replace 'NaN'/missing data from the dataset

Generally we're using regression to forecast_out

Forecast_out is just a percentage of our data to forecast out. Basically, in our dataset, maybe we want to train our model to predict the price 1% into the future. Then, to train, we need historically to grab values, and then use those values alongside whatever the price was 1% into the future (1% into the future as in 1% of the days of the entire dataset. If the dataset was 100 days, 1% into the future would be 1 day into the future). We use .shift, which is a pandas method, which can take a column and literally shift it in a direction by a number you decide. Thus, we use this to make a new column, which is the price column shifted, giving us the future prices in the same rows as current price, volume...etc to be trained against.

Hindi: predict kya kr rhe hai ? future mai stock price, data set mai hai na future ka stock price, vo bnaya label

In [8]:
forecast_col = 'Adj Close'                # This is the column we are trying to predict

In [9]:
df.fillna(-99999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [10]:
forecast_out = int(math.ceil(0.01 * len(df)))
forecast_out

3

In the above line, 0.01 is the percentage of days we want to predict

In [11]:
df['label'] = df[forecast_col].shift(-forecast_out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df[forecast_col].shift(-forecast_out)


In [12]:
df['label']

0      1541.739990
1      1511.339966
2      1520.579956
3      1513.640015
4      1518.000000
          ...     
247    2574.379883
248    2595.419922
249            NaN
250            NaN
251            NaN
Name: label, Length: 252, dtype: float64

In [13]:
df.head()

Unnamed: 0,Open,High,Low,Adj Close,Volume,label
0,1490.0,1516.800049,1483.550049,1485.180054,1458200,1541.73999
1,1494.319946,1505.880005,1485.630005,1496.0,1249700,1511.339966
2,1506.449951,1522.719971,1488.084961,1510.98999,1423300,1520.579956
3,1506.150024,1543.829956,1496.540039,1541.73999,1856300,1513.640015
4,1550.0,1577.131958,1505.243042,1511.339966,1846400,1518.0


Remove missing values from the dataframe

In [15]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [16]:
X = np.array(df.drop(['label'], 1))      # all columns from dataframe except 'label'
y = np.array(df['label'])                # label column from dataframe

Now we will pre-process the features.
We will normalize these features within the range -1 to 1. This speeds up the processing

In [31]:
X = preprocessing.scale(X)

Now we split the data into training and testing portions
size here used in 80% - 20%

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Linear Regression Classifier

In [38]:
clf = LinearRegression()

In [39]:
clf.fit(X_train, y_train)

LinearRegression()

In [40]:
accuracy = clf.score(X_test, y_test)

Accuracy here is in squared error

In [41]:
accuracy


0.9799018981356228

# Support Vector Machines


In [42]:
clf = svm.SVR()

In [43]:
clf.fit(X_train, y_train)

SVR()

In [44]:
accuracy = clf.score(X_test, y_test)

In [45]:
accuracy

0.16283728830452582

In [46]:
### Lets try different kernels

In [47]:
for k in ['linear','poly','rbf','sigmoid']:
    clf = svm.SVR(kernel=k)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print(k,confidence)

linear 0.9792298268509012
poly 0.6600240085491769
rbf 0.16283728830452582
sigmoid 0.40074763808916747
