In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# read the csv file 
df = pd.read_csv('NFLX.csv')

In [3]:
# display the first five rows of the data 
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-02-05,262.0,267.899994,250.029999,254.259995,254.259995,11896100
1,2018-02-06,247.699997,266.700012,245.0,265.720001,265.720001,12595800
2,2018-02-07,266.579987,272.450012,264.329987,264.559998,264.559998,8981500
3,2018-02-08,267.079987,267.619995,250.0,250.100006,250.100006,9306700
4,2018-02-09,253.850006,255.800003,236.110001,249.470001,249.470001,16906900


In [4]:
# check the information 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1009 non-null   object 
 1   Open       1009 non-null   float64
 2   High       1009 non-null   float64
 3   Low        1009 non-null   float64
 4   Close      1009 non-null   float64
 5   Adj Close  1009 non-null   float64
 6   Volume     1009 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 55.3+ KB


In [5]:
# describe the data
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0
mean,419.059673,425.320703,412.374044,419.000733,419.000733,7570685.0
std,108.537532,109.26296,107.555867,108.289999,108.289999,5465535.0
min,233.919998,250.649994,231.229996,233.880005,233.880005,1144000.0
25%,331.48999,336.299988,326.0,331.619995,331.619995,4091900.0
50%,377.769989,383.01001,370.880005,378.670013,378.670013,5934500.0
75%,509.130005,515.630005,502.529999,509.079987,509.079987,9322400.0
max,692.349976,700.98999,686.090027,691.690002,691.690002,58904300.0


In [15]:
df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [16]:
df.drop('Date', axis=1, inplace=True)

In [17]:
df.dtypes

Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [20]:
# separate the data into features and target
X = df.drop('Close', axis=1)
y = df.Close

In [21]:
# split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [22]:
# shapes of X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(807, 5) (202, 5) (807,) (202,)


### Building the model 

In [23]:
# LinearRegression initialization
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [30]:
# predict the new values
y_pred = lr.predict(X_test)

In [31]:
print("Mean Sqaured Error: ", mean_squared_error(y_test, y_pred))

Mean Sqaured Error:  5.0739033551558705e-26


In [34]:
print(lr.score(X_test, y_test))

1.0


In [35]:
lr.score(X_train, y_train)

1.0

In [36]:
data = pd.DataFrame({"Loss":(y_pred-y_test)})

In [38]:
data.head(25)

Unnamed: 0,Loss
628,5.684342e-14
631,0.0
741,0.0
514,-2.842171e-13
365,1.705303e-13
656,1.136868e-13
657,0.0
530,0.0
321,-1.705303e-13
70,-3.410605e-13
