In [121]:
# import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# now read the dataset using pandas
dataset = pd.read_csv('psx.csv')
dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Change,Volume
0,23-Feb-21,31722.16,31800.9,31597.31,31626.19,-21.38,718191025
1,22-Feb-21,31874.78,31958.58,31612.55,31647.57,-203.61,721952658
2,19-Feb-21,31748.75,31904.3,31749.43,31851.18,91.36,694795084
3,18-Feb-21,32049.85,32104.67,31745.72,31759.82,-288.86,577837595
4,17-Feb-21,32166.21,32390.77,32044.01,32048.68,-93.15,701658181


Data: current date where stocks are examined.<br>
Open: the initial value of assests for the day.<br>
High: the highest value of assests for the day.<br>
Low: the lowest value of assets fallen on that day.<br>
Close: the value of assests when the stock market closed that day.<br>
Change: the change in stock value for that day.<br>
Volume: it is the number of current assets on that day.

In [122]:
# find if there are null values
dataset.isnull().sum()

Date      0
Open      0
High      0
Low       0
Close     0
Change    0
Volume    0
dtype: int64

In [123]:
# find type of all variables
dataset.dtypes

Date      object
Open      object
High      object
Low       object
Close     object
Change    object
Volume    object
dtype: object

In [124]:
# find number of instances
len(dataset)

3221

In [125]:
# all columns
dataset.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Change', 'Volume'], dtype='object')

In [126]:
# unique dates
len(dataset['Date'].unique())

3221

In [127]:
# find description of datset
dataset.describe()

Unnamed: 0,Date,Open,High,Low,Close,Change,Volume
count,3221,3221.0,3221.0,3221.0,3221.0,3221,3221
unique,3221,3213.0,3215.0,3211.0,3209.0,3104,3220
top,23-Feb-21,29269.0,24777.37,6639.0,6639.25,0,302831072
freq,1,2.0,2.0,3.0,3.0,5,2


In [128]:
# find further info
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    3221 non-null   object
 1   Open    3221 non-null   object
 2   High    3221 non-null   object
 3   Low     3221 non-null   object
 4   Close   3221 non-null   object
 5   Change  3221 non-null   object
 6   Volume  3221 non-null   object
dtypes: object(7)
memory usage: 176.3+ KB


In [129]:
dataset['standard_date'] = pd.to_datetime(dataset['Date'], format='%d-%b-%y', errors='coerce')

print(dataset[['Date', 'standard_date']])

           Date standard_date
0     23-Feb-21    2021-02-23
1     22-Feb-21    2021-02-22
2     19-Feb-21    2021-02-19
3     18-Feb-21    2021-02-18
4     17-Feb-21    2021-02-17
...         ...           ...
3216  28-Feb-08    2008-02-28
3217  27-Feb-08    2008-02-27
3218  26-Feb-08    2008-02-26
3219  25-Feb-08    2008-02-25
3220  22-Feb-08    2008-02-22

[3221 rows x 2 columns]


In [130]:
dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Change,Volume,standard_date
0,23-Feb-21,31722.16,31800.9,31597.31,31626.19,-21.38,718191025,2021-02-23
1,22-Feb-21,31874.78,31958.58,31612.55,31647.57,-203.61,721952658,2021-02-22
2,19-Feb-21,31748.75,31904.3,31749.43,31851.18,91.36,694795084,2021-02-19
3,18-Feb-21,32049.85,32104.67,31745.72,31759.82,-288.86,577837595,2021-02-18
4,17-Feb-21,32166.21,32390.77,32044.01,32048.68,-93.15,701658181,2021-02-17


In [131]:
# remove date with initial format
dataset.pop('Date')
dataset.head()

Unnamed: 0,Open,High,Low,Close,Change,Volume,standard_date
0,31722.16,31800.9,31597.31,31626.19,-21.38,718191025,2021-02-23
1,31874.78,31958.58,31612.55,31647.57,-203.61,721952658,2021-02-22
2,31748.75,31904.3,31749.43,31851.18,91.36,694795084,2021-02-19
3,32049.85,32104.67,31745.72,31759.82,-288.86,577837595,2021-02-18
4,32166.21,32390.77,32044.01,32048.68,-93.15,701658181,2021-02-17


In [132]:
# rename the standard_date column to Date column
dataset.rename(columns = {'standard_date' : 'Date'}, inplace=True)
dataset.head()

Unnamed: 0,Open,High,Low,Close,Change,Volume,Date
0,31722.16,31800.9,31597.31,31626.19,-21.38,718191025,2021-02-23
1,31874.78,31958.58,31612.55,31647.57,-203.61,721952658,2021-02-22
2,31748.75,31904.3,31749.43,31851.18,91.36,694795084,2021-02-19
3,32049.85,32104.67,31745.72,31759.82,-288.86,577837595,2021-02-18
4,32166.21,32390.77,32044.01,32048.68,-93.15,701658181,2021-02-17


In [133]:
# move column to different place
column_to_move = 'Date'
new_index = 0

# Step 3: Move the column to the new index
columns = dataset.columns.tolist()
columns.remove(column_to_move)
columns.insert(new_index, column_to_move)
dataset = dataset[columns]
dataset.head()


Unnamed: 0,Date,Open,High,Low,Close,Change,Volume
0,2021-02-23,31722.16,31800.9,31597.31,31626.19,-21.38,718191025
1,2021-02-22,31874.78,31958.58,31612.55,31647.57,-203.61,721952658
2,2021-02-19,31748.75,31904.3,31749.43,31851.18,91.36,694795084
3,2021-02-18,32049.85,32104.67,31745.72,31759.82,-288.86,577837595
4,2021-02-17,32166.21,32390.77,32044.01,32048.68,-93.15,701658181


In [134]:

dataset.dtypes

Date      datetime64[ns]
Open              object
High              object
Low               object
Close             object
Change            object
Volume            object
dtype: object

In [135]:
columns_to_convert = ['Open', 'High', 'Low', 'Close', 'Change', 'Volume']

# Step 1: Remove commas from the values in multiple columns
dataset[columns_to_convert] = dataset[columns_to_convert].apply(lambda x: x.str.replace(',', ''))

# Step 2: Convert the multiple columns to numeric
dataset[columns_to_convert] = dataset[columns_to_convert].apply(pd.to_numeric, errors='coerce')
dataset.head()


Unnamed: 0,Date,Open,High,Low,Close,Change,Volume
0,2021-02-23,31722.16,31800.9,31597.31,31626.19,-21.38,718191025
1,2021-02-22,31874.78,31958.58,31612.55,31647.57,-203.61,721952658
2,2021-02-19,31748.75,31904.3,31749.43,31851.18,91.36,694795084
3,2021-02-18,32049.85,32104.67,31745.72,31759.82,-288.86,577837595
4,2021-02-17,32166.21,32390.77,32044.01,32048.68,-93.15,701658181


In [136]:
# now find data types
dataset.dtypes

Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Change           float64
Volume             int64
dtype: object

In [137]:
# find distribution of volume using scatter plot
fig = px.scatter(dataset, x = 'Date', y = 'Volume', trendline = 'ols', title = 'Distribution of Volume w.r.t Date')
fig.update_traces(marker=dict(color='blue'), line=dict(color='red'))
fig.update_layout(xaxis_title = 'Year', yaxis_title = 'Volume')
fig.show()

In [138]:
# lets find the change in volume over time
fig = px.line(dataset, x='Date', y='Volume', markers=True, line_shape='linear')
fig.update_traces(marker=dict(color='green'), line=dict(color='red'))
fig.show()

In [139]:
# find distribution of change using scatter plot
fig = px.scatter(dataset, x = 'Date', y = 'Change', trendline = 'ols', title = 'Distribution of Change w.r.t Date')
fig.update_traces(marker=dict(color='lightblue'), line=dict(color='black'), opacity = 0.6)
fig.update_layout(xaxis_title = 'Year', yaxis_title = 'Change')
fig.show()

In [140]:
# lets find the change in volume over time
fig = px.line(dataset, x='Date', y='Change', markers=True, line_shape='linear')
fig.update_traces(marker=dict(color='cyan'), line=dict(color='red'))
fig.show()

In [141]:
# find relationship between Date and Low
fig = px.scatter(dataset, x = 'Date', y = 'Low', trendline = 'ols', title = 'Low Assets w.r.t Date')
fig.update_traces(marker = dict(color = 'green'), line = dict(color = 'black'), opacity = 0.7)
fig.show()

In [142]:
# relationship between high and date
fig = px.scatter(dataset, x = 'Date', y = 'High', trendline = 'ols', title = 'High w.r.t Date')
fig.update_traces(marker = dict(color = 'lightblue'), line = dict(color = 'black'), opacity = 0.5)
fig.show()

In [143]:
# Relationship between Open and Date
fig = px.scatter(dataset, x = 'Date', y = 'Open', trendline = 'ols', title = 'Assets Opening w.r.t Date')
fig.update_traces(marker = dict(color = 'blue'), line = dict(color = 'black'), opacity = 0.8)
fig.show()

In [144]:
fig = px.scatter(dataset, x = 'Date', y = 'Close', trendline = 'ols', title = 'Assets Closing w.r.t Date')
fig.update_traces(marker = dict(color = 'cyan'), line = dict(color = 'black'))
fig.show()

In [145]:
# show using graph the relationship between opening and closing assets for each day
fig = px.scatter(dataset, x = 'Open', y = 'Close', trendline = 'ols', title = 'Relationship Between Open and Close')
fig.update_traces(marker=dict(color='lightgreen'), line=dict(color='black'), opacity = 0.6)
fig.show()

In [146]:
# show using graph the relationship between opening and closing assets for each day
fig = px.scatter(dataset, x = 'Low', y = 'High', trendline = 'ols', title = 'Relationship Between Low and High')
fig.update_traces(marker=dict(color='lightgrey'), line=dict(color='red'), opacity = 0.6)
fig.show()

In [147]:
data = dataset['Volume']
fig = px.histogram(data, nbins = 500, color_discrete_sequence = ['#FF5733'], opacity = 0.6)
fig.update_layout(title = 'Distribution of Volume', xaxis_title = 'Volume')
fig.show()

In [148]:
data = dataset['Change']
fig = px.histogram(data, nbins = 500, color_discrete_sequence = [' #707b7c '], opacity = 0.6)
fig.update_layout(title = 'Distribution of Change', xaxis_title = 'Change')
fig.show()

In [149]:
data = dataset['Open']
fig = px.histogram(data, nbins = 500, color_discrete_sequence = ['#eb984e'], opacity = 0.6)
fig.update_layout(title = 'Distribution of Open', xaxis_title = 'Open')
fig.show()

In [150]:
data = dataset['Close']
fig = px.histogram(data, nbins = 500, color_discrete_sequence = ['#2ecc71'], opacity = 0.6)
fig.update_layout(title = 'Distribution of Close', xaxis_title = 'Close')
fig.show()

In [151]:
data = dataset['Low']
fig = px.histogram(data, nbins = 500, color_discrete_sequence = ['#5dade2'], opacity = 0.6)
fig.update_layout(title = 'Distribution of Low', xaxis_title = 'Low')
fig.show()

In [152]:
dataa = dataset['High']
fig = px.histogram(dataa, nbins = 500, color_discrete_sequence = [' #a569bd '], opacity = 0.6)
fig.update_layout(title = 'Distribution of High', xaxis_title = 'High')
fig.show()

In [153]:
# find pearson correlation
corr_columns = ['Open', 'Close', 'Volume']
corr = dataset[corr_columns].corr()
corr

Unnamed: 0,Open,Close,Volume
Open,1.0,0.9998,0.428193
Close,0.9998,1.0,0.43072
Volume,0.428193,0.43072,1.0


In [154]:
# show the correlation using plotly
fig = px.imshow(corr, labels=dict(color="Correlation"), x=corr.index, y=corr.columns, color_continuous_scale='Viridis')

# Customize the layout
fig.update_layout(title="Correlation Matrix")

# Show the plot
fig.show()

In [155]:
# we need to find if the change is more then what happens to volume
corrr_columns = ['Change', 'Volume']
corrr = dataset[corrr_columns].corr()
corrr

Unnamed: 0,Change,Volume
Change,1.0,0.125037
Volume,0.125037,1.0


In [156]:
# show the correlation using plotly
fig = px.imshow(corrr, labels=dict(color="Correlation"), x=corrr.index, y=corrr.columns, color_continuous_scale='Viridis')

# Customize the layout
fig.update_layout(title="Correlation Matrix")

# Show the plot
fig.show()

In [157]:
# plot a boxplot and see outliers in Volume column
fig = px.box(dataset, x = 'Volume', points = 'all', color_discrete_sequence= ['#f39c12'])
fig.update_layout(title = 'Boxplot Showing Volume')
fig.show()

In [158]:
# find the range of outliers limit
data = dataset['Volume']

# minimum and maximum in the data
min_value = np.min(data)
max_value = np.max(data)

# Calculate the interquartile range (IQR)
Q1 = np.percentile(data, 25)
Q2 = np.percentile(data, 50)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1

# Define the lower and upper bounds for potential outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find potential outliers
outliers = [x for x in data if x < lower_bound or x > upper_bound]

# Print the results
print("Minimum number in the column is ", min_value)
print("Maximum number in the column is ", max_value)
print("First quarter or 25th percentile is ", Q1)
print("Second quarter or 50th percentile is ", Q2)
print("Third quater or 75th percentile is ", Q3)
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Potential Outliers:", outliers)

Minimum number in the column is  -152526704
Maximum number in the column is  1124724205
First quarter or 25th percentile is  112084380.0
Second quarter or 50th percentile is  170272992.0
Third quater or 75th percentile is  254314272.0
Lower Bound: -101260458.0
Upper Bound: 467659110.0
Potential Outliers: [718191025, 721952658, 694795084, 577837595, 701658181, 514044525, 486340423, 1124724205, 1011825950, 663958819, 616200447, 467989986, 692898215, 839457168, 843983643, 610915766, 603234345, 470018957, 606372517, 476573388, 491756674, 543589793, 531011703, 620582768, 845204198, 825891276, 588003325, 696414831, 641411673, 664516672, 582082375, 540785879, 642577537, 578245258, 502981305, 570558163, 561845093, 516440882, 482533331, 497516465, 702121165, 629331166, 557605174, 472350360, 476848692, 489117747, 541190803, 481018352, 487115967, 500086129, 661189180, 492671335, 476730392, 473729084, 582794744, 516126248, 508691857, 489600391, 662797128, 509518330, 526175254, 884940929, 707012027

In [159]:
# plot boxplot to find outliers
fig = px.box(dataset, x = 'Change', points = 'all', color_discrete_sequence = ['#cacfd2'])
fig.update_layout(title = 'Boxplot Showing Distribution of Change in Assests')
fig.show()

In [160]:
# find the range of outliers limit
data = dataset['Change']

# minimum and maximum in the data
min_value = np.min(data)
max_value = np.max(data)

# Calculate the interquartile range (IQR)
Q1 = np.percentile(data, 25)
Q2 = np.percentile(data, 50)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1

# Define the lower and upper bounds for potential outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find potential outliers
outliers = [x for x in data if x < lower_bound or x > upper_bound]

# Print the results
print("Minimum number in the column is ", min_value)
print("Maximum number in the column is ", max_value)
print("First quarter or 25th percentile is ", Q1)
print("Second quarter or 50th percentile is ", Q2)
print("Third quater or 75th percentile is ", Q3)
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Potential Outliers:", outliers)

Minimum number in the column is  -8060.96
Maximum number in the column is  7957.09
First quarter or 25th percentile is  -68.46
Second quarter or 50th percentile is  6.33
Third quater or 75th percentile is  95.56
Lower Bound: -314.48999999999995
Upper Bound: 341.59
Potential Outliers: [-410.7, 364.42, 389.79, 483.72, 355.28, 350.55, 386.86, -341.81, -415.95, 482.63, 864.72, -619.35, -790.57, -326.05, 363.34, 362.21, -362.66, 471.06, -780.14, -418.85, -325.26, -622.48, 517.8, 354.33, 354.33, -480.01, 574.58, 385.32, -583.26, 861.04, -587.21, 455.44, 424.7, -493.37, 526.39, 772.79, 715.28, 441.34, -763.38, -1211.16, 420.98, -1279.91, -618.88, -1424.74, -816.7, -682.79, -670.94, 769.96, -331.03, -646.85, 444.19, 344.81, -522.47, -355.55, -751.93, 403.79, 643.81, -364.9, -628.48, 593.0, -469.97, -669.38, 551.75, 432.93, 399.4, 376.43, 488.55, -526.36, -330.13, 534.26, 396.31, 462.07, -469.32, 386.81, 346.87, -383.75, -408.48, 626.3, 466.35, 409.48, -370.35, -431.08, -338.15, -459.71, -420.3

In [161]:
# find outliers in Open column
fig = px.box(dataset, x = 'Open', points = 'all', color_discrete_sequence = ['#f1948a'])
fig.update_layout(title = 'Boxplot Showing Distribution of Open Assets')
fig.show()

In [162]:
# find the range of outliers limit
data = dataset['Open']

# minimum and maximum in the data
min_value = np.min(data)
max_value = np.max(data)

# Calculate the interquartile range (IQR)
Q1 = np.percentile(data, 25)
Q2 = np.percentile(data, 50)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1

# Define the lower and upper bounds for potential outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find potential outliers
outliers = [x for x in data if x < lower_bound or x > upper_bound]

# Print the results
print("Minimum number in the column is ", min_value)
print("Maximum number in the column is ", max_value)
print("First quarter or 25th percentile is ", Q1)
print("Second quarter or 50th percentile is ", Q2)
print("Third quater or 75th percentile is ", Q3)
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Potential Outliers:", outliers)

Minimum number in the column is  3634.56
Maximum number in the column is  36317.92
First quarter or 25th percentile is  8418.15
Second quarter or 50th percentile is  21777.35
Third quater or 75th percentile is  27911.0
Lower Bound: -20821.125
Upper Bound: 57150.274999999994
Potential Outliers: []


In [163]:
# plot boxplot fpr distribution of Close Assets
fig = px.box(dataset, x = 'Close', points = 'all', color_discrete_sequence = ['#abebc6'])
fig.update_layout(title = 'Distribution of Close Assets')
fig.show()

In [164]:
# find the range of outliers limit
data = dataset['Close']

# minimum and maximum in the data
min_value = np.min(data)
max_value = np.max(data)

# Calculate the interquartile range (IQR)
Q1 = np.percentile(data, 25)
Q2 = np.percentile(data, 50)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1

# Define the lower and upper bounds for potential outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find potential outliers
outliers = [x for x in data if x < lower_bound or x > upper_bound]

# Print the results
print("Minimum number in the column is ", min_value)
print("Maximum number in the column is ", max_value)
print("First quarter or 25th percentile is ", Q1)
print("Second quarter or 50th percentile is ", Q2)
print("Third quater or 75th percentile is ", Q3)
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Potential Outliers:", outliers)

Minimum number in the column is  3647.1
Maximum number in the column is  36234.2
First quarter or 25th percentile is  8426.35
Second quarter or 50th percentile is  21771.63
Third quater or 75th percentile is  27895.15
Lower Bound: -20776.850000000006
Upper Bound: 57098.350000000006
Potential Outliers: []


In [165]:
# plot a boxplot for Low
fig = px.box(dataset, x = 'Low', points = 'all', color_discrete_sequence = ['#dc7633'])
fig.update_layout(title = 'Boxplot shows Distribution of Low Assets')
fig.show()

In [166]:
# find the range of outliers limit
data = dataset['Low']

# minimum and maximum in the data
min_value = np.min(data)
max_value = np.max(data)

# Calculate the interquartile range (IQR)
Q1 = np.percentile(data, 25)
Q2 = np.percentile(data, 50)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1

# Define the lower and upper bounds for potential outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find potential outliers
outliers = [x for x in data if x < lower_bound or x > upper_bound]

# Print the results
print("Minimum number in the column is ", min_value)
print("Maximum number in the column is ", max_value)
print("First quarter or 25th percentile is ", Q1)
print("Second quarter or 50th percentile is ", Q2)
print("Third quater or 75th percentile is ", Q3)
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Potential Outliers:", outliers)

Minimum number in the column is  3623.35
Maximum number in the column is  36132.76
First quarter or 25th percentile is  8348.88
Second quarter or 50th percentile is  21640.08
Third quater or 75th percentile is  27656.07
Lower Bound: -20611.905000000006
Upper Bound: 56616.855
Potential Outliers: []


In [167]:
# Boxplot for High column
fig = px.box(dataset, x = 'High', points = 'all', color_discrete_sequence = ['#2e4053'])
fig.update_layout(title = 'Boxplot Showing Distribution of High Assests')
fig.show()

In [168]:
# find the range of outliers limit
data = dataset['High']

# minimum and maximum in the data
min_value = np.min(data)
max_value = np.max(data)

# Calculate the interquartile range (IQR)
Q1 = np.percentile(data, 25)
Q2 = np.percentile(data, 50)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1

# Define the lower and upper bounds for potential outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find potential outliers
outliers = [x for x in data if x < lower_bound or x > upper_bound]

# Print the results
print("Minimum number in the column is ", min_value)
print("Maximum number in the column is ", max_value)
print("First quarter or 25th percentile is ", Q1)
print("Second quarter or 50th percentile is ", Q2)
print("Third quater or 75th percentile is ", Q3)
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Potential Outliers:", outliers)

Minimum number in the column is  3742.31
Maximum number in the column is  36389.5
First quarter or 25th percentile is  8479.86
Second quarter or 50th percentile is  21885.33
Third quater or 75th percentile is  28108.02
Lower Bound: -20962.379999999997
Upper Bound: 57550.259999999995
Potential Outliers: []


In [169]:
# dataset
dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Change,Volume
0,2021-02-23,31722.16,31800.9,31597.31,31626.19,-21.38,718191025
1,2021-02-22,31874.78,31958.58,31612.55,31647.57,-203.61,721952658
2,2021-02-19,31748.75,31904.3,31749.43,31851.18,91.36,694795084
3,2021-02-18,32049.85,32104.67,31745.72,31759.82,-288.86,577837595
4,2021-02-17,32166.21,32390.77,32044.01,32048.68,-93.15,701658181


In [170]:
# length of data
len(dataset)

3221

In [171]:
# lets drop date column as we would not use to train the model
final_dataset = dataset.drop(['Date'], axis = 1)

In [172]:
# apply Multi Regression
# split data into dependent and independent variables
x = final_dataset.iloc[:, :-1]
y = final_dataset.iloc[:, -1]

# split data into training and testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.3)

# import linear model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
model = LinearRegression()

# scale the data
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# evaluate model for scaled data
model.fit(x_train_scaled, y_train)
#model.fit(x_train, y_train)
pred = model.predict(x_test_scaled)

# evaluate the model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)
print("The Mean Squared Error is ", mse)
print("The Mean Absolute Error is ", mae)
print("R Square is ", r2)

The Mean Squared Error is  1.5958352383377956e+16
The Mean Absolute Error is  91845139.59986882
R Square is  0.19013213336341395


In [173]:
# make a scatter plot to show how y_test and pred are disperesed
# Create a DataFrame for Plotly
data = {"y_test": y_test, "pred": pred}
df = pd.DataFrame(data)

# Create a scatter plot using Plotly Express
fig = px.scatter(df, x="y_test", y="pred", labels={"y_test": "True Values", "pred": "Predicted Values"},
                 title="True vs Predicted Values Scatter Plot")

# Add a dashed line for reference
fig.update_layout(shapes=[
    dict(
        type='line',
        x0=min(y_test),
        y0=min(y_test),
        x1=max(y_test),
        y1=max(y_test),
        line=dict(dash='dash', color='red')
    )
])

# Show the plot
fig.show()

In [174]:
# Evaluate the model for Overfitting and Underfitting

# predict values for training set
y_pred_train = model.predict(x_train)

# predict values for testing set
y_pred_test = model.predict(x_test)

# evualate using metrics
# find mean_squared_error
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

# find r2
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# print mse and r2
print("Mean Squared Error for training data is ", mse_train)
print("Mean Squared Error for testing data is ", mse_test)
print("R2 for training data is ", r2_train)
print("R2 for testing data is ", r2_test)

Mean Squared Error for training data is  1.6476135128654368e+25
Mean Squared Error for testing data is  1.781217434829501e+25
R2 for training data is  -940385608.1700535
R2 for testing data is  -903947179.3265889



X has feature names, but LinearRegression was fitted without feature names


X has feature names, but LinearRegression was fitted without feature names



As both training and testing errors are very large so here we are encountering underfitting and we need to solve it

In [175]:
# lets explore further by plotting graph
fig = px.scatter(x = y_test, y = y_pred_test, title = "Actual VS Predicted Test Values", color_discrete_sequence = ['lightgreen'], opacity = 0.9)
fig.update_layout(xaxis_title = "Actual Values", yaxis_title = "Predicted Values")
fig.show()

In [176]:
# lets find strong correaltion and then make those features polynomial
correlation_matrix = dataset.corr()
correlation_with_target = correlation_matrix['Volume'].abs().sort_values(ascending=False)
correlation_with_target





Volume    1.000000
Close     0.430720
High      0.430356
Low       0.429827
Open      0.428193
Change    0.125037
Name: Volume, dtype: float64

As we have seen that the strongest correlations are close, high, low, and open, lets make add columns for of their binomials

In [177]:
# to solve problem of underfitting lets make polynomial features of the existing variables
dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Change,Volume
0,2021-02-23,31722.16,31800.9,31597.31,31626.19,-21.38,718191025
1,2021-02-22,31874.78,31958.58,31612.55,31647.57,-203.61,721952658
2,2021-02-19,31748.75,31904.3,31749.43,31851.18,91.36,694795084
3,2021-02-18,32049.85,32104.67,31745.72,31759.82,-288.86,577837595
4,2021-02-17,32166.21,32390.77,32044.01,32048.68,-93.15,701658181


In [178]:
# add new columns which contains square of the required columns
new_cols = {
    'Binomial_open': (dataset['Open'])**2,
    'Binomial_High': (dataset['High'])**2,
    'Binomial_Low': (dataset['Low'])**2,
    'Binomial_Close': (dataset['Close'])**2
}

# concat the new columns into the existing data
dataset = pd.concat([dataset, pd.DataFrame(new_cols)], axis = 1)
dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Change,Volume,Binomial_open,Binomial_High,Binomial_Low,Binomial_Close
0,2021-02-23,31722.16,31800.9,31597.31,31626.19,-21.38,718191025,1006295000.0,1011297000.0,998390000.0,1000216000.0
1,2021-02-22,31874.78,31958.58,31612.55,31647.57,-203.61,721952658,1016002000.0,1021351000.0,999353300.0,1001569000.0
2,2021-02-19,31748.75,31904.3,31749.43,31851.18,91.36,694795084,1007983000.0,1017884000.0,1008026000.0,1014498000.0
3,2021-02-18,32049.85,32104.67,31745.72,31759.82,-288.86,577837595,1027193000.0,1030710000.0,1007791000.0,1008686000.0
4,2021-02-17,32166.21,32390.77,32044.01,32048.68,-93.15,701658181,1034665000.0,1049162000.0,1026819000.0,1027118000.0


In [182]:
# now move volume column to the last
removed_column = dataset.pop('Volume')

dataset['Volume'] = removed_column

dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Change,Binomial_open,Binomial_High,Binomial_Low,Binomial_Close,Volume
0,2021-02-23,31722.16,31800.9,31597.31,31626.19,-21.38,1006295000.0,1011297000.0,998390000.0,1000216000.0,718191025
1,2021-02-22,31874.78,31958.58,31612.55,31647.57,-203.61,1016002000.0,1021351000.0,999353300.0,1001569000.0,721952658
2,2021-02-19,31748.75,31904.3,31749.43,31851.18,91.36,1007983000.0,1017884000.0,1008026000.0,1014498000.0,694795084
3,2021-02-18,32049.85,32104.67,31745.72,31759.82,-288.86,1027193000.0,1030710000.0,1007791000.0,1008686000.0,577837595
4,2021-02-17,32166.21,32390.77,32044.01,32048.68,-93.15,1034665000.0,1049162000.0,1026819000.0,1027118000.0,701658181


In [186]:
# find datatypes
dataset.dtypes

Date              datetime64[ns]
Open                     float64
High                     float64
Low                      float64
Close                    float64
Change                   float64
Binomial_open            float64
Binomial_High            float64
Binomial_Low             float64
Binomial_Close           float64
Volume                     int64
dtype: object

In [187]:
# convert volume into float
dataset['Volume'] = dataset['Volume'].astype(float)

In [190]:
dataset.dtypes

Date              datetime64[ns]
Open                     float64
High                     float64
Low                      float64
Close                    float64
Change                   float64
Binomial_open            float64
Binomial_High            float64
Binomial_Low             float64
Binomial_Close           float64
Volume                   float64
dtype: object

In [195]:
date = dataset.pop('Date')
dataset.head()

Unnamed: 0,Open,High,Low,Close,Change,Binomial_open,Binomial_High,Binomial_Low,Binomial_Close,Volume
0,31722.16,31800.9,31597.31,31626.19,-21.38,1006295000.0,1011297000.0,998390000.0,1000216000.0,718191025.0
1,31874.78,31958.58,31612.55,31647.57,-203.61,1016002000.0,1021351000.0,999353300.0,1001569000.0,721952658.0
2,31748.75,31904.3,31749.43,31851.18,91.36,1007983000.0,1017884000.0,1008026000.0,1014498000.0,694795084.0
3,32049.85,32104.67,31745.72,31759.82,-288.86,1027193000.0,1030710000.0,1007791000.0,1008686000.0,577837595.0
4,32166.21,32390.77,32044.01,32048.68,-93.15,1034665000.0,1049162000.0,1026819000.0,1027118000.0,701658181.0


In [196]:
# now again the apply the regression model
x = dataset.iloc[:, : -1]
y = dataset.iloc[:, -1]


# classify into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.3)

# train model
modell = LinearRegression()
modell.fit(x_train, y_train)
predd = modell.predict(x_test)

r2 = r2_score(y_test, predd)
r2

0.20535868225586518