# Import Libraries and Load the Data

## First, import the necessary libraries and load the dataset into a pandas DataFrame.

In [161]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt # For visualizing data.
%matplotlib inline
import datetime

## Load the dataset

In [162]:
# Read the raw bank CSV from the project's data folder into a DataFrame.
bank_datadf = pd.read_csv(filepath_or_buffer='../data/myusabank.csv')

## Preview the dataset

In [163]:
# Work on a copy of the frame loaded in the previous cell instead of reading the
# CSV from disk a second time; bank_datadf stays untouched as the raw reference.
df = bank_datadf.copy()

# Preview the first five rows.
df.head(5)

Unnamed: 0,Date,Interest_Income,Interest_Expense,Average_Earning_Assets,Net_Income,Total_Assets,Shareholder_Equity,Operating_Expenses,Operating_Income,Market_Share,Stock_Price
0,1/3/2022,2121958.0,773237,55001045,1572212.0,95404302,23496605,811672,3494178,24,128
1,1/4/2022,2671155.0,518070,54463418,1336326.0,86440010,24948600,1030817,3231496,20,63
2,1/5/2022,2131932.0,797921,59771802,1224137.0,88674163,26416438,1345961,3692148,22,196
3,1/6/2022,2365838.0,556958,54345760,1452268.0,97221407,29694095,1289921,4779685,14,177
4,1/7/2022,2259178.0,1746222,57153768,3858336.0,98279553,25311499,1432303,4764985,10,103


In [164]:
# Preview the last five rows to see how the file ends.
df.tail(n=5)

Unnamed: 0,Date,Interest_Income,Interest_Expense,Average_Earning_Assets,Net_Income,Total_Assets,Shareholder_Equity,Operating_Expenses,Operating_Income,Market_Share,Stock_Price
518,3/3/2023,2513153.0,551934,55397662,1554736.0,96040922,21265898,1068252,4022145,18,169
519,8/23/2022,,866329,50421240,,82229180,21551330,921373,3842210,23,53
520,5/4/2022,2106530.0,736175,51910705,1331770.0,98827165,29454531,973260,3833525,20,123
521,1/18/2023,2879989.0,561476,52562469,1810694.0,83138744,25418952,1141416,4241968,26,142
522,3/23/2023,2811774.0,737027,58204298,1793719.0,82627144,22164412,1160244,4478768,14,106


## Find data information

In [165]:
# Summarise the frame: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    523 non-null    object 
 1   Interest_Income         471 non-null    float64
 2   Interest_Expense        523 non-null    int64  
 3   Average_Earning_Assets  523 non-null    int64  
 4   Net_Income              471 non-null    float64
 5   Total_Assets            523 non-null    int64  
 6   Shareholder_Equity      523 non-null    int64  
 7   Operating_Expenses      523 non-null    int64  
 8   Operating_Income        523 non-null    int64  
 9   Market_Share            523 non-null    int64  
 10  Stock_Price             523 non-null    int64  
dtypes: float64(2), int64(8), object(1)
memory usage: 45.1+ KB


## Find the Number of Rows and Columns

In [166]:
# (number of rows, number of columns) of the frame as loaded, before any cleaning.
df.shape

(523, 11)

# Data Cleaning and Preprocessing
 

## Check for null values

In [167]:
# Count missing values per column. As the cell's last expression the resulting
# Series is displayed directly, so it is computed once and no print() is needed.
df.isnull().sum()

Date                       0
Interest_Income           52
Interest_Expense           0
Average_Earning_Assets     0
Net_Income                52
Total_Assets               0
Shareholder_Equity         0
Operating_Expenses         0
Operating_Income           0
Market_Share               0
Stock_Price                0
dtype: int64


### Drop null values

In [168]:
# Drop every row that has at least one missing value. Rebinding df to the
# returned frame (rather than inplace=True) keeps the step explicit and chainable.
df = df.dropna()

### Check the data after dropping null values

In [169]:
# Verify the drop: no missing value may remain in any column.
assert not df.isnull().values.any(), "null values remain after dropna()"

# Show the new (rows, columns) so the reduction in row count is visible.
df.shape

(471, 11)

## Handling Duplicates 
Check for duplicates and remove them if present

### Check duplicates

In [170]:
# duplicated() flags every row that repeats an earlier one (the first occurrence
# is not flagged), so summing the boolean mask gives the number of duplicate rows.
is_repeat = df.duplicated()
duplicates = is_repeat.sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 21


### Remove duplicates

In [171]:
# Remove rows that exactly repeat an earlier row (the first occurrence is kept).
# .copy() gives df its own data, so later column assignments on df do not raise
# pandas' SettingWithCopyWarning.
df = df.drop_duplicates().copy()

### Confirm that no duplicates remain

In [172]:
# Re-count duplicate rows after drop_duplicates(); the expected result is 0.
df.duplicated().sum()

np.int64(0)

## List all column names

In [173]:
# Column labels of the cleaned frame.
df.columns

Index(['Date', 'Interest_Income', 'Interest_Expense', 'Average_Earning_Assets',
       'Net_Income', 'Total_Assets', 'Shareholder_Equity',
       'Operating_Expenses', 'Operating_Income', 'Market_Share',
       'Stock_Price'],
      dtype='object')

## Check Data Type

In [174]:
# dtype of every column; 'Date' is still object (text) here and is converted further down.
df.dtypes

Date                       object
Interest_Income           float64
Interest_Expense            int64
Average_Earning_Assets      int64
Net_Income                float64
Total_Assets                int64
Shareholder_Equity          int64
Operating_Expenses          int64
Operating_Income            int64
Market_Share                int64
Stock_Price                 int64
dtype: object

### Check skewness 

In [175]:
# Skewness of each numeric column. numeric_only expects a boolean; the string
# 'True' only worked because any non-empty string is truthy ('False' would too).
df.skew(numeric_only=True)

Interest_Income           0.066883
Interest_Expense          3.670733
Average_Earning_Assets   -0.033531
Net_Income                2.599736
Total_Assets              0.007236
Shareholder_Equity        0.159052
Operating_Expenses        0.040829
Operating_Income          0.059995
Market_Share              0.014802
Stock_Price               0.021435
dtype: float64

The `describe()` method returns summary statistics (count, mean, standard deviation, minimum, quartiles and maximum) for the numeric columns of the DataFrame.

In [176]:
# Count, mean, std, min, quartiles and max for each numeric column.
df.describe()

Unnamed: 0,Interest_Income,Interest_Expense,Average_Earning_Assets,Net_Income,Total_Assets,Shareholder_Equity,Operating_Expenses,Operating_Income,Market_Share,Stock_Price
count,450.0,450.0,450.0,450.0,450.0,450.0,450.0,450.0,450.0,450.0
mean,2493105.0,790187.2,55142760.0,1589052.0,89911460.0,24825500.0,1099544.0,3985466.0,19.464444,126.066667
std,292616.9,329599.2,2887246.0,487806.5,5594602.0,2857464.0,224338.1,580294.8,5.755588,42.620091
min,2002869.0,501015.0,50064320.0,1007906.0,80100730.0,20019680.0,706530.0,3000159.0,10.0,50.0
25%,2235993.0,612999.2,52694450.0,1260571.0,84885100.0,22323430.0,903144.2,3512030.0,14.0,92.0
50%,2475174.0,720297.5,55281070.0,1553584.0,89880520.0,24484910.0,1110432.0,3942739.0,19.5,124.0
75%,2742374.0,867199.2,57586660.0,1793312.0,94601570.0,27299900.0,1284741.0,4514281.0,24.0,162.0
max,2999890.0,2967591.0,59988210.0,3999402.0,99951320.0,29992150.0,1497622.0,4999284.0,29.0,199.0


## Data Type Conversion

#### Convert 'Date' column to datetime format.
Ensure the Date column is in datetime format.

In [177]:
# Parse the text dates into datetime64 values. assign() returns a new frame with
# the 'Date' column replaced in its original position, so nothing is written
# into a filtered slice and pandas does not emit a SettingWithCopyWarning.
df = df.assign(Date=pd.to_datetime(df['Date']))

## Scaling and Normalization

In [182]:
# No scaling is performed in this cell; it only records the ten numeric columns
# (every column except 'Date') that a scaler would later be fitted on.
columns_to_scale = [
    'Interest_Income',
    'Interest_Expense',
    'Average_Earning_Assets',
    'Net_Income',
    'Total_Assets',
    'Shareholder_Equity',
    'Operating_Expenses',
    'Operating_Income',
    'Market_Share',
    'Stock_Price',
]

# Remove columns

## Select only the 'Date' and 'Stock_Price' columns

In [179]:
# Make sure 'Date' is datetime64 (a no-op if it was already converted above).
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning.
df = df.assign(Date=pd.to_datetime(df['Date']))

# Date-range placeholders: still template strings and not used by this cell.
# TODO: replace with real 'YYYY-MM-DD' bounds before filtering on them.
start_date = 'YYYY-MM-DD'
end_date = 'YYYY-MM-DD'

# Keep only the date and the stock price. .copy() makes bank_df an independent
# frame, so later edits to it neither touch df nor raise SettingWithCopyWarning.
bank_df = df[['Date', 'Stock_Price']].copy()

# Display the first few rows of the selected data
bank_df.head()

Unnamed: 0,Date,Stock_Price
0,2022-01-03,128
1,2022-01-04,63
2,2022-01-05,196
3,2022-01-06,177
4,2022-01-07,103


In [180]:
# Last five rows of the two-column frame.
bank_df.tail(n=5)

Unnamed: 0,Date,Stock_Price
494,2023-12-22,195
495,2023-12-26,104
496,2023-12-27,169
497,2023-12-28,124
498,2023-12-29,70


In [181]:
# Confirm the dtypes of the two selected columns.
bank_df.dtypes

Date           datetime64[ns]
Stock_Price             int64
dtype: object