In [None]:
!pip install fbprophet

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#!pip install fbprophet

import numpy             as np # linear algebra
import pandas            as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn           as sns
import gc

from fbprophet   import Prophet
from datetime    import datetime, date 
from functools   import reduce

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  **Load dataset**

In [None]:
# Root of the competition data; every CSV below is resolved relative to it so
# the location only has to be changed in one place.
DATA_DIR  = '../input/jpx-tokyo-stock-exchange-prediction'
TRAIN_DIR = DATA_DIR + '/train_files'

df_stock_prices           = pd.read_csv(TRAIN_DIR + '/stock_prices.csv')

df_options                = pd.read_csv(TRAIN_DIR + '/options.csv')

df_secondary_stock_prices = pd.read_csv(TRAIN_DIR + '/secondary_stock_prices.csv')

df_trades                 = pd.read_csv(TRAIN_DIR + '/trades.csv')

df_financials             = pd.read_csv(TRAIN_DIR + '/financials.csv')

df_stock_list             = pd.read_csv(DATA_DIR + '/stock_list.csv')


In [None]:
gc.collect()

# **Function**

In [None]:
def explore_data(df):
    """Print a quick textual overview of ``df``.

    Emits, in order: the per-column count of missing values, the result of
    ``df.describe(include='all')`` and the ``df.info()`` summary, with blank
    separator lines in between.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    print("Missing values: \n", df.isnull().sum())
    print("\n \n")
    print(df.describe(include='all'))
    print("\n \n")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()
        

In [None]:
def plot_missing_val(df: pd.DataFrame):
    """Plot the percentage of missing values for each column of ``df``.

    Columns without any missing value are left out of the chart and the
    remaining ones are sorted by their missing percentage (descending order is
    requested from ``sort_values``). When the frame contains no missing value
    at all, nothing is plotted and ``'No NAs found'`` is printed instead.
    """
    null_counts = df.isnull().sum()

    # Guard clause: nothing to draw when the frame is complete.
    if null_counts.sum() == 0:
        print('No NAs found')
        return

    pct_missing = (null_counts / len(df)) * 100
    complete_columns = pct_missing[pct_missing == 0].index
    pct_missing = pct_missing.drop(complete_columns).sort_values(ascending=False)

    chart_data = pd.DataFrame({'Missing Ratio %': pct_missing})
    chart_data.plot(kind="barh", figsize=(10, 5))
    plt.title("Percentage of missing values per feature")
    plt.show()

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/32/64 or float16/32/64 are inspected;
    every other column is left untouched. Integer columns are cast to the
    first of int8/int16/int32/int64 whose range contains the column's min and
    max (bounds inclusive). Float columns are cast to float16 or float32 when
    their min and max fit, otherwise to float64.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : bool, default True
        When true, print the memory usage after the conversion and the
        relative reduction. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme equals the dtype
                # limit (e.g. 127 for int8) still fits that dtype.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): only the value range is checked here, not the
            # precision; float16 stores far fewer significant digits than
            # float64, so downcast values may be rounded - confirm this is
            # acceptable for the columns involved.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit a narrower float (or min/max are NaN).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Avoid a division by zero if the frame reports no memory at all.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Concatenate dataframes

* Find shared keys

In [None]:
# Each entry pairs the message printed before the result with the frame whose
# columns are intersected with those of df_stock_prices.
column_comparisons = [
    (" Common columns between df_stock_prices and df_options -->", df_options),
    ("\n Common columns between df_stock_prices and df_trades -->", df_trades),
    (" \n Common columns between df_stock_prices and df_secondary_stock_prices -->", df_secondary_stock_prices),
    ("\n Common columns between df_stock_prices and df_financials -->", df_financials),
    ("\n Common columns between df_stock_prices and df_stock_list -->", df_stock_list),
]

for message, other_df in column_comparisons:
    print(message, np.intersect1d(df_stock_prices.columns, other_df.columns))
      

In [None]:
# Number of rows of every loaded frame, keyed by the label shown on the x axis.
row_counts = {
    "df_stock_prices": df_stock_prices.shape[0],
    "df_options": df_options.shape[0],
    "df_secondary_stock_prices": df_secondary_stock_prices.shape[0],
    "df_trades": df_trades.shape[0],
    "df_financials": df_financials.shape[0],
    "df_stock_list": df_stock_list.shape[0],
}
name_dfs      = list(row_counts.keys())
list_size_dfs = list(row_counts.values())

# One bar per frame, bar height = row count.
fig = plt.figure(figsize=(10, 5))
plt.bar(name_dfs, list_size_dfs, color='maroon', width=0.4)
plt.xlabel('Dataframe', fontsize='15')
plt.ylabel('Shape of dataframe', fontsize='15')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Stack the secondary price rows below the primary ones. pd.concat replaces
# DataFrame.append, which is deprecated and no longer exists in pandas 2.x;
# like append, it keeps each frame's original index labels.
df_stock_new = pd.concat([df_stock_prices, df_secondary_stock_prices])

In [None]:
df_stock_new = reduce_mem_usage(df_stock_new)

In [None]:
gc.collect()

# **Data exploration**

In [None]:
df_stock_new.head()

In [None]:
df_stock_new.describe(include='all')

In [None]:
df_stock_new.info()

In [None]:
plot_missing_val(df_stock_new)

**Delete features with more than 90% missing values**

In [None]:
# Percentage of missing values at which a column is discarded.
threshold = 90.0

# dropna(thresh=...) keeps a column only if it has at least `thresh` non-missing
# values, so translate the missing-value percentage into the minimum number of
# non-missing rows a column needs in order to be kept.
min_count = int(((100 - threshold) / 100) * df_stock_new.shape[0] + 1)
df_stock_new = df_stock_new.dropna(axis=1, thresh=min_count)

In [None]:
gc.collect()

In [None]:
# Keep only the rows that have no missing value in any remaining column.
df_stock_new = df_stock_new.dropna()

In [None]:
print("number of missing values: ", df_stock_new.isnull().sum().sum())

In [None]:
# Convert the Date column to pandas datetimes using an explicit format.
# NOTE(review): the format expects '/' separators - confirm it matches the date
# strings in the raw CSV, since strict format matching fails on a mismatch.
df_stock_new['Date'] = pd.to_datetime(df_stock_new['Date'], format = '%Y/%m/%d')
# Preview the first rows with the converted Date column highlighted in yellow.
df_stock_new.head().style.set_properties(subset=['Date'], **{'background-color': 'yellow'})

In [None]:
df_stock_new.plot(x='Date', y='Volume', figsize=(10,5), legend=True);

In [None]:
gc.collect()

In [None]:

# Encode the boolean SupervisionFlag values as 1 (True) / 0 (False).
df_stock_new['SupervisionFlag'] = df_stock_new['SupervisionFlag'].replace({True: 1, False: 0})

In [None]:
gc.collect()

In [None]:
df_stock_new = reduce_mem_usage(df_stock_new)

In [None]:
df_stock_new.head()

# **Data visualization**

In [None]:
df_stock_new.boxplot(rot=45, figsize=(10,5));

In [None]:
gc.collect()

# **Train/Test Split**

In [None]:
# Percentage of rows (oldest first) used for training; the rest is the test set.
TRAIN_PCT = 80

# df_stock_new was built by stacking two price tables, so all rows of the
# second table follow all rows of the first. Sort by date before cutting so the
# hold-out is the most recent part of the data rather than the tail of the
# second table. A stable sort keeps the existing order within each date.
df_stock_sorted = df_stock_new.sort_values('Date', kind='stable')
train_size = len(df_stock_sorted) * TRAIN_PCT // 100

# .copy() detaches both splits from the parent frame, so later column renames
# and drops operate on independent frames instead of slices.
df_train = df_stock_sorted.iloc[:train_size].copy()
df_test = df_stock_sorted.iloc[train_size:].copy()

# The sorted intermediate is no longer needed once both splits are copied.
del df_stock_sorted

In [None]:
# Draw the Volume of the train split, then overlay the test split on the same
# axes so both parts of the split are visible in one figure.
ax = df_train.plot(x='Date', y='Volume', figsize=(10,5), legend=True)
df_test.plot(x='Date', y='Volume', ax=ax)
# Label the two series in plotting order.
ax.legend(["Train", "Test"]);

In [None]:
# Format data for Prophet: the model is fitted on a date column named 'ds' and
# a target column named 'y'. Rebinding the result (instead of inplace=True)
# avoids mutating a slice of df_stock_new in place.
df_train = df_train.rename(columns={'Date': 'ds', 'Target': 'y'})


In [None]:
# RowId is not registered as a regressor, so remove it from the training frame.
# Rebinding the result avoids an in-place drop on a slice of df_stock_new.
df_train = df_train.drop(columns=['RowId'])

In [None]:
df_train.head()

# **Prophet model**

In [None]:
gc.collect()

In [None]:
# Columns of df_train registered as additional regressors, in registration order.
regressor_columns = [
    'SecuritiesCode',
    'Open',
    'High',
    'Low',
    'Close',
    'Volume',
    'AdjustmentFactor',
    'SupervisionFlag',
]

model = Prophet()
for regressor_name in regressor_columns:
    model.add_regressor(regressor_name)
model.fit(df_train)

In [None]:
gc.collect()

In [None]:
# Ask the fitted model for its future-date frame with periods=365.
# NOTE(review): future_data is not used by any later cell shown here, and it
# presumably lacks the regressor columns registered on the model - confirm
# whether this cell is still needed.
future_data = model.make_future_dataframe(periods=365)

In [None]:
# df_test still carries the raw column names, whereas the model was fitted on
# df_train after 'Date' was renamed to 'ds', 'Target' to 'y' and 'RowId' was
# removed. Apply the same renaming here, then drop the label and the id column
# so test_X holds 'ds' plus the regressor columns. (Dropping 'y' directly from
# the un-renamed frame raised a KeyError.)
test_X = (
    df_test
    .rename(columns={'Date': 'ds', 'Target': 'y'})
    .drop(columns=['y', 'RowId'])
)

In [None]:
#forecast the data for Test  data
forecast_data = model.predict(test_X)
model.plot(forecast_data);

In [None]:
#forecast = model.predict(future)

# To do:
    


* perform normalisation

* hyperparameter tuning




