In [1]:
import os, pickle
import numpy as np 
import pandas as pd
import warnings, logging
from transformers import pipeline
from datasets import load_dataset
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from sklearn.preprocessing import StandardScaler
logging.getLogger("tensorflow").setLevel(logging.WARNING)


import tensorflow as tf
tf.get_logger().setLevel('ERROR')

In [2]:
# Record the framework versions this notebook was executed with.
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")

Tensorflow version: 2.17.0
Keras version: 3.4.1


### Load News Model

In [3]:
# News corpus used for per-industry sentiment scoring.
dataset_news = load_dataset("dilkasithari-IT/news_data")

# Distinct industry labels found in the train split; used to validate requests.
all_industries_news = list(set(dataset_news['train']['industry']))

# Fine-tuned sentiment classifier. NOTE(review): device=0 pins the first
# accelerator; confirm this notebook is only ever run on a GPU host.
pipe_news = pipeline(
    task="text-classification",
    model="dilkasithari-IT/fine-tuned-twitter-roberta-base-sentiment-latest",
    device=0,
)

### Load Import / Export Models

In [4]:
# Industries for which import/export forecasting artifacts are available.
industries = [
            'Agricultural Raw Materials',
            'Consumer Goods',
            'Transportation',
            'Food Industry',
            'Capital Goods'
            ]

# Path templates; the placeholder is filled with a file or folder name.
model_dir = "models/ImpExp/{}"
data_dir = "data/ImpExp/{}"

# industry -> {'Imp Model': keras model, 'Exp Model': keras model, 'Scaler': fitted scaler}
model_dict_impexp = {}

for industry in industries:
    industry_dict = {}
    industry_dict['Imp Model'] = tf.keras.models.load_model(
                                                            model_dir.format(f"{industry} Imp.h5"),
                                                            custom_objects={'mse': 'mse'}
                                                            )
    industry_dict['Exp Model'] = tf.keras.models.load_model(
                                                            model_dir.format(f"{industry} Exp.h5"),
                                                            custom_objects={'mse': 'mse'}
                                                            )
    # Use a context manager so the file handle is closed deterministically
    # (the bare open() previously leaked one handle per industry).
    # SECURITY: pickle executes arbitrary code on load; only load trusted artifacts.
    with open(model_dir.format(f"{industry} Scalar.pickle"), 'rb') as scaler_file:
        industry_dict['Scaler'] = pickle.load(scaler_file)

    model_dict_impexp[industry] = industry_dict



### Stocks Model

In [5]:
def load_scalars_stocks(scalar_path):
    """Unpickle the two-element (input scaler, output scaler) pair stored at ``scalar_path``.

    Returns:
        tuple: ``(scaler_x, scaler_y)`` exactly as stored in the pickle.

    Raises:
        ValueError / TypeError: if the pickled object does not unpack into two items.
    """
    with open(scalar_path, 'rb') as handle:
        scaler_pair = pickle.load(handle)
    scaler_x, scaler_y = scaler_pair
    return scaler_x, scaler_y

def get_company_list_stocks(data_dir : str = 'data/Stocks/csvs/'):
    """List the company symbols that have a price-history CSV in ``data_dir``.

    Only files named ``<symbol>.N0000.csv`` or ``<symbol>.X0000.csv`` are
    considered; any other directory entry is ignored so it cannot be mistaken
    for a company symbol. A symbol that has both share-class files is returned
    once.

    Args:
        data_dir: Directory to scan.

    Returns:
        list[str]: Symbols with the share-class suffix and ``.csv`` removed,
        in the order ``os.listdir`` yields them.
    """
    share_class_suffixes = ('.N0000.csv', '.X0000.csv')
    symbols = []
    for file_name in os.listdir(data_dir):
        for suffix in share_class_suffixes:
            if file_name.endswith(suffix):
                symbols.append(file_name[:-len(suffix)])
                break
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(symbols))

def _resolve_share_class_path(directory, company, extension):
    """Build ``<directory>/<company>.N0000.<extension>``, falling back to X0000 if that file is absent."""
    path = os.path.join(directory, f'{company}.N0000.{extension}')
    path = path.replace('\\', '/')

    if not os.path.exists(path):
        path = path.replace('N0000', 'X0000')
    return path


def load_ts_models_scalars_stocks(
                                model_dir : str = 'models/Stocks/',
                                scalar_dir : str = 'models/Stocks/'
                                ):
    """Load the stock forecasting model and scaler pair for every known company.

    The company list comes from ``get_company_list_stocks()`` (its default data
    directory). For each company the ``N0000`` artifact is preferred and the
    ``X0000`` one is used when the former does not exist.

    Args:
        model_dir: Directory containing ``<company>.<class>.h5`` model files.
        scalar_dir: Directory containing ``<company>.<class>.pkl`` scaler files.

    Returns:
        tuple[dict, dict]: ``(model_dict, scalar_dict)`` keyed by company symbol;
        ``scalar_dict`` values are the ``(scaler_x, scaler_y)`` pairs returned by
        ``load_scalars_stocks``.

    Raises:
        Whatever ``tf.keras.models.load_model`` / ``open`` raise when neither
        share-class artifact exists for a company.
    """
    company_list = get_company_list_stocks()
    model_dict, scalar_dict = {}, {}

    for company in company_list:
        model_path = _resolve_share_class_path(model_dir, company, 'h5')
        model = tf.keras.models.load_model(
                                            model_path,
                                            custom_objects={
                                                            'mse': 'mse',
                                                            'mae': 'mae'
                                                            }
                                            )
        # Recompile with explicit loss/metrics after loading.
        model.compile(
                    loss='mean_squared_error',
                    optimizer='adam',
                    metrics=['mae','mae','mae','mae']
                    )
        # The former `model.compiled_metrics == None` line was a comparison whose
        # result was discarded (not an assignment), so it has been removed.

        model_dict[company] = model

        scalar_path = _resolve_share_class_path(scalar_dir, company, 'pkl')
        scalar_dict[company] = load_scalars_stocks(scalar_path)

    return model_dict, scalar_dict

# Eagerly load every per-company stock model and scaler pair (default directories).
model_dict_stocks, scalar_dict_stocks = load_ts_models_scalars_stocks()



### Annual Reports

In [6]:
# Annual-report forecasting model: load the saved network, then recompile it
# with an explicit optimizer and loss.
model_annual = tf.keras.models.load_model(
    "models/Annual/annual_report_model.h5",
    custom_objects={'mse': 'mse'},
)
model_annual.compile(optimizer='adam', loss='mse')



### Loan Model

In [7]:
# Loan classifier and the scaler applied to its input features.
# SECURITY: pickle executes arbitrary code on load; only load trusted artifacts.
with open('models/Loan/model.pkl', 'rb') as loan_model_file:
    model_loan = pickle.load(loan_model_file)

with open('models/Loan/scalar.pkl', 'rb') as loan_scaler_file:
    scalar_loan = pickle.load(loan_scaler_file)

### Utility Functions

In [8]:
def get_industries_news(specific_industry):
    """Return ``dataset_news`` restricted to one industry, with ``combined_text`` renamed to ``text``."""
    def _is_requested_industry(example):
        return example['industry'] == specific_industry

    return (
        dataset_news
        .filter(_is_requested_industry)
        .rename_column('combined_text', 'text')
    )

def inference_news(specific_industry):
    """Score news sentiment for an industry.

    Two industry names used elsewhere in this notebook are translated to the
    labels the news dataset uses before lookup. The result is the fraction of
    that industry's news items the classifier labels ``'positive'``.

    Returns:
        dict: ``{"Sentiment Score": <fraction in [0, 1]>}``.

    Raises:
        AssertionError: if the (translated) industry is not in the dataset.
    """
    dataset_aliases = (
        ('Consumer Goods', 'Consumer Goods Industry'),
        ('Food Industry', 'Food Products'),
    )
    for notebook_name, dataset_name in dataset_aliases:
        if specific_industry == notebook_name:
            specific_industry = dataset_name
            break

    assert specific_industry in all_industries_news, f"Industry {specific_industry} not found in the dataset"

    industry_news = get_industries_news(specific_industry)
    predictions = pipe_news(
        inputs=industry_news['train']['text'],
        batch_size=50,
    )
    positive_count = sum(1 for prediction in predictions if prediction['label'] == 'positive')
    return {"Sentiment Score": positive_count / len(predictions)}

In [9]:
def load_industry_data(industry):
    """Assemble the scaled yearly trade features for one industry.

    Reads every ``.xlsx`` workbook in ``data_dir.format(industry)``, takes the
    single 'World' row of each workbook's 'Partner' sheet, concatenates them,
    fills missing values with the column mean, sorts by 'Year', drops 'Year',
    and standardises the remaining columns.

    Side effect: a new ``StandardScaler`` is fitted on this data and written to
    ``model_dir.format(f"{industry} Scalar.pickle")``.
    NOTE(review): this overwrites the pickled scaler on every call, while
    ``predict_ts_on_industry_impexp`` keeps using the copy loaded earlier into
    ``model_dict_impexp`` — confirm that refitting at inference time is intended.

    Returns:
        pandas.DataFrame: standardised features, one row per workbook, ordered by year.

    Raises:
        AssertionError: if a workbook does not contain exactly one 'World' row
            (the message mentions "more than one", but zero rows trips it too).
    """
    feature_columns = [
        'Year',
        'Export (US$ Thousand)',
        'Import (US$ Thousand)',
        'Revealed comparative advantage',
        'World Growth (%)', 'Country Growth (%)',
        'AHS Simple Average (%)', 'AHS Dutiable Tariff Lines Share (%)',
    ]

    industry_folder = data_dir.format(industry)
    workbook_paths = [
        industry_folder + "/" + file_name
        for file_name in os.listdir(industry_folder)
        if file_name.endswith(".xlsx")
    ]

    world_rows = []
    for workbook_path in workbook_paths:
        partner_sheet = pd.read_excel(workbook_path, sheet_name='Partner')
        partner_sheet['Partner Name'] = partner_sheet['Partner Name'].str.strip()
        world_only = partner_sheet[partner_sheet['Partner Name'] == 'World']
        assert len(world_only) == 1, "More than one world data found"
        world_rows.append(world_only[feature_columns].reset_index(drop=True))

    combined = pd.concat(world_rows).reset_index(drop=True)
    # Mean-impute before the float cast, as the column means are taken on the raw frame.
    combined = combined.fillna(combined.mean())
    combined = (
        combined
        .astype(float)
        .sort_values(by='Year')
        .drop(columns='Year')
    )

    scaler = StandardScaler().fit(combined)

    with open(model_dir.format(f"{industry} Scalar.pickle"), 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    return pd.DataFrame(
        scaler.transform(combined),
        columns=combined.columns,
    )

def _inverse_scale_first_feature(scaler, prediction, n_features):
    """Undo scaling of a single-valued prediction using the scaler's first-column statistics."""
    # The scaler expects n_features columns, so the lone prediction is repeated
    # across one row; only column 0 of the inverse-transformed row is read.
    padded_row = np.repeat(np.asarray(prediction).reshape(1, 1), n_features, axis=1)
    return scaler.inverse_transform(padded_row)[0][0]


def predict_ts_on_industry_impexp(
                                industry,
                                selective_window=7
                                ):
    """Forecast the percentage change in imports and exports for an industry.

    The last ``selective_window`` rows of the scaled industry data are fed to
    the industry's import and export models. Each single-valued prediction is
    inverse-scaled and compared with the mean of the corresponding
    inverse-scaled history column over the same window.

    The prediction is padded to the scaler's feature count rather than to
    ``selective_window``; the previous padding only worked when the window
    length happened to equal the number of features.

    NOTE(review): ``load_industry_data`` orders its columns as Export, then
    Import, yet this function inverse-scales both predictions with column 0 and
    reads column 0 as the import history and column 1 as the export history.
    The indices are preserved as-is; confirm them against the training code.

    Args:
        industry: Key into ``model_dict_impexp``.
        selective_window: Number of most recent rows used as model input.

    Returns:
        dict: ``{"Import Change": <percent>, "Export Change": <percent>}``.
    """
    industry_dict = model_dict_impexp[industry]
    scaler = industry_dict['Scaler']
    imp_model = industry_dict['Imp Model']
    exp_model = industry_dict['Exp Model']

    df = load_industry_data(industry)
    df_tail = df.tail(selective_window)

    x = df_tail.values
    x = x.reshape(1, x.shape[0], x.shape[1])
    n_features = x.shape[2]

    p_imp = _inverse_scale_first_feature(scaler, imp_model.predict(x), n_features)
    p_exp = _inverse_scale_first_feature(scaler, exp_model.predict(x), n_features)

    df_tail_inv = scaler.inverse_transform(df_tail)
    imp_data = df_tail_inv[:, 0]
    exp_data = df_tail_inv[:, 1]

    avg_imp = np.mean(imp_data)
    avg_exp = np.mean(exp_data)

    # Percentage change of the forecast relative to the recent average.
    p_imp = ((p_imp - avg_imp) / avg_imp) * 100
    p_exp = ((p_exp - avg_exp) / avg_exp) * 100

    return {
            "Import Change" : p_imp,
            "Export Change" : p_exp
            }

In [10]:
def inference_ts_on_company_stocks(            
                                model_dict : dict,
                                scalar_dict : dict,                 
                                company_symbol : str,
                                selective_window : int = 30 * 6,
                                data_path : str = 'data/Stocks/csvs/{}.{}.csv',
                                input_columns = [
                                                "Open (Rs.)",
                                                "High (Rs.)",
                                                "Low (Rs.)",
                                                "Close (Rs.)",
                                                "TradeVolume",
                                                "ShareVolume",
                                                "Turnover (Rs.)"
                                                ]
                                ):
    """Forecast percentage changes in price/volume figures for one company.

    The company's CSV is cleaned (NaN rows and duplicates dropped), sorted by
    'Trade Date', and the last ``selective_window`` rows are scaled and passed
    to the company's model. The predictions are reshaped to four columns,
    inverse-scaled and made non-negative, and the mean of each column is
    compared with the mean of the matching history column over the same window.

    Args:
        model_dict: company symbol -> model exposing ``predict(x, verbose=0)``.
        scalar_dict: company symbol -> ``(scalar_x, scalar_y)`` pair.
        company_symbol: Symbol without the share-class suffix.
        selective_window: Number of most recent trading rows to use.
        data_path: Template taking ``(symbol, share_class)``; ``N0000`` is tried
            before ``X0000``.
        input_columns: Feature columns fed to the model (not modified).

    Returns:
        dict: percentage changes keyed by 'Change Open Price (%)',
        'Change Close Price (%)', 'Change Trade Volume (%)', 'Change Turnover (%)'.

    Raises:
        KeyError: if ``company_symbol`` is missing from either dict.
        FileNotFoundError: if neither share-class CSV exists. (Previously an
            ``AssertionError`` was built but never raised, so the failure
            surfaced later as an unbound-variable error.)
    """
    model = model_dict[company_symbol]
    scalar_x, scalar_y = scalar_dict[company_symbol]

    comp_ext = next(
        (ext for ext in ('N0000', 'X0000')
         if os.path.exists(data_path.format(company_symbol, ext))),
        None,
    )
    if comp_ext is None:
        raise FileNotFoundError(
            f"No stock data file found for {company_symbol!r} "
            f"(tried {data_path.format(company_symbol, 'N0000')} and "
            f"{data_path.format(company_symbol, 'X0000')})"
        )

    df = pd.read_csv(data_path.format(company_symbol, comp_ext))
    df = df.dropna()
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)

    df['Trade Date'] = pd.to_datetime(df['Trade Date'], format='%m/%d/%y')
    df = df.sort_values(by='Trade Date', ascending=True)
    df = df.tail(selective_window)
    df = df.reset_index(drop=True)

    x = scalar_x.transform(df[input_columns].values)
    x = x.reshape(1, x.shape[0], x.shape[1])

    p = model.predict(x, verbose=0)
    p = np.asarray(p).reshape(-1, 4)
    p = scalar_y.inverse_transform(p)
    p = np.abs(p)

    # (result key, history column) in the order of the model's four output columns.
    targets = (
        ('Change Open Price (%)', 'Open (Rs.)'),
        ('Change Close Price (%)', 'Close (Rs.)'),
        ('Change Trade Volume (%)', 'TradeVolume'),
        ('Change Turnover (%)', 'Turnover (Rs.)'),
    )

    changes = {}
    for column_index, (result_key, history_column) in enumerate(targets):
        avg_predicted = np.mean(p[:, column_index])
        avg_history = np.mean(df[history_column].values)
        changes[result_key] = ((avg_predicted - avg_history) / avg_history) * 100
    return changes


def inference_on_industry(
                        industry,
                        company_industry_file : str = 'data/Stocks/Company Industry groups V3.csv'
                        ):
    """Average the per-company stock forecasts over all companies in an industry.

    Companies are looked up in ``company_industry_file`` ('Industry' and
    'Symbol' columns, share-class suffix stripped). Each is scored with
    ``inference_ts_on_company_stocks``; companies that fail are skipped, and
    the failure is now logged as a warning instead of being silently discarded.
    Rows whose open-price change is 100% or more are excluded before averaging.

    Returns:
        dict: mean percentage changes keyed by 'Change Open Price (%)',
        'Change Close Price (%)', 'Change Trade Volume (%)', 'Change Turnover (%)'.
        Values are NaN when no company produced a usable result.
    """
    metric_names = [
        'Change Open Price (%)',
        'Change Close Price (%)',
        'Change Trade Volume (%)',
        'Change Turnover (%)',
    ]

    # .copy() so the Symbol assignment below does not write into a slice of the raw frame.
    df = pd.read_csv(company_industry_file)[['Industry', 'Symbol']].copy()
    df['Symbol'] = df['Symbol'].apply(lambda x: x.replace('.N0000', '').replace('.X0000', ''))
    company_list = df.loc[df['Industry'] == industry, 'Symbol'].values

    results = {name: [] for name in metric_names}
    logger = logging.getLogger(__name__)

    for company in company_list:
        try:
            result = inference_ts_on_company_stocks(model_dict_stocks, scalar_dict_stocks, company)
            # Collect the whole row first so a partial failure cannot leave
            # the per-metric lists with different lengths.
            row = [result[name] for name in metric_names]
        except Exception as exc:
            # Best-effort: keep going, but make the skipped company visible.
            logger.warning("Skipping stock inference for %s: %r", company, exc)
            continue
        for name, value in zip(metric_names, row):
            results[name].append(value)

    df_results = pd.DataFrame(results)
    # Discard rows with an open-price change of 100% or more before averaging.
    df_results = df_results[df_results['Change Open Price (%)'] < 100]

    return {name: np.mean(df_results[name]) for name in metric_names}

In [11]:
def predict_on_industry_annual(industry):
    """Forecast percentage changes in annual-report figures for an industry.

    Reads ``data/AnnualReports/CSV/<industry>.csv``, keeps the three most recent
    years (by 'Year'), drops 'Year', and feeds the remaining values to
    ``model_annual`` as one sequence. Each of the first six predicted values is
    compared with the mean of the column at the same position over those rows.

    NOTE(review): the result labels are assigned by column position — presumably
    the CSV columns after 'Year' follow the same order; confirm against the data.

    Returns:
        dict: percentage change for 'Revenue', 'Net Profit', 'Gross Profit',
        'Total Assets', 'Total Liabilities' and 'Equity'.
    """
    report_path = f"data/AnnualReports/CSV/{industry}.csv"
    recent_years = (
        pd.read_csv(report_path)
        .sort_values(by='Year', ascending=True)
        .tail(n=3)
        .drop(columns='Year')
    )

    n_rows, n_cols = recent_years.shape
    model_input = recent_years.values.reshape(1, n_rows, n_cols)
    forecast = model_annual.predict(model_input).squeeze()

    baseline = recent_years.mean().values
    labels = (
        "Revenue",
        "Net Profit",
        "Gross Profit",
        "Total Assets",
        "Total Liabilities",
        "Equity",
    )
    # Percentage change of each forecast value against the recent average.
    return {
        label: (forecast[position] - baseline[position]) / baseline[position] * 100
        for position, label in enumerate(labels)
    }
    

In [12]:
def inference_loan(sample_json):
    """Classify one loan sample.

    The mapping's values are taken in iteration order as a single feature row,
    scaled with ``scalar_loan`` and passed to ``model_loan``.
    NOTE(review): key order therefore matters — presumably it must match the
    feature order used at training time; verify against the training code.

    Returns:
        The first (only) element of ``model_loan.predict`` for that row.
    """
    feature_row = np.array([[*sample_json.values()]])
    scaled_row = scalar_loan.transform(feature_row)
    return model_loan.predict(scaled_row)[0]

In [13]:
def inference_pipeline(
                        industry,
                        cols = ['Change Open Price (%)', 'Change Close Price (%)',
                                'Change Trade Volume (%)', 'Change Turnover (%)', 'Revenue',
                                'Net Profit', 'Gross Profit', 'Total Assets', 'Total Liabilities',
                                'Equity', 'Export Change', 'Import Change', 'Sentiment Score']
                        ):
    """Run every industry-level model and feed the combined features to the loan model.

    The news, import/export, annual-report and stock results are merged into one
    feature mapping, ordered by ``cols``, and classified via ``inference_loan``.

    Going through ``inference_loan`` means the features are scaled with
    ``scalar_loan`` before prediction, matching how single samples are scored;
    previously the raw, unscaled values were passed straight to ``model_loan``.

    Args:
        industry: Industry name understood by all four sub-models.
        cols: Feature names, in the order the loan model expects (not modified).

    Returns:
        The loan model's prediction for the industry.

    Raises:
        KeyError: if a name in ``cols`` is missing from the merged results.
    """
    # Evaluated left to right: news, import/export, annual reports, stocks.
    final_result = {
        **inference_news(industry),
        **predict_ts_on_industry_impexp(industry),
        **predict_on_industry_annual(industry),
        **inference_on_industry(industry),
    }

    # inference_loan consumes values in iteration order, so build the mapping in `cols` order.
    ordered_features = {col: final_result[col] for col in cols}
    return inference_loan(ordered_features)

In [14]:
# Example: share of positively-labelled news for one industry.
inference_news("Transportation")

{'Sentiment Score': 0.2826086956521739}

In [15]:
# Example: industry-averaged stock forecast changes.
inference_on_industry('Capital Goods')

{'Change Open Price (%)': -3.2997485203402044,
 'Change Close Price (%)': -3.404401399720544,
 'Change Trade Volume (%)': 94.62567914520316,
 'Change Turnover (%)': 180.88922344430645}

In [21]:
# Example: forecast import/export percentage change for one industry.
predict_ts_on_industry_impexp('Consumer Goods')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


{'Import Change': 4.412105510458438, 'Export Change': 34.3273266991788}

In [17]:
# Example: forecast annual-report percentage changes for one industry.
predict_on_industry_annual('Capital Goods')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


{'Revenue': 96.72201956426865,
 'Net Profit': -20.06557344066903,
 'Gross Profit': 173.71189583480626,
 'Total Assets': 65.5078922893918,
 'Total Liabilities': 59.81966367740561,
 'Equity': -55.484580939600114}

In [18]:
# Example: score a hand-written feature sample; inference_loan reads the values
# in the order the keys are written here.
inference_loan({
                "Change Open Price (%)":-7.36,
                "Change Close Price (%)":-9.52,
                "Change Trade Volume (%)":-18.1,
                "Change Turnover (%)":-16.11,
                "Revenue":11.98,
                "Net Profit":-38.74,
                "Gross Profit":25.4,
                "Total Assets":4.67,
                "Total Liabilities":87.11,
                "Equity":30.21,
                "Export Change":-8.25,
                "Import Change":-5.29,
                "Sentiment Score":0.12
                })

0

In [24]:
# Example: run all sub-models for one industry and classify the combined features.
inference_pipeline('Agricultural Raw Materials')

Filter:   0%|          | 0/12771 [00:00<?, ? examples/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


0