In [1]:
########################################
# packages used in this script         
########################################

import pandas as pd
import sys
import os
import time
import pickle
import numpy as np
import datetime
import random
import csv
csv.field_size_limit(sys.maxsize)
from datetime import datetime
from bs4 import BeautifulSoup as bs

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
################################################################################################################################
# Function: generate_feature_table()
# Param:
#     - path (str): the path to the general news data file(s).
# Return:
#     - None
# Feature table is generated for the classification task
################################################################################################################################

def generate_feature_table(path, cv, lda):
    # the generated feature table is saved in the folder "ClassifyNews", and the file name is "NewEventTable_V1.csv"
    with open("ClassifyNews/NewEventTable_V1.csv", "w", newline='', encoding="utf-8") as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerow(['id', 'company', 't1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10', '1_month', '3_month', '6_month', '12_month'])

        directory = path
        for root,dirs,files in os.walk(directory):
            for file in files:
                if file.endswith(".csv"):
                    try:
                        print(file)
                        data = pd.read_csv(directory + "/" + file, engine='python', error_bad_lines=False)
                        data = data.to_numpy()

                        event_date = (file.split(".csv")[0]).split('-')[-1]
                        event_year = event_date[:4]
                        event_month = event_date[4:6]
                        event_day = event_date[6:8]
                        new_event_date = str(event_year) + "-" + str(event_month) + "-" + str(event_day)

                        company_name = file.split("_")[0]

                        for row in data:
                            ids = row[0]
                            available_text = True

                            for i in range(len(row[1])-5):
                                if row[1][i:i+5] == "merge":
                                    available_text = False
                                    break

                                if row[1][i:i+5] == "acqui":
                                    available_text = False
                                    break

                            if available_text == True:
                                context = [row[1]]

                                try:
                                    soup = bs(row[1], "html.parser")
                                    pub_date = soup.findAll('pubinfo:dates')[0]
                                    pub_date_tag = pub_date('pubinfo:pubdate')[0]

                                    day = pub_date_tag['day']
                                    month = pub_date_tag['month']
                                    year = pub_date_tag['year']

                                    news_date = str(year) + "-" + str(month) + "-" + str(day)

                                    delta_date = (datetime.strptime(new_event_date, '%Y-%m-%d') - datetime.strptime(news_date, '%Y-%m-%d')).days

                                    df_temp = cv.transform(context)
                                    prob = lda.transform(df_temp)

                                    output_row = [ids, company_name]
                                    for item in prob[0]:
                                        output_row.append(item)

                                    if delta_date < 30:
                                        output_row.append(1)
                                        output_row.append(0)
                                        output_row.append(0)
                                        output_row.append(0)
                                    elif delta_date < 90:
                                        output_row.append(0)
                                        output_row.append(1)
                                        output_row.append(0)
                                        output_row.append(0)
                                    elif delta_date < 180:
                                        output_row.append(0)
                                        output_row.append(0)
                                        output_row.append(1)
                                        output_row.append(0)
                                    else:
                                        output_row.append(0)
                                        output_row.append(0)
                                        output_row.append(0)
                                        output_row.append(1)
                                    
                                    csv_writer.writerow(output_row)
                                except:
                                    continue
                    except:
                        print("empty file")
                        continue

In [3]:
################################################################################################################################
# Function: RandomForest_learn()
# Param:
#     - None
# Return:
#     - model: the random forest model with default parameters (untrained).
################################################################################################################################

def RandomForest_learn():
    model = RandomForestClassifier(max_depth=10, random_state=0)
    
    return model

In [4]:
################################################################################################################################
# Function: training_process()
# Param:
#     - x_train (dataframe): the features of training data set.
#     - y_train (dataframe): the label of training data set.
# Return:
#     - model_rf: the trained random forest model.
################################################################################################################################

def training_process(x_train, y_train):
    model_rf = RandomForest_learn()
    
    # collect positive data for random sampling
    positive_ids = np.where(y_train==1)[0]
    positive_dt = x_train.iloc[positive_ids, :]
    positive_labels = y_train.iloc[positive_ids]

    for learnIter in range(20):
        print("---learning iteration: " + str(learnIter) + "---")

        negative_ids = np.where(y_train==0)[0]
        random_sampled_negative_ids = np.random.choice(negative_ids, size=positive_ids.shape[0], replace=False)

        negative_dt = x_train.iloc[random_sampled_negative_ids, :]
        negative_labels= y_train.iloc[random_sampled_negative_ids]

        epoch_dt = np.concatenate((positive_dt, negative_dt))
        epoch_labels = np.concatenate((positive_labels, negative_labels))

        model_rf.fit(epoch_dt, epoch_labels)

    return model_rf

In [5]:
################################################################################################################################
# Function: calculate_confusion()
# Param:
#     - y_pred (dataframe): the predicted label.
#     - y_true (dataframe): the actual label.
# Return:
#     - accuracy (double): accuracy of prediction
#     - precision (double): precision of prediction
#     - recall (double): recall of prediction
#     - false alarm (double): false alarm of prediction
################################################################################################################################

def calculate_confusion(y_pred, y_true):
    tp = 0
    fn = 0
    fp = 0
    tn = 0
    
    # calculate confusion matrix
    for i in range(len(y_pred)):
        if y_true[i] == 1 and y_pred[i] == 1:
            tp += 1
        elif y_true[i] == 1 and y_pred[i] == 0:
            fn += 1
        elif y_true[i] == 0 and y_pred[i] == 1:
            fp += 1
        elif y_true[i] == 0 and y_pred[i] == 0:
            tn += 1
            
    print("tp: " + str(tp))
    print("fn: " + str(fn))
    print("fp: " + str(fp))
    print("tn: " + str(tn))
    
    # accuracy
    accuracy = (tp + tn) / (tp + tn + fn + fp)
    
    # precision
    precision = tp / (tp + fp)
    
    # recall
    recall = tp / (tp + fn)
    
    # false alarm
    false_alarm = fp / (fp + tn)
    
    return accuracy, precision, recall, false_alarm

In [6]:
################################################################################################################################
# Function: test_process()
# Param:
#     - model: the trained random forest model.
#     - x_train (dataframe): features in training data set
#     - x_test (dataframe): features in test data set
#     - y_train (dataframe): labels in training data set
#     - y_test (dataframe): labels in test data set
# Return:
#     - None
# The performance metrics are printed in the stdout.
################################################################################################################################


def test_process(model, x_train, x_test, y_train, y_test):
    predictedLabels = np.asarray(model.predict(x_test))
    train_predictedLabels = np.asarray(model.predict(x_train))

    accuracy, precision, recall, false_alarm = calculate_confusion(train_predictedLabels, np.array(y_train))
    print("performance in training data set: ")
    print("train accuracy is: " + str(accuracy))
    print("train precision is: " + str(precision))
    print("train recall is: " + str(recall))
    print("train false alarm is: " + str(false_alarm))

    print("")

    accuracy, precision, recall, false_alarm = calculate_confusion(predictedLabels, np.array(y_test))
    print("performance in test data set: ")
    print("test accuracy is: " + str(accuracy))
    print("test precision is: " + str(precision))
    print("test recall is: " + str(recall))
    print("test false alarm is: " + str(false_alarm))

In [9]:
# def main():
    print("Code Starting...")
    # load lda model (models are saved in the same folder as CLEAN_LDAonNews.ipynb)
    print("Loading models...")
    cv = pickle.load(open('cv.sav', 'rb'))
    lda = pickle.load(open('ldaM&A.sav', 'rb'))
    
    # define path to data file. (In local machine, this jupyter notebook is in the same level with folder "data collection")
    path = "data collection/NewData"
    
    # generate feature table for the classification
    ######################################################################################################################
    # IMPORTANT HERE!!!!!                                                                                                #
    # YOU WANT TO COMMENT THIS PART OUT after the first run... You don't want to generate feature table again unless you #
    # have new input data.                                                                                               #
    ######################################################################################################################
    generate_feature_table(path, cv, lda)
    print("Finish generating features table...")
    
    # start training and test phase
    # feature_path is the path to the feature table generated above. 
    # In local machine, its in the same level with this jupyter notebook.
    feature_path = 'ClassifyNews/NewEventTable_V1.csv'
    df = pd.read_csv(feature_path)

    x = df.iloc[:, 2:-4]
    y_1 = df.iloc[:, -4]
    y_2 = df.iloc[:, -3]
    y_3 = df.iloc[:, -2]
    y_4 = df.iloc[:, -1]
    
    # select 1/3/6/12 month: index = 1 -> 1 month; index = 2 -> 3 month; index = 3 -> 6 month; index = 4 -> 12 month.
    index = 2
    start_time = time.time()
    
    if index == 1:
        print("-----Training for 1 month data-----")
        x_train, x_test, y_train, y_test = train_test_split(x, y_1, test_size=0.3, random_state=42)
        model = training_process(x_train, y_train)
        print("-----Testing for 1 month data-----")
        test_process(model, x_train, x_test, y_train, y_test)
    elif index == 2:
        print("-----Training for 3 month data-----")
        x_train, x_test, y_train, y_test = train_test_split(x, y_2, test_size=0.3, random_state=42)
        model = training_process(x_train, y_train)
        print("-----Testing for 3 month data-----")
        test_process(model, x_train, x_test, y_train, y_test)
    elif index == 3:
        print("-----Training for 6 month data-----")
        x_train, x_test, y_train, y_test = train_test_split(x, y_3, test_size=0.3, random_state=42)
        model = training_process(x_train, y_train)
        print("-----Testing for 6 month data-----")
        test_process(model, x_train, x_test, y_train, y_test)
    elif index == 4:
        print("-----Training for 12 month data-----")
        x_train, x_test, y_train, y_test = train_test_split(x, y_4, test_size=0.3, random_state=42)
        model = training_process(x_train, y_train)
        print("-----Testing for 12 month data-----")
        test_process(model, x_train, x_test, y_train, y_test)
        
    end_time = time.time()
    print("CPU running time: " + str(end_time-start_time))

Code Starting...
Loading models...
Finish generating features table...
-----Training for 3 month data-----
---learning iteration: 0---
---learning iteration: 1---
---learning iteration: 2---
---learning iteration: 3---
---learning iteration: 4---
---learning iteration: 5---
---learning iteration: 6---
---learning iteration: 7---
---learning iteration: 8---
---learning iteration: 9---
---learning iteration: 10---
---learning iteration: 11---
---learning iteration: 12---
---learning iteration: 13---
---learning iteration: 14---
---learning iteration: 15---
---learning iteration: 16---
---learning iteration: 17---
---learning iteration: 18---
---learning iteration: 19---
-----Testing for 3 month data-----
tp: 13443
fn: 5213
fp: 26156
tn: 49353
performance in training data set: 
train accuracy is: 0.6668719800350449
train precision is: 0.33947826965327405
train recall is: 0.7205724699828473
train false alarm is: 0.34639579387887537

tp: 4425
fn: 3578
fp: 12030
tn: 20324
performance in test