In [0]:

import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Folder holding the annotated BNN/CBC article CSVs (mounted Google Drive in Colab).
import_folder = r'/content/drive/My Drive/Colab Notebooks/capstone_betterdwelling/annotations_bnn_cbc/'
# Oversampled outputs are written to a sub-folder of the annotations folder.
export_folder = os.path.join(import_folder, 'oversampled/')

def oversampling_on_indicator_and_split(indicator_name, import_folder, export_folder):
    '''Split one indicator's annotated articles into train/eval sets and export
    a class-balanced (randomly oversampled) copy of the train set.

    Reads ``annotated_<indicator_name>_bnn&CBC.csv`` from ``import_folder``,
    adds a combined ``title_desc`` text column, holds out 20% of the rows as an
    evaluation set, then oversamples the train rows so that every sentiment
    label (-1, 0, 1) present in the train set has as many rows as the most
    frequent label. The split is done before oversampling so duplicated rows
    never leak into the evaluation set.

    Parameters
    ----------
    indicator_name : str
        Indicator identifier used to build the input and output file names.
    import_folder : str
        Folder containing the annotated CSV file.
    export_folder : str
        Folder the two output CSV files are written to; created if missing.

    Returns
    -------
    None
        Writes ``<indicator_name>_eva_df.csv`` (held-out rows) and
        ``<indicator_name>_train_df_oversampled.csv`` (balanced train rows)
        into ``export_folder`` and prints the train label counts.
    '''
    label_col = 'title_desc_sent_1'
    sentiment_labels = (-1, 0, 1)

    indicator_file = 'annotated_' + indicator_name + '_bnn&CBC.csv'

    df = pd.read_csv(
        os.path.join(import_folder, indicator_file),
        usecols=['title', 'description', label_col, 'publishedAt'],
    )

    # Combine title + description. A missing title or description is treated as
    # an empty string; otherwise NaN would propagate and wipe out the whole
    # combined text for that row.
    df['title_desc'] = df['title'].fillna('') + ". " + df['description'].fillna('')

    # Make sure the output location exists before the first write.
    os.makedirs(export_folder, exist_ok=True)

    # Do the train/test split before oversampling to avoid contaminating the
    # evaluation set with duplicates of train rows.
    df_train, df_eva = train_test_split(df, test_size=0.2, random_state=42)
    df_eva.to_csv(os.path.join(export_folder, indicator_name + '_eva_df.csv'))

    label_counts = df_train[label_col].value_counts()
    print(label_counts)

    # Find the majority class and its size: the target size for every class.
    majority_label = label_counts.idxmax()
    print(majority_label)
    majority_label_len = label_counts.max()
    print(majority_label_len)

    resampled_rows = []
    for label in sentiment_labels:
        rows = df_train[df_train[label_col] == label]
        if rows.empty:
            # Nothing to duplicate for a label absent from the train split
            # (the frac-based version divided by zero here).
            continue

        # Keep every original row; only the shortfall is drawn with
        # replacement, so no original example is lost to the resampling.
        resampled_rows.append(rows)
        shortfall = int(majority_label_len) - len(rows)
        if shortfall > 0:
            extra_rows = rows.sample(n=shortfall, replace=True, random_state=42)
            resampled_rows.append(extra_rows)

    df_train_oversampled = pd.concat(resampled_rows)

    # Now export the balanced train set.
    export_file_path = os.path.join(export_folder, indicator_name + '_train_df_oversampled.csv')
    df_train_oversampled.to_csv(export_file_path)

    print("done", indicator_name)



In [0]:
# Economic indicators that each have their own annotated CSV file.
indicator_names = [
    'GDP',
    'employment',
    'housing',
    'interest_rate',
    'mortgage_rate',
    'stock',
]

# Split + oversample every indicator in turn, echoing the name as a progress marker.
for indicator_name in indicator_names:
    print(indicator_name)
    oversampling_on_indicator_and_split(
        indicator_name=indicator_name,
        import_folder=import_folder,
        export_folder=export_folder,
    )

GDP
-1    43
 0    35
 1    26
Name: title_desc_sent_1, dtype: int64
-1
43
done GDP
employment
 0    52
-1    23
 1    18
Name: title_desc_sent_1, dtype: int64
0
52
done employment
housing
 0    45
-1    25
 1    22
Name: title_desc_sent_1, dtype: int64
0
45
done housing
interest_rate
 0    52
-1    31
 1    10
Name: title_desc_sent_1, dtype: int64
0
52
done interest_rate
mortgage_rate
 0    55
-1    28
 1    17
Name: title_desc_sent_1, dtype: int64
0
55
done mortgage_rate
stock
 0    47
-1    31
 1    18
Name: title_desc_sent_1, dtype: int64
0
47
done stock
