In [0]:
### Modular function that oversamples and train/test-splits the data for each of the 6 economic indicators, for 2nd-stage fine-tuning of the sentiment analyzer:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Folder that the annotated per-indicator csv files are read from.
import_folder = '/content/drive/My Drive/Colab Notebooks/capstone_betterdwelling/annotations_bnn_cbc/'
# Oversampled outputs are written to the 'oversampled/' sub-folder of the import folder.
export_folder = import_folder + 'oversampled/'

def _oversample_to_majority(df, label_col, labels, target_len, random_state):
    '''Return a new DataFrame in which every class listed in `labels` that is present in `df`
    has `target_len` rows.

    All original rows of a class are kept; only the shortfall is drawn at random (with
    replacement) from that class. Classes with no rows are skipped because there is nothing
    to sample from. Raises ValueError if none of the `labels` occur in `df`.'''
    balanced_parts = []
    for label in labels:
        rows = df[df[label_col] == label]
        if rows.empty:
            # nothing to draw from; dividing/sampling on an empty class would fail
            continue
        shortfall = target_len - len(rows)
        if shortfall > 0:
            extra = rows.sample(n=shortfall, replace=True, random_state=random_state)
            rows = pd.concat([rows, extra])
        balanced_parts.append(rows)

    if not balanced_parts:
        raise ValueError("none of the labels %r were found in column %r" % (list(labels), label_col))
    return pd.concat(balanced_parts)


def oversampling_on_indicator_and_split(indicator_name, import_folder, export_folder,
                                        test_size=0.2, random_state=42):
    '''Oversample the annotated csv file of one economic indicator and split it into train/eval sets.

    Reads 'annotated_<indicator_name>_bnn&CBC.csv' from `import_folder`, adds a `title_desc`
    column (title + ". " + description), oversamples the -1 / 0 / 1 classes of
    `title_desc_sent_1` up to the size of the largest class, and writes three csv files
    to `export_folder`:
        <indicator_name>_df_oversampled.csv        (the whole oversampled set)
        <indicator_name>_train_df_oversampled.csv  (train part of the split)
        <indicator_name>_eva_df_oversampled.csv    (evaluation part of the split)

    Parameters:
        indicator_name: name of the indicator, used to build the input and output file names.
        import_folder:  folder that contains the annotated csv file.
        export_folder:  folder the three csv files are written to (created if missing).
        test_size:      fraction of the oversampled rows that goes to the evaluation file.
        random_state:   seed used for both the oversampling and the train/eval split.

    Returns None. Raises ValueError if the file has no labelled rows.'''

    label_col = 'title_desc_sent_1'
    indicator_file = 'annotated_'+indicator_name+'_bnn&CBC.csv'

    df = pd.read_csv(os.path.join(import_folder,indicator_file), usecols = ['title','description',label_col,'publishedAt'])

    # count the labels once and reuse the result below
    label_counts = df[label_col].value_counts()
    print(label_counts)

    if label_counts.empty:
        raise ValueError("no labelled rows found in " + indicator_file)

    # add one more column to combine title+description;
    # a missing title or description is treated as empty text so the combined text is never NaN
    df['title_desc'] = df['title'].fillna('') + ". " + df['description'].fillna('')

    ## find the majority class
    majority_label = label_counts.idxmax()
    print(majority_label)
    majority_label_len = int(label_counts.max())
    print(majority_label_len)

    # fixed label order (negative, neutral, positive) keeps the row order of the output stable
    df_oversampled = _oversample_to_majority(df, label_col, [-1, 0, 1], majority_label_len, random_state)

    ## now export
    os.makedirs(export_folder, exist_ok=True)

    export_file_path = os.path.join(export_folder,indicator_name+'_df_oversampled.csv')
    df_oversampled.to_csv(export_file_path)

    #train_test_split
    # NOTE(review): the split happens after oversampling, so copies of the same article can end up
    # in both the train and the evaluation file; scores measured on the evaluation file may be optimistic.
    df_train,df_eva = train_test_split(df_oversampled,test_size = test_size, random_state = random_state)

    df_train.to_csv(os.path.join(export_folder,indicator_name+'_train_df_oversampled.csv'))
    df_eva.to_csv(os.path.join(export_folder,indicator_name+'_eva_df_oversampled.csv'))

    print("done",indicator_name)



In [0]:
# Economic indicators to process; each name selects one annotated csv file.
indicator_names = [
    'GDP',
    'employment',
    'housing',
    'interest_rate',
    'mortgage_rate',
    'stock',
]

# Oversample and split every indicator in turn, echoing its name first.
for indicator_name in indicator_names:
    print(indicator_name)
    oversampling_on_indicator_and_split(indicator_name, import_folder, export_folder)

GDP
-1    55
 0    43
 1    32
Name: title_desc_sent_1, dtype: int64
-1
55
done GDP
employment
 0    67
-1    30
 1    20
Name: title_desc_sent_1, dtype: int64
0
67
done employment
housing
 0    59
-1    29
 1    28
Name: title_desc_sent_1, dtype: int64
0
59
done housing
interest_rate
 0    68
-1    38
 1    11
Name: title_desc_sent_1, dtype: int64
0
68
done interest_rate
mortgage_rate
 0    69
-1    37
 1    19
Name: title_desc_sent_1, dtype: int64
0
69
done mortgage_rate
stock
 0    63
-1    37
 1    21
Name: title_desc_sent_1, dtype: int64
0
63
done stock
