### Undersampling datasets

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [2]:
def undersampling_and_split(import_folder, export_path, test_size=0.15, dev_size=0.15, random_state=42):
    '''Undersample the 0-labelled rows of an annotated CSV and export disjoint train/dev/test splits.

    The undersampling is applied to the whole dataset *before* splitting: every
    row whose `title_desc_sent_1` label is non-zero is kept, and the rows
    labelled 0 are randomly reduced to one fewer than the size of the smallest
    non-zero class (or to all available 0-rows if there are fewer than that).
    The resulting label counts are printed.

    Parameters
    ----------
    import_folder : str
        Path to the input CSV *file* (the name is kept for backward
        compatibility; it is passed straight to `pd.read_csv`).
    export_path : str
        Prefix that 'train.csv', 'dev.csv' and 'test.csv' are appended to,
        e.g. 'undersampled/GDP/'. Its directory part is created if missing.
    test_size : float, default 0.15
        Fraction of the undersampled data written to test.csv.
    dev_size : float, default 0.15
        Fraction of the undersampled data written to dev.csv.
    random_state : int, default 42
        Seed used for both the undersampling and the splits, so that a
        re-run reproduces the same files.

    Raises
    ------
    ValueError
        If the split fractions are invalid or the data has no non-zero labels.
    '''
    label_col = 'title_desc_sent_1'

    if not (0 < test_size < 1 and 0 < dev_size < 1 and test_size + dev_size < 1):
        raise ValueError('test_size and dev_size must be fractions in (0, 1) that sum to less than 1')

    df = pd.read_csv(import_folder)
    df_non_zero = df[df[label_col] != 0]
    non_zero_counts = df_non_zero[label_col].value_counts()
    if non_zero_counts.empty:
        raise ValueError('no non-zero labels found in column ' + repr(label_col))
    numof_min_class_bet_pos_neg = int(non_zero_counts.min())

    # Keep one fewer 0-row than the smallest non-zero class, but never ask for
    # more rows than exist (sample() would raise) or for a negative count.
    df_all_zero = df[df[label_col] == 0]
    numof_zero = max(min(numof_min_class_bet_pos_neg - 1, len(df_all_zero)), 0)
    df_zero = df_all_zero.sample(numof_zero, random_state=random_state)

    concat_df = pd.concat([df_non_zero, df_zero])
    print(concat_df[label_col].value_counts())

    # Carve out the test set first, then take the dev set from the remainder.
    # dev_size is expressed relative to the full dataset, so it is rescaled to
    # a fraction of what is left after the test rows are removed.
    train_dev, test = train_test_split(concat_df, test_size=test_size, random_state=random_state)
    train, dev = train_test_split(
        train_dev, test_size=dev_size / (1 - test_size), random_state=random_state
    )

    # Make sure the target directory exists before writing.
    export_dir = os.path.dirname(export_path)
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)

    train.to_csv(export_path + 'train.csv')
    dev.to_csv(export_path + 'dev.csv')
    test.to_csv(export_path + 'test.csv')
    


In [3]:
# Label distribution of the GDP annotations as read straight from the CSV.
gdp_csv_path = 'annotated_GDP_bnn&CBC.csv'
gdp_df = pd.read_csv(gdp_csv_path)
gdp_labels = gdp_df['title_desc_sent_1']
gdp_labels.value_counts()

-1    55
 0    43
 1    32
Name: title_desc_sent_1, dtype: int64

In [4]:
# Label distribution of the mortgage-rate annotations as read straight from the CSV.
mr_csv_path = 'annotated_mortgage_rate_bnn&CBC.csv'
mr_df = pd.read_csv(mr_csv_path)
mr_labels = mr_df['title_desc_sent_1']
mr_labels.value_counts()

 0    69
-1    37
 1    19
Name: title_desc_sent_1, dtype: int64

In [3]:
# Undersample the GDP annotations and write the split CSVs under undersampled/GDP/.
undersampling_and_split(
    import_folder='annotated_GDP_bnn&CBC.csv',
    export_path='undersampled/GDP/',
)

-1    55
 1    32
 0    31
Name: title_desc_sent_1, dtype: int64


In [4]:
# Undersample the housing annotations and write the split CSVs under undersampled/housing/.
undersampling_and_split(
    import_folder='annotated_housing_bnn&CBC.csv',
    export_path='undersampled/housing/',
)

-1    29
 1    28
 0    27
Name: title_desc_sent_1, dtype: int64


In [5]:
# Undersample the interest-rate annotations and write the split CSVs under undersampled/interest_rates/.
undersampling_and_split(
    import_folder='annotated_interest_rate_bnn&CBC.csv',
    export_path='undersampled/interest_rates/',
)

-1    38
 1    11
 0    10
Name: title_desc_sent_1, dtype: int64


In [6]:
# Undersample the mortgage-rate annotations and write the split CSVs under undersampled/mortgage_rates/.
undersampling_and_split(
    import_folder='annotated_mortgage_rate_bnn&CBC.csv',
    export_path='undersampled/mortgage_rates/',
)

-1    37
 1    19
 0    18
Name: title_desc_sent_1, dtype: int64


In [7]:
# Undersample the employment annotations and write the split CSVs under undersampled/employment/.
undersampling_and_split(
    import_folder='annotated_employment_bnn&CBC.csv',
    export_path='undersampled/employment/',
)

-1    30
 1    20
 0    19
Name: title_desc_sent_1, dtype: int64
