In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix

In [3]:
abortion_csv = pd.read_csv('../text-classifier/train-data/abortion.csv')
environment_csv = pd.read_csv('../text-classifier/train-data/environment.csv')
guns_csv = pd.read_csv('../text-classifier/train-data/guns.csv')
healthcare_csv = pd.read_csv('../text-classifier/train-data/health care.csv')
immigration_csv = pd.read_csv('../text-classifier/train-data/immigration.csv')
LGBTQ_csv = pd.read_csv('../text-classifier/train-data/LGBTQ.csv')
racism_csv = pd.read_csv('../text-classifier/train-data/racism.csv')
taxes_csv = pd.read_csv('../text-classifier/train-data/taxes.csv')
technology_csv = pd.read_csv('../text-classifier/train-data/technology.csv')
trade_csv = pd.read_csv('../text-classifier/train-data/trade.csv')
trump_csv = pd.read_csv('../text-classifier/train-data/trump impeachment.csv')
election_csv = pd.read_csv('../text-classifier/train-data/us 2020 election.csv')
military_csv = pd.read_csv('../text-classifier/train-data/us military.csv')
welfare_csv = pd.read_csv('../text-classifier/train-data/welfare.csv')

In [4]:
targets = []
for csv in [environment_csv, guns_csv, healthcare_csv, immigration_csv, LGBTQ_csv, racism_csv, taxes_csv, technology_csv, trade_csv, trump_csv, election_csv, military_csv, welfare_csv]:
    targets.append(csv.columns[4])
print(targets)

['environment', 'guns', 'health care', 'immigration', 'LGBTQ', 'racism', 'taxes', 'technology', 'trade', 'trump impeachment', 'us 2020 election', 'us military', 'welfare']


In [68]:
# relevant_csv = pd.concat([abortion_csv,environment_csv,guns_csv,healthcare_csv,immigration_csv,LGBTQ_csv,racism_csv,taxes_csv,technology_csv,trade_csv,trump_csv,election_csv,military_csv,welfare_csv]).fillna(0)

TDIF vectorizer [guide](https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a)  [better guide](https://www.geeksforgeeks.org/smote-for-imbalanced-classification-with-python/) and [docs](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

Using SMOTE for imbalanced data [guide](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/) and [docs](https://imbalanced-learn.org/stable/user_guide.html) 

Paper instructions: 
- For topic classification, as it is a multi-label classification task, we trained 14 independent binary classifiers (one per topic). 
- As the label distributions is highly imbalanced, we used SMOTE (Synthetic Minority Oversampling Technique) [10] to over-sample the positive class. 
- Each of these topic classifiers uses logistic regression and tf-idf based features. The settings for the tf-idf vectorizer are as follows: the maximum number of features is 5,000, the maximum document frequency is 0.95, and the minimum document frequency is 30. 
- These classifiers were separately optimized using a 5-fold cross validation loop with grid-search using the F1-score as the optimization metric. 
- Table 4 shows the final cross-validation results for each topic. 
    - While F1 is generally high, we note that the classifier has smaller F1 score for the technology and welfare topics. 
    - For technology, this is likely do to ambiguity of whether an article is related to U.S. politics – e.g., an article about Facebook’s earnings is not relevant, but one that discusses new regulations is. 
    - For welfare, this topic is much broader than the rest, covering everything from cash assistance programs to homelessness issues. More training data would likely help here.

Additional notes:
- Very few articles are not relevant so eh on that model (relevant: 1.0 = 26548 , 0.0 = 56)

In [5]:
def clean_text(text, target):
    # instantiate the vectorizer object
    tfidfvectorizer = TfidfVectorizer(max_features=5000, max_df=0.95, min_df=30, lowercase=True, analyzer='word',stop_words= 'english')
    # convert th documents into a matrix
    tfidf_wm = tfidfvectorizer.fit_transform(text)
    #retrieve the terms found in the corpora
    tfidf_tokens = tfidfvectorizer.get_feature_names_out() 
    df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)
    # add abortion column to the dataframe
    df_tfidfvect['target'] = target
    return df_tfidfvect

def split_data(model_data):
    smote = SMOTE()
    X, y = smote.fit_resample(model_data.drop('target', axis=1), model_data['target'])
    return train_test_split(X, y, test_size=0.25, random_state=0)

def train_model(x_train, x_test, y_train, y_test):
    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train, y_train)
    return logisticRegr

# NOTE: right now I can only validate on testing data
def validate_model(model, x_val, y_val):
    y_pred = model.predict(x_val)
    return f1_score(y_val, y_pred), accuracy_score(y_val, y_pred), confusion_matrix(y_val, y_pred)

def run(text, target, full_test=False):
    model_data = clean_text(text, target)
    x_train, x_test, y_train, y_test = split_data(model_data)
    model = train_model(x_train, x_test, y_train, y_test)
    # print('testing on test data')
    if full_test:
        return validate_model(model, model_data.drop('target', axis=1), model_data['target'])
    return validate_model(model, x_test, y_test)

def generate_model(text, target):
    model_data = clean_text(text, target)
    x_train, x_test, y_train, y_test = split_data(model_data)
    model = train_model(x_train, x_test, y_train, y_test)
    return model

## Generate Models

In [8]:
model_set = []
for i, csv in enumerate([environment_csv, guns_csv, healthcare_csv, immigration_csv, LGBTQ_csv, racism_csv, taxes_csv, technology_csv, trade_csv, trump_csv, election_csv, military_csv, welfare_csv]):
    model_set.append(generate_model(csv['Input.text'], csv[targets[i]]))
pickle.dump(model_set, open('model_set.pkl', 'wb'))

In [6]:
for i, csv in enumerate([environment_csv, guns_csv, healthcare_csv, immigration_csv, LGBTQ_csv, racism_csv, taxes_csv, technology_csv, trade_csv, trump_csv, election_csv, military_csv, welfare_csv]):
    print(targets[i])
    results = run(csv['Input.text'], csv[targets[i]], full_test=True)
    print('f1 score: ', results[0])
    print('accuracy score: ', results[1])
    print('confusion matrix: ', results[2])
    print('\n')

environment
f1 score:  0.9682539682539683
accuracy score:  0.9957582184517497
confusion matrix:  [[1756    7]
 [   1  122]]


guns
f1 score:  0.9722222222222222
accuracy score:  0.9978791092258749
confusion matrix:  [[1812    2]
 [   2   70]]


health care
f1 score:  0.9305555555555556
accuracy score:  0.9893955461293743
confusion matrix:  [[1732   15]
 [   5  134]]


immigration
f1 score:  0.9769585253456221
accuracy score:  0.9973488865323435
confusion matrix:  [[1775    3]
 [   2  106]]


LGBTQ
f1 score:  0.9655172413793104
accuracy score:  0.9957582184517497
confusion matrix:  [[1766    6]
 [   2  112]]


racism
f1 score:  0.9150943396226415
accuracy score:  0.9904559915164369
confusion matrix:  [[1771   15]
 [   3   97]]


taxes
f1 score:  0.9076923076923077
accuracy score:  0.9872746553552492
confusion matrix:  [[1744   19]
 [   5  118]]


technology
f1 score:  0.9
accuracy score:  0.9936373276776246
confusion matrix:  [[1820   12]
 [   0   54]]


trade
f1 score:  0.9080459770114

In [24]:
for i, m in enumerate(model_set):
    coef_series = pd.Series(
        data=m.coef_[0],
        index=m.feature_names_in_
    )
    print(targets[i])
    print(coef_series.sort_values(ascending=False).head(10))
    print()



environment
climate          9.175533
energy           4.848426
change           3.754582
environmental    2.799547
emissions        2.793684
epa              2.618874
green            2.504350
water            2.378185
carbon           2.259599
gas              2.220376
dtype: float64

guns
gun          9.804459
guns         4.550536
shooting     4.336415
firearms     3.563884
rourke       3.188125
shooter      3.094102
shootings    2.786238
weapons      2.752162
paso         2.323355
mass         2.201877
dtype: float64

health care
health       9.704673
care         4.798540
tobacco      3.172287
insurance    3.101655
medical      3.067929
medicare     2.935680
patients     2.821917
drug         2.452463
pregnancy    2.395087
disease      2.210833
dtype: float64

immigration
immigration    8.692332
border         6.374818
immigrants     4.723594
ice            4.122513
illegal        2.928892
detention      2.846387
immigrant      2.712376
children       2.497428
miller         2.30