In [1]:
import pandas as pd

In [2]:
# Data locations, relative to the directory the notebook is run from.
DATA_PATH = '../data/'
DATA_PROCESSED_PATH = DATA_PATH + 'processed/'

# Load both processed training files, in the same order as their numeric suffix.
train0, train1 = [
    pd.read_csv(DATA_PROCESSED_PATH + csv_name)
    for csv_name in ('train10000_0.csv', 'train10000_1.csv')
]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def cv(data):
    """Fit a bag-of-words ``CountVectorizer`` on ``data`` and encode it.

    Parameters
    ----------
    data : iterable
        The raw documents the vocabulary is learned from (``train_lr`` passes
        a list taken from a text column).

    Returns
    -------
    tuple
        ``(emb, count_vectorizer)``: the result of ``fit_transform(data)`` and
        the fitted vectorizer, so callers can apply the same vocabulary to
        other documents with ``transform``.
    """
    vectorizer = CountVectorizer()
    # The left tuple element is evaluated first, so the vectorizer is already
    # fitted by the time it is handed back.
    return vectorizer.fit_transform(data), vectorizer

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def train_lr(data, text_col="preprocessed_text", label_col="hyperpartisan",
             test_size=0.2, random_state=40):
    """Train a bag-of-words logistic regression and predict on a held-out split.

    The text and label columns are split into train/validation parts, a
    ``CountVectorizer`` is fitted on the training texts only (via ``cv``), and
    a class-weight-balanced ``LogisticRegression`` is fitted on the resulting
    counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``text_col`` and ``label_col``.
    text_col : str, default "preprocessed_text"
        Name of the column holding the documents.
    label_col : str, default "hyperpartisan"
        Name of the column holding the target labels.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; size of the validation split.
    random_state : int, default 40
        Seed used for both the split and the classifier so runs are repeatable.

    Returns
    -------
    tuple
        ``(y_val, y_predicted_counts)``: the held-out true labels (a list) and
        the labels the fitted model predicts for the held-out texts.

    Raises
    ------
    KeyError
        If ``text_col`` or ``label_col`` is not a column of ``data``.
    """
    list_corpus = data[text_col].tolist()
    list_labels = data[label_col].tolist()

    X_train, X_val, y_train, y_val = train_test_split(
        list_corpus, list_labels, test_size=test_size, random_state=random_state)

    # The vocabulary is learned from the training texts only; validation texts
    # are merely projected onto it, so no validation information leaks in.
    X_train_counts, count_vectorizer = cv(X_train)
    X_val_counts = count_vectorizer.transform(X_val)

    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases; confirm against the pinned version before
    # upgrading. It is kept here so the fitted model is unchanged.
    clf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg',
                             multi_class='multinomial', n_jobs=-1,
                             random_state=random_state)
    clf.fit(X_train_counts, y_train)

    y_predicted_counts = clf.predict(X_val_counts)

    return y_val, y_predicted_counts

def get_metrics(y_test, y_predicted):
    """Score predicted labels against the true labels.

    Precision, recall and F1 are computed per class and combined with
    ``average='weighted'`` (each class weighted by its number of true
    samples). Accuracy is the fraction of labels predicted correctly.
    Note that support-weighted recall works out to the same number as
    accuracy, so those two values coincide.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_predicted : array-like
        Labels produced by the model, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    # The three per-class scorers share exactly the same averaging options.
    weighted = {'pos_label': None, 'average': 'weighted'}

    # precision: true positives / (true positives + false positives)
    # recall:    true positives / (true positives + false negatives)
    # f1:        harmonic mean of precision and recall
    precision, recall, f1 = [
        scorer(y_test, y_predicted, **weighted)
        for scorer in (precision_score, recall_score, f1_score)
    ]

    # (true positives + true negatives) / total
    accuracy = accuracy_score(y_test, y_predicted)

    return accuracy, precision, recall, f1

def eval_lr(y_test, y_predicted_counts):
    """Print a one-line summary of the classification metrics and return them.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_predicted_counts : array-like
        Predicted labels to evaluate.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` exactly as computed by
        ``get_metrics``; the printed line shows each rounded to three decimals.
    """
    scores = tuple(get_metrics(y_test, y_predicted_counts))
    # The format string consumes the four metrics in the order they are returned.
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % scores)
    return scores

In [22]:
# Train on train1 (loaded from train10000_1.csv) and keep the held-out true
# labels together with the model's predictions for the evaluation cell below.
y_val, y_pred = train_lr(train1)

In [23]:
# Left as the cell's last expression so the returned (accuracy, precision,
# recall, f1) tuple is displayed underneath the printed summary line.
eval_lr(y_val, y_pred)

accuracy = 0.780, precision = 0.780, recall = 0.780, f1 = 0.780


(0.7805, 0.7804729549496147, 0.7805, 0.7804815962213595)