In [244]:
import pandas as pd
import numpy as np
import json
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from time import time
%matplotlib inline

# Tools for processing data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix, make_scorer, f1_score
# Classifiers, supervised and unsupervised
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")
import sys
import os
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

In [15]:
max_prolific = 99

# Data Prepossessing

In [261]:
# read train json file
train_filename = './data/train.json'
with open(train_filename, 'r', encoding='utf-8') as f:
    train = json.load(f)

# extract coauthors as a new key from train.json
for i in range(len(train)):
    coauthors = []
    prolific_authors = []
    train[i]['text'] = train[i]['abstract']
    for auth in train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    train[i]['coauthors'] = coauthors
    if len(prolific_authors) == 0:
        prolific_authors.append(-1)
    train[i]['prolific_authors'] = prolific_authors
    train[i]['text'].extend(train[i]['title'])
    
train_df = pd.DataFrame.from_dict(train)

# First ten authors with more than X articles
names = train_df.prolific_authors.value_counts()[train_df.prolific_authors.value_counts()>100][-10:].index.tolist()
names_list = []
for sublist in names:
    for item in sublist:
        names_list.append(item)
# choose all the articles written by the first 10 authors         
chosen_authors = []
chosen_article = []
for index, row in train_df.iterrows():
    curr_au = row['prolific_authors']
    curr_text = row['text']
    if any(item in curr_au for item in names_list):
        chosen_authors.append(curr_au)
        chosen_article.append(curr_text)

combine = list(zip(chosen_authors, chosen_article))
authors_data = pd.DataFrame(combine, columns=['prolific_authors', 'text'])
authors_data = authors_data.reset_index().drop('index', 1)

# extract the most-common 1000 words from each author's corpus, store them in a list, and then eliminate duplicates.
common_words = []
authors_docs = {}
bow_au = []
bow_text = []
for name in names_list:
    content = []
    count = 0
    for index, row in authors_data.iterrows():
        if name in authors_data['prolific_authors'][index]:
            content.extend(authors_data['text'][index])
            if count < 50:
                bow_text.append(authors_data['text'][index])
                bow_au.append(name)
                count += 1
    authors_docs[name] = content
    # Return the most common words of that author's corpus.
    bow = [item[0] for item in Counter(content).most_common(1000)]
    common_words.extend(bow)
    
common_words = set(common_words)

combine2 = list(zip(bow_au, bow_text))
bow_counts = pd.DataFrame(combine2, columns=['prolific_authors','text'])
bow_counts = bow_counts.reset_index().drop('index',1)
# Use common_words as the columns of a temporary DataFrame
df = pd.DataFrame(columns=common_words)

# Join BOW features with the author's content
bow_counts = bow_counts.join(df)
bow_counts.loc[:,common_words] = 0

# Fill the DataFrame with counts of each feature in each article
for i, t in enumerate(bow_counts.text):
    for word in t:
        if word in common_words:
            bow_counts.loc[i,word] += 1
y = bow_counts['prolific_authors']
X = bow_counts.drop(['text', 'prolific_authors'], 1)

In [271]:
# read test json file
test_filename = './data/test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    test = json.load(f)
    
for i in range(len(test)):
    test[i]['text'] = test[i]['abstract']
    test[i]['text'].extend(test[i]['title'])
test_df = pd.DataFrame.from_dict(test)
bow_counts_test = pd.DataFrame(test_df, columns=['text'])
bow_counts_test = bow_counts_test.reset_index().drop('index',1)
# Join BOW features with the author's content
bow_counts_test = bow_counts_test.join(df)
bow_counts_test.loc[:,common_words] = 0
# Fill the DataFrame with counts of each feature in each article
for i, t in enumerate(bow_counts_test.text):
    for word in t:
        if word in common_words:
            bow_counts_test.loc[i,word] += 1
X_test = bow_counts_test.drop(['text'], 1)

In [263]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.24, random_state=0, stratify=y)

In [264]:
# Parameters to optimize
params = [{
    'logisticregression__solver': ['newton-cg'],
    'logisticregression__C': [0.3, 0.5, 0.7, 1],
    'logisticregression__penalty': ['l2']
    },{
    'logisticregression__solver': ['saga'],
    'logisticregression__C': [0.3, 0.5, 0.7, 1],
    'logisticregression__penalty': ['l1','l2']
}]

pipe = make_pipeline(StandardScaler(), LogisticRegression())
# Find best parameters based on scoring of choice
gridsearch = GridSearchCV(estimator=pipe, param_grid=params, n_jobs=-1, scoring='f1_micro', cv=5).fit(X,y)

# Extract best estimator
best = gridsearch.best_estimator_
print("Best parameters:",gridsearch.best_params_)

# Get train accuracy
best = best.fit(X_train, y_train)
train = best.score(X=X_train, y=y_train)
print("\nTrain Accuracy Score:",train)

# Get test accuracy
test = best.score(X=X_val,y=y_val)
print("\nTest Accuracy Score:",test)

y_pred = best.predict(X_val)
f1 = f1_score(y_val, y_pred, average='weighted')
print("\nF1-score': %.3f" % f1)

Best parameters: {'logisticregression__C': 1, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'saga'}

Train Accuracy Score: 0.9842105263157894

Test Accuracy Score: 0.5083333333333333

F1-score': 0.492


In [272]:
final_y_pred = best.predict(X_test)
test_filename = './data/test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    test = json.load(f)
    
for i in range(len(test)):
    test[i]['text'] = test[i]['abstract']
    test[i]['text'].extend(test[i]['title'])
test_df = pd.DataFrame.from_dict(test)

test_df['Predict'] = final_y_pred
result = test_df[['identifier', 'Predict']]
result = result.rename(columns={'identifier':'ID'})
result.to_csv('./results.csv', index=False)

In [273]:
result

Unnamed: 0,ID,Predict
0,0,41
1,1,84
2,2,53
3,3,10
4,4,41
...,...,...
795,795,32
796,796,27
797,797,10
798,798,41
