In [2]:
# %%time

# from google.colab import files

# uploaded = files.upload()

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict, Counter

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any library deprecation or convergence warnings - confirm that
# is intended.
warnings.filterwarnings('ignore')

# Download the NLTK stop word corpus and keep the English list as a set;
# cleanup_df uses it for membership tests when dropping stop words.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

  import pandas.util.testing as tm


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# # import tensorflow as tf

# !pip install -q tensorflow-hub
# !pip install -q tensorflow-datasets
# import tensorflow_hub as hub
# import tensorflow_datasets as tfds

# from keras.losses import categorical_crossentropy
# from keras.optimizers import Adam
# from keras.callbacks import EarlyStopping

In [5]:
# Load both splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## Text Cleanup

In [6]:
def cleanup_df(df, column, stopword_set=None):
  """Normalise a free-text column and strip stop words from it.

  For every cell of ``df[column]`` the text is lower-cased, newlines and
  ``/`` are turned into spaces, parentheses are removed, and the result is
  split on single spaces. Tokens are dropped when they are empty, start
  with a digit, or appear in the stop word set. The surviving tokens are
  re-joined with single spaces.

  Args:
    df: DataFrame holding the text column. ``df[column]`` is overwritten
      in place.
    column: Name of the text column to clean.
    stopword_set: Optional collection of tokens to drop. When omitted, the
      notebook-level ``stop_words`` set is used.

  Returns:
    The same ``df`` object, with ``df[column]`` replaced by the cleaned text.
  """
  blocked = stop_words if stopword_set is None else stopword_set
  # .match() anchors at the start, so this drops any token that *begins*
  # with a digit (e.g. "123" and "3d"), not only purely numeric tokens.
  leading_digits = re.compile(r'[0-9]+')

  def _clean(text):
    # Missing cells (None / NaN) have no .replace(); treat them as empty text.
    if not isinstance(text, str):
      return ''
    normalised = (text.replace('\n', ' ')
                      .replace('(', '').replace(')', '')
                      .replace('/', ' ')
                      .lower())
    kept = [token for token in normalised.split(' ')
            if token                            # consecutive spaces give ''
            and not leading_digits.match(token)
            and token not in blocked]
    return ' '.join(kept)

  # One pass over the column instead of one .apply per cleaning step.
  df[column] = df[column].apply(_clean)

  return df

In [7]:
# Clean the ABSTRACT column of both splits (TITLE is left untouched here).
train, test = (cleanup_df(frame, 'ABSTRACT') for frame in (train, test))

In [8]:
# Model input for each split: "<title>. <abstract>", fully lower-cased.
for frame in (train, test):
  title_and_abstract = frame['TITLE'].values + '. ' + frame['ABSTRACT'].values
  frame['all_text'] = [text.lower() for text in title_and_abstract]

In [9]:
# Spot-check one combined title + abstract string (row label 2).
train.loc[2, 'all_text']

'spherical polyharmonics and poisson kernels for polyharmonic functions. introduce develop notion spherical polyharmonics, natural generalisation spherical harmonics. particular study theory zonal polyharmonics, allows us, analogously zonal harmonics, construct poisson kernels polyharmonic functions union rotated balls. find representation poisson kernels zonal polyharmonics terms gegenbauer polynomials. show connection classical poisson kernel harmonic functions ball, poisson kernels polyharmonic functions union rotated balls, cauchy-hua kernel holomorphic functions lie ball.'

In [10]:
# all_words = []
# for i in range(0, len(train)):
#   tmp = train['all_text'][i].split(' ')
#   for l in range(0, len(tmp)):
#     all_words.append(tmp[l])

# len(all_words)

In [11]:
# c = Counter(all_words)
# c.most_common()[600:1000]

## Naive Bayes Classifier

In [12]:
# Label columns; the classifier helper selects one of them by position.
target_cols = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

def naivebayesclassifier(train, test, column_index):
  """Fit a bag-of-words Multinomial Naive Bayes model for one label column.

  The ``all_text`` column of ``train`` is split into a fitting part and a
  hold-out part (``train_test_split`` defaults, ``random_state=1``). A
  ``CountVectorizer`` and ``MultinomialNB`` are fitted on the fitting part
  only; accuracy, precision and recall on the hold-out part are printed.
  The same fitted vectorizer and model are then applied to ``test``.

  Args:
    train: DataFrame with an ``all_text`` column and the label columns
      named in ``target_cols``.
    test: DataFrame with an ``all_text`` column.
    column_index: Position in ``target_cols`` of the label to model.

  Returns:
    pd.Series of predicted labels for ``test``, carrying ``test``'s index.
  """
  label = target_cols[column_index]
  X_train, X_test, y_train, y_test = train_test_split(train['all_text'],
                                                      train[label],
                                                      random_state=1)
  cv = CountVectorizer()

  # Vocabulary is learned from the fitting part only, then reused as-is.
  X_train_cv = cv.fit_transform(X_train)
  X_test_cv = cv.transform(X_test)

  nb = MultinomialNB()
  nb.fit(X_train_cv, y_train)
  predictions = nb.predict(X_test_cv)

  print('Accuracy score: ', accuracy_score(y_test, predictions))
  print('Precision score: ', precision_score(y_test, predictions))
  print('Recall score: ', recall_score(y_test, predictions))

  df_test = cv.transform(test['all_text'])
  test_pred = nb.predict(df_test)

  # Reuse test's index so a later pd.concat(..., axis=1) with test['ID']
  # lines rows up even when test does not have a default 0..n-1 index.
  return pd.Series(test_pred, index=test.index)

In [13]:
nb_pred_0 = naivebayesclassifier(train, test, column_index=0)  # Computer Science

Accuracy score:  0.8502765592218196
Precision score:  0.7645891226677253
Recall score:  0.9093484419263456


In [14]:
nb_pred_1 = naivebayesclassifier(train, test, column_index=1)  # Physics

Accuracy score:  0.9300019073049781
Precision score:  0.9074204946996467
Recall score:  0.8447368421052631


In [15]:
nb_pred_2 = naivebayesclassifier(train, test, column_index=2)  # Mathematics

Accuracy score:  0.8880411977875262
Precision score:  0.7705345501955672
Recall score:  0.8341566690190544


In [16]:
nb_pred_3 = naivebayesclassifier(train, test, column_index=3)  # Statistics

Accuracy score:  0.8447453747854282
Precision score:  0.6203353163872364
Recall score:  0.9110405083399523


In [17]:
nb_pred_4 = naivebayesclassifier(train, test, column_index=4)  # Quantitative Biology

Accuracy score:  0.9746328437917223
Precision score:  0.6329113924050633
Recall score:  0.3246753246753247


In [18]:
nb_pred_5 = naivebayesclassifier(train, test, column_index=5)  # Quantitative Finance

Accuracy score:  0.9906542056074766
Precision score:  0.9411764705882353
Recall score:  0.25


In [19]:
# Submission frame: ID plus one Naive Bayes prediction column per category.
nb_test_preds = pd.concat(
    [test['ID'], nb_pred_0, nb_pred_1, nb_pred_2, nb_pred_3, nb_pred_4, nb_pred_5],
    axis=1,
)
nb_test_preds.columns = ['ID', *target_cols]

In [20]:
# Write the Naive Bayes predictions without the DataFrame index column.
nb_test_preds.to_csv(path_or_buf='nbcountvecclassifier_08222020.csv', index=False)

## Logistic Regression with OVR

In [21]:
# Label columns for this section; the classifier helper picks one by position.
target_cols = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

def logisticclassifier(train, test, column_index):
  """Fit a bag-of-words logistic regression model for one label column.

  The ``all_text`` column of ``train`` is split into a fitting part and a
  hold-out part (``train_test_split`` defaults, ``random_state=1``). A
  ``CountVectorizer`` and ``LogisticRegression`` are fitted on the fitting
  part only; accuracy, precision and recall on the hold-out part are
  printed. The same fitted vectorizer and model are then applied to ``test``.

  Args:
    train: DataFrame with an ``all_text`` column and the label columns
      named in ``target_cols``.
    test: DataFrame with an ``all_text`` column.
    column_index: Position in ``target_cols`` of the label to model.

  Returns:
    pd.Series of predicted labels for ``test``, carrying ``test``'s index.
  """
  label = target_cols[column_index]
  X_train, X_test, y_train, y_test = train_test_split(train['all_text'],
                                                      train[label],
                                                      random_state=1)
  cv = CountVectorizer()

  # Vocabulary is learned from the fitting part only, then reused as-is.
  X_train_cv = cv.fit_transform(X_train)
  X_test_cv = cv.transform(X_test)

  # One model is fitted per label column, so each fit is a two-class problem
  # (the default binary precision/recall below require that). The former
  # multi_class='ovr' argument is deprecated in recent scikit-learn and
  # changes nothing for a two-class target, so it is omitted.
  # NOTE(review): default iteration limit is used - confirm the solver
  # converges on these count features.
  lr = LogisticRegression()
  lr.fit(X_train_cv, y_train)
  predictions = lr.predict(X_test_cv)

  print('Accuracy score: ', accuracy_score(y_test, predictions))
  print('Precision score: ', precision_score(y_test, predictions))
  print('Recall score: ', recall_score(y_test, predictions))

  df_test = cv.transform(test['all_text'])
  test_pred = lr.predict(df_test)

  # Reuse test's index so a later pd.concat(..., axis=1) with test['ID']
  # lines rows up even when test does not have a default 0..n-1 index.
  return pd.Series(test_pred, index=test.index)

In [22]:
lr_pred_0 = logisticclassifier(train, test, column_index=0)  # Computer Science

Accuracy score:  0.8369254243753577
Precision score:  0.8017200191113235
Recall score:  0.7922568460812087


In [23]:
lr_pred_1 = logisticclassifier(train, test, column_index=1)  # Physics

Accuracy score:  0.9254243753576197
Precision score:  0.8939288206559665
Recall score:  0.8427631578947369


In [24]:
lr_pred_2 = logisticclassifier(train, test, column_index=2)  # Mathematics

Accuracy score:  0.8891855807743658
Precision score:  0.8051094890510949
Recall score:  0.7784050811573747


In [25]:
lr_pred_3 = logisticclassifier(train, test, column_index=3)  # Statistics

Accuracy score:  0.8630555025748617
Precision score:  0.7193836171938361
Recall score:  0.704527402700556


In [26]:
lr_pred_4 = logisticclassifier(train, test, column_index=4)  # Quantitative Biology

Accuracy score:  0.9721533473202365
Precision score:  0.5487804878048781
Recall score:  0.2922077922077922


In [27]:
lr_pred_5 = logisticclassifier(train, test, column_index=5)  # Quantitative Finance

Accuracy score:  0.9908449361052832
Precision score:  0.6904761904761905
Recall score:  0.453125


In [28]:
# Submission frame: ID plus one logistic regression prediction column per category.
lr_test_preds = pd.concat(
    [test['ID'], lr_pred_0, lr_pred_1, lr_pred_2, lr_pred_3, lr_pred_4, lr_pred_5],
    axis=1,
)
lr_test_preds.columns = ['ID', *target_cols]

In [29]:
# Write the logistic regression predictions without the DataFrame index column.
lr_test_preds.to_csv(path_or_buf='lrcountvecclassifier_08222020.csv', index=False)

In [30]:
# Mixed submission: Naive Bayes predictions for the first five categories and
# the logistic regression prediction for Quantitative Finance.
# NOTE(review): presumably chosen from the hold-out scores printed above - confirm.
combo_test_preds = pd.concat(
    [test['ID'], nb_pred_0, nb_pred_1, nb_pred_2, nb_pred_3, nb_pred_4, lr_pred_5],
    axis=1,
)
combo_test_preds.columns = ['ID', *target_cols]

combo_test_preds.to_csv(path_or_buf='combocountvecclassifier_08222020.csv', index=False)