## Get all your imports

In [2]:
import pickle
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
# Load the English pipeline by its package name. The 'en' shortcut link was
# deprecated in spaCy 2.x and removed in spaCy 3, where spacy.load('en') fails.
# NOTE(review): assumes the 'en' shortcut resolved to en_core_web_sm - confirm.
nlp = spacy.load("en_core_web_sm")

## Define all functions here

In [3]:
# Replace latex math symbols with the special word "math_equation"
def convert_latex_eqn_to_word(text):
  """Swap every `$...$` span in `text` for the placeholder word "math_equation".

  Dollar signs are paired from left to right. When their count is odd, the
  last (unpaired) "$" and everything after it are left untouched. Text with
  fewer than two dollar signs is returned as is.
  """
  pieces = text.split("$")
  dollar_count = len(pieces) - 1
  if dollar_count < 2:
    return text
  # Even-indexed pieces lie outside the delimiters; odd-indexed pieces are the
  # equation bodies that get dropped in favour of the placeholder.
  if dollar_count % 2 == 0:
    return "math_equation".join(pieces[0::2])
  outside = pieces[0:dollar_count:2]
  return "math_equation".join(outside) + "$" + pieces[-1]

def tokenize(text):
  """Run the tokenizer of the module-level spaCy `nlp` object over `text`
  and return the text of each token as a list of strings."""
  token_texts = []
  for token in nlp.tokenizer(text):
    token_texts.append(token.text)
  return token_texts

def dummy(text):
  """Identity function: hand `text` back unchanged.

  Passed as both `preprocessor` and `tokenizer` to the vectorizers in this
  notebook, whose input documents are already lists of tokens.
  """
  return text

## Load all the required files and do a bit of pre-processing

In [4]:
# Folder that holds train.csv and test.csv; change this one line to relocate the data
DATA_DIR = "/content/drive/My Drive/JanataHack_IndependenceDay"
df_train = pd.read_csv(DATA_DIR + "/train.csv")
df_test = pd.read_csv(DATA_DIR + "/test.csv")


def build_token_lists(df):
  """Turn the TITLE and ABSTRACT columns of `df` into one token list per row.

  The same steps are applied to the train and the test frame, so they live in
  a single helper to keep both in sync. Returns a plain Python list with one
  list of token strings per row of `df`; `df` itself is not modified.
  """
  # Concatenate title and abstract
  concat = df["TITLE"].str.cat(df["ABSTRACT"].str.strip(), sep=" ")

  # Trim surrounding whitespace, then lowercase
  concat = concat.str.strip().str.lower()

  # Seems like the data was scraped from a pdf or something cause there are
  # random newline characters inside sentences. Replace them with a space.
  # regex=False with a real "\n" keeps this independent of the pandas version:
  # the old default (regex=True) treated "\\n" as a regex matching a newline,
  # while pandas >= 2.0 would treat it as a literal backslash followed by "n".
  concat = concat.str.replace("\n", " ", regex=False)

  # Looks like latex was used to write these articles as there are latex
  # equations. Replace them with a special token.
  concat = [convert_latex_eqn_to_word(text) for text in concat]

  # Perform tokenization using spacy
  return [tokenize(text) for text in tqdm(concat)]


df_train["Concat"] = build_token_lists(df_train)
df_test["Concat"] = build_token_lists(df_test)

# Convert pandas column to numpy array
X_train = df_train["Concat"].values
X_test = df_test["Concat"].values

# Get the class labels
classes = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]
Y = df_train[classes].values

print("Classes train shape: ", Y.shape)
df_train.head()

HBox(children=(FloatProgress(value=0.0, max=20972.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8989.0), HTML(value='')))


Classes train shape:  (20972, 6)


Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,Concat
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,"[reconstructing, subject, -, specific, effect,..."
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,"[rotation, invariance, neural, network, rotati..."
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,"[spherical, polyharmonics, and, poisson, kerne..."
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,"[a, finite, element, approximation, for, the, ..."
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,"[comparative, study, of, discrete, wavelet, tr..."


## Now we generate probabilities for each class using CountVectorizer

In [5]:
kfold = KFold(n_splits=10, shuffle=True, random_state=27)
scores = list()

# Settings shared by the cross-validation loop and the final fit on all the
# training data, written once so the two cannot drift apart.
count_vectorizer_params = dict(ngram_range=(1,2), min_df=7, stop_words=list(STOP_WORDS), preprocessor=dummy, tokenizer=dummy, lowercase=False)
count_lgbm_params = dict(random_state=27, max_depth=-1, n_estimators=100)

# Out-of-fold class probabilities from the CountVectorizer model, one row per
# training document; each fold fills in the rows it held out.
X_train_count_vec = np.zeros(Y.shape, dtype=np.float32)
for fit_idx, holdout_idx in kfold.split(X_train):
  y_train = Y[fit_idx]
  y_test = Y[holdout_idx]

  # Generate vectors; the vocabulary is learned from the fitting part only
  vectorizer = CountVectorizer(**count_vectorizer_params)
  x_train = vectorizer.fit_transform(X_train[fit_idx])
  x_test = vectorizer.transform(X_train[holdout_idx])

  print(x_train.shape)

  model = OneVsRestClassifier(LGBMClassifier(**count_lgbm_params))
  model.fit(x_train, y_train)
  preds_proba = model.predict_proba(x_test)
  preds = model.predict(x_test)
  X_train_count_vec[holdout_idx] = preds_proba

  score = f1_score(y_test, preds, average="micro")
  print("Score: ", score)
  scores.append(score)
print("\nAverage Score: ", sum(scores)/len(scores))

# Get the test probabilities as well: refit on the full training set and
# predict on the test documents
vectorizer = CountVectorizer(**count_vectorizer_params)
model = OneVsRestClassifier(LGBMClassifier(**count_lgbm_params))
model.fit(vectorizer.fit_transform(X_train), Y)
X_test_count_vec = model.predict_proba(vectorizer.transform(X_test))

## This time, use TfidfVectorizer and add the probabilities generated earlier as extra features

In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=27)
scores = list()


def add_probability_columns(text_features, probabilities):
  """Append dense probability columns to the right of a sparse feature matrix.

  text_features: sparse matrix produced by the vectorizer.
  probabilities: dense 2-D array with the same number of rows.

  The result is a CSR matrix built with scipy.sparse.hstack. The earlier code
  called .toarray() on the TF-IDF matrix, allocating rows x vocabulary dense
  floats just to add a handful of columns, and then converted it back to CSR.
  Stacking the sparse pieces directly yields the same feature values without
  that dense intermediate.
  """
  return sparse.hstack([text_features, sparse.csr_matrix(probabilities)], format="csr")


# Out-of-fold class probabilities from the TF-IDF + count-probability model,
# one row per training document; each fold fills in the rows it held out.
X_train_tfidf_vec = np.zeros(Y.shape, dtype=np.float32)
for train, test in kfold.split(X_train):
  x_train, x_test = X_train[train], X_train[test]
  y_train, y_test = Y[train], Y[test]

  vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=7, stop_words=list(STOP_WORDS), preprocessor=dummy, tokenizer=dummy)
  # TF-IDF features plus the CountVectorizer model's probabilities for the same rows
  x_train = add_probability_columns(vectorizer.fit_transform(x_train), X_train_count_vec[train])
  x_test = add_probability_columns(vectorizer.transform(x_test), X_train_count_vec[test])

  print(x_train.shape)

  model = OneVsRestClassifier(LGBMClassifier(random_state=27, max_depth=-1, n_estimators=100))
  model.fit(x_train, y_train)
  preds_proba = model.predict_proba(x_test)
  preds = model.predict(x_test)
  X_train_tfidf_vec[test] = preds_proba

  score = f1_score(y_test, preds, average="micro")
  print("Score: ", score)
  scores.append(score)
print("\nAverage Score: ", sum(scores)/len(scores))

# Get the test probabilities as well: refit on the full training set and
# predict on the test documents
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=7, stop_words=list(STOP_WORDS), preprocessor=dummy, tokenizer=dummy)
model = OneVsRestClassifier(LGBMClassifier(random_state=27, max_depth=-1, n_estimators=100))
model.fit(add_probability_columns(vectorizer.fit_transform(X_train), X_train_count_vec), Y)
X_test_tfidf_vec = model.predict_proba(add_probability_columns(vectorizer.transform(X_test), X_test_count_vec))

## Finally, write out the predictions. We'll use these later as input to pre-trained models

In [None]:
def write_predictions(path, ids, preds):
    """Write one CSV row per document: its ID followed by the six class probabilities.

    path:  output file name.
    ids:   sequence of document IDs.
    preds: 2-D array of probabilities, one row per ID.

    Raises ValueError when `ids` and `preds` differ in length, because zip()
    would otherwise silently drop the surplus rows.
    """
    if len(ids) != len(preds):
        raise ValueError("Got %d ids but %d prediction rows for %s" % (len(ids), len(preds), path))
    # The with-block closes the file even if a write fails part-way through
    with open(path, "w") as fp:
        fp.write("ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance\n")
        for id_, pred in zip(ids, preds):
            fp.write(str(id_)+","+",".join([str(i) for i in pred])+"\n")


# Write out the train predictions. These must be paired with the *train* IDs:
# the earlier code used df_test["ID"], which labelled the rows with the wrong
# IDs and cut the file off at the length of the test set.
write_predictions("best_tfidf_train.csv", df_train["ID"].values, X_train_tfidf_vec)

# Write out the test predictions
write_predictions("best_tfidf_test.csv", df_test["ID"].values, X_test_tfidf_vec)