In [1]:
import pandas as pd
import numpy as np

In [2]:
#read the labelled training set from disk and preview its first rows
raw_data = pd.read_csv(filepath_or_buffer='train.csv')
raw_data.head(n=5)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [3]:
#number of rows and columns in the loaded training frame
raw_data.shape

(20972, 9)

In [4]:
#look at the title of the first row (index label 0)
raw_data.loc[0, 'TITLE']

'Reconstructing Subject-Specific Effect Maps'

In [9]:
#combine the title and abstract columns into one text feature for the model;
#na_rep="" keeps a row usable when its title or abstract is missing, where a
#plain str.cat would turn the whole combined text into NaN, which the
#vectorizer cannot process
raw_data['text'] = raw_data['TITLE'].str.cat(raw_data['ABSTRACT'], sep=" ", na_rep="")
raw_data.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,Reconstructing Subject-Specific Effect Maps ...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,Rotation Invariance Neural Network Rotation ...
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,Spherical polyharmonics and Poisson kernels fo...
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,A finite element approximation for the stochas...
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,Comparative study of Discrete Wavelet Transfor...


In [10]:
#work on an independent deep copy so later changes to data do not touch raw_data
data = raw_data.copy(deep=True)

In [11]:
#list the current column order before rearranging it for easier reading
np.asarray(data.columns)

array(['ID', 'TITLE', 'ABSTRACT', 'Computer Science', 'Physics',
       'Mathematics', 'Statistics', 'Quantitative Biology',
       'Quantitative Finance', 'text'], dtype=object)

In [12]:
#move the combined text next to the other descriptive columns, ahead of the labels
descriptive_columns = ['ID', 'TITLE', 'ABSTRACT', 'text']
label_columns = ['Computer Science', 'Physics', 'Mathematics', 'Statistics',
                 'Quantitative Biology', 'Quantitative Finance']
data = data[descriptive_columns + label_columns]

In [13]:
#check that the columns now appear in the new order
data.head()

Unnamed: 0,ID,TITLE,ABSTRACT,text,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,Reconstructing Subject-Specific Effect Maps ...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,Rotation Invariance Neural Network Rotation ...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,A finite element approximation for the stochas...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,Comparative study of Discrete Wavelet Transfor...,1,0,0,1,0,0


In [14]:
#TITLE and ABSTRACT are now contained in text, so remove them
redundant_columns = ['TITLE', 'ABSTRACT']
data = data.drop(columns=redundant_columns)
data.head()

Unnamed: 0,ID,text,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps ...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network Rotation ...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,1,0,0,1,0,0


# Naive Bayes model and its accuracy

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [17]:
from sklearn.pipeline import Pipeline
#word counts -> tf-idf weighting -> multinomial naive Bayes classifier
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=pipeline_steps)

In [18]:
#feature is the combined text; first target is the Computer Science flag
X, y = data['text'], data['Computer Science']

In [19]:
from sklearn.model_selection import train_test_split
#hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [20]:
#fit the whole pipeline on the training split; its accuracy is measured afterwards
text_clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [21]:
#accuracy on the held-out split = share of predictions that match the true labels
trial_predictions = text_clf.predict(X_val)
(y_val == trial_predictions).mean()

0.8593563766388558

In [22]:
#read the unlabelled test set that predictions are needed for
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [23]:
#build the same combined title + abstract text feature for the test data;
#na_rep="" keeps a row usable when its title or abstract is missing, where a
#plain str.cat would turn the whole combined text into NaN, which the
#vectorizer cannot process at prediction time
test_data['text'] = test_data['TITLE'].str.cat(test_data['ABSTRACT'], sep=" ", na_rep="")

In [24]:
#TITLE and ABSTRACT are now contained in text, so remove them from the test frame
test_data = test_data.drop(columns=['TITLE', 'ABSTRACT'])

In [25]:
#preview the test frame after TITLE and ABSTRACT were dropped
test_data.head()

Unnamed: 0,ID,text
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...
1,20974,Laboratory mid-IR spectra of equilibrated and ...
2,20975,Case For Static AMSDU Aggregation in WLANs F...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...
4,20977,Witness-Functions versus Interpretation-Functi...


In [26]:
#refit on the full training set before predicting: the pipeline was last fitted
#on the 80% training split only, while every other label is fitted on all of X,
#so this keeps Computer Science consistent and uses every labelled row
text_clf.fit(X, y)
#start the submission frame from the test IDs, then add one column per label
PREDICTIONS = pd.DataFrame({'ID': test_data['ID']})
PREDICTIONS['Computer Science'] = text_clf.predict(test_data['text'])

In [27]:
#preview the first predictions
PREDICTIONS.head()

Unnamed: 0,ID,Computer Science
0,20973,0
1,20974,0
2,20975,1
3,20976,0
4,20977,1


In [28]:
#do the same for the remaining targets: for each label column, refit the
#pipeline on the full training text and store its test predictions under the
#same column name (list order fixes the column order of the submission)
remaining_targets = ['Physics', 'Mathematics', 'Statistics',
                     'Quantitative Biology', 'Quantitative Finance']
for target in remaining_targets:
    #fit() retrains every pipeline step, so nothing carries over between labels
    text_clf.fit(X, data[target])
    PREDICTIONS[target] = text_clf.predict(test_data['text'])

In [33]:
#write the submission file; the index is left out of the csv
PREDICTIONS.to_csv(path_or_buf='hackathon_submission_3.csv', index=False)