# Βιβλιοθήκες

In [1]:
import numpy as np
import pandas as pd

from sklearn import feature_extraction, model_selection, feature_selection, naive_bayes, pipeline, metrics
from sklearn.metrics import classification_report

import gensim
import gensim.downloader as api
from gensim.models.phrases import Phrases

from tensorflow.keras import models, layers, preprocessing as kprocessing

import transformers

# Εισαγωγή δεδομένων και επισκόπησή τους

In [2]:
sample_submission_file = "/kaggle/input/eeestech-challenge-task-3/sample_submission.csv"
train_file = "/kaggle/input/eestech-challenge-task-3/train.csv"
test_file = "/kaggle/input/eestech-challenge-task-3/test.csv"

In [3]:
train_data = pd.read_csv(train_file) 
test_data = pd.read_csv(test_file) 
train_data.head()

Unnamed: 0,pmid,label,abstract,text_clean
0,32716900,Prevention,BACKGROUND: Advance care planning is the proce...,background advance care planning process discu...
1,32464491,Mechanism,Selenium (Se) is a ubiquitous element akin to ...,selenium se ubiquitous element akin sulfur exi...
2,32361001,Mechanism,"During virus infection, host toll-like recepto...",virus infection host tolllike receptor tlrs re...
3,32915888,Prevention,"Globally, little evidence exists on transmissi...",globally little evidence exists transmission p...
4,33004659,Prevention,BACKGROUND: The UK has been one of the Europea...,background uk one european country affected co...


In [4]:
train_data.shape

(20587, 4)

Ας δούμε την περίληψη του άρθρου της πρώτης γραμμής που είναι στην κατηγορία "Prevention".

In [5]:
train_data.iloc[0]['abstract']

"BACKGROUND: Advance care planning is the process of discussing health care treatment preferences based on patients' personal values, and it often involves the completion of advance directives. In the first months of 2020, a novel coronavirus, severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), began circulating widely in the American state of Colorado, leading to widespread diagnosis of coronavirus disease (COVID-19), hospitalizations, and deaths. In this context, the importance of technology-based, non-face-to-face methods to conduct advance care planning via patient portals has increased. OBJECTIVE: The aim of this study was to determine the rates of use of a web-based advance care planning tool through a health system-based electronic patient portal both before and in the early months of the COVID-19 pandemic. METHODS: In 2017, we implemented web-based tools through the patient portal of UCHealth's electronic health record (EHR) for patients to learn about advance care pl

Πράγματι, μιλάει για "Advance care planning" κοκ. Ας κάνουμε και μια άλλη δοκιμή:

In [6]:
print(train_data.iloc[100]['abstract'])
print(train_data.iloc[100]['label'])

This work presents the modeling and prediction of cases of COVID-19 infection in Mexico through mathematical and computational models using only the confirmed cases provided by the daily technical report COVID-19 MEXICO until May 8(th). The mathematical models: Gompertz and Logistic, as well as the computational model: Artificial Neural Network were applied to carry out the modeling of the number of cases of COVID-19 infection from February 27(th) to May 8(th). The results show a good fit between the observed data and those obtained by the Gompertz, Logistic and Artificial Neural Networks models with an R(2) of 0.9998, 0.9996, 0.9999, respectively. The same mathematical models and inverse Artificial Neural Network were applied to predict the number of cases of COVID-19 infection from May 9(th) to 16(th) in order to analyze tendencies and extrapolate the projection until the end of the epidemic. The Gompertz model predicts a total of 47,576 cases, the Logistic model a total of 42,131 ca

In [7]:
test_data.head()

Unnamed: 0,pmid,abstract,text_clean
0,32355107,Coronavirus disease (COVID-19) is responsible ...,coronavirus disease covid19 responsible global...
1,32606823,The outbreak of coronavirus disease 2019 (COVI...,outbreak coronavirus disease 2019 covid19 pand...
2,32931811,OBJECTIVE: To describe the implementation and ...,objective describe implementation result proac...
3,32311451,The coronavirus disease 2019 (COVID-19) was fi...,coronavirus disease 2019 covid19 first reporte...
4,32395672,As the coronavirus disease 2019 pandemic sprea...,coronavirus disease 2019 pandemic spread aroun...


In [8]:
test_data.shape

(6862, 3)

In [9]:
## split dataset
dtf_train, dtf_test = model_selection.train_test_split(train_data, test_size=0.3)
## get target
y_train = dtf_train["label"].values
y_test = dtf_test["label"].values

# Bag-of-Words
Η παραδοσιακή μέθοδος NLP μετατροπής κειμένων σε διανύσματα.

In [10]:
## Count (classic BoW)
vectorizer = feature_extraction.text.CountVectorizer(max_features=10000, ngram_range=(1,2))

## Tf-Idf (advanced variant of BoW)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [11]:
corpus = dtf_train["text_clean"].astype('U')
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)

In [12]:
print(corpus)

17628    coronavirus disease2019 covid19 severe impact ...
17506    objective patient covid19 may present respirat...
8780     emergence highly transmissible sarscov2 varian...
11991    background evaluate diagnostic efficacy densel...
13829    abstrakcovid19 telah menjadi pandemik di indon...
                               ...                        
18201    background severe acute respiratory syndrome s...
3958     coronaviruses covs large family virus cause di...
14776    objective understanding novel coronavirus covi...
12113    coronavirus disease 19 covid19 turned pandemic...
8209     loss smell taste common complaint patient covi...
Name: text_clean, Length: 14410, dtype: object


In [13]:
print(X_train)

  (0, 9956)	0.08292752908096584
  (0, 9910)	0.07052778679783324
  (0, 9653)	0.11837947385409926
  (0, 9562)	0.07307626899055777
  (0, 9513)	0.2139283933562384
  (0, 9331)	0.06425346108992852
  (0, 9325)	0.12010891088149392
  (0, 9014)	0.12791432325274424
  (0, 9013)	0.16428721570070587
  (0, 8991)	0.08554338173685706
  (0, 8944)	0.11544721050644496
  (0, 8872)	0.13297070199925592
  (0, 8845)	0.09623016618457027
  (0, 8843)	0.07830733545429687
  (0, 8678)	0.11603074422565053
  (0, 8433)	0.12607913829552625
  (0, 8412)	0.22510987892200926
  (0, 8093)	0.06530658623737133
  (0, 7893)	0.10346968202523227
  (0, 7658)	0.08283893662232333
  (0, 7264)	0.08664797917235892
  (0, 6910)	0.09071246314352296
  (0, 6771)	0.13400797307671944
  (0, 6616)	0.03194367465524602
  (0, 6539)	0.1411424784793411
  :	:
  (14409, 1651)	0.04241386503328609
  (14409, 1650)	0.04005008888383883
  (14409, 1627)	0.07618108448013651
  (14409, 1530)	0.044962372942633035
  (14409, 1520)	0.0430050241798329
  (14409, 1299)	

In [14]:
# print(y_train)

## Ένας απλός ταξινομητής

In [15]:
# classifier = naive_bayes.MultinomialNB()

In [16]:
# ## pipeline
# model = pipeline.Pipeline([("vectorizer", vectorizer),  
#                            ("classifier", classifier)])
# ## train classifier
# model["classifier"].fit(X_train, y_train)
# ## test
# X_test = dtf_test["text_clean"].values.astype('U')
# predicted = model.predict(X_test)

In [17]:
# print(classification_report(y_test, predicted, zero_division=0))

In [18]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', C = 1, cache_size = 1000).fit(X_train, y_train)
## pipeline
model = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifier)])
## train classifier
model["classifier"].fit(X_train, y_train)
## test
X_test = dtf_test["text_clean"].values.astype('U')
predicted = model.predict(X_test)

In [19]:
print(classification_report(y_test, predicted, zero_division=0))

                      precision    recall  f1-score   support

         Case Report       0.84      0.75      0.79       491
           Diagnosis       0.73      0.69      0.71       927
Epidemic Forecasting       0.64      0.31      0.42        97
           Mechanism       0.65      0.36      0.47       387
          Prevention       0.87      0.94      0.91      2429
        Transmission       0.64      0.27      0.38       113
           Treatment       0.76      0.85      0.81      1733

            accuracy                           0.80      6177
           macro avg       0.73      0.60      0.64      6177
        weighted avg       0.80      0.80      0.79      6177



## Πρόβλεψη
Στα test data πλέον.

In [20]:
final_test_X = test_data["text_clean"].astype('U')

# Υποβολή

In [21]:
final_predictions = model.predict(final_test_X)
df_final_predictions = pd.DataFrame(final_predictions)
df_final_predictions.columns=['label']
df_ids = test_data[["pmid"]].copy()
submission_concat = pd.concat([df_ids, df_final_predictions], axis=1)
mapping = {'Case Report': 0, 'Diagnosis': 1, 'Epidemic Forecasting': 2, 'Mechanism': 3, 'Prevention': 4, 'Transmission': 5, 'Treatment': 6}
submission_df=submission_concat.replace({"label": mapping})
submission_df.to_csv('submission.csv', index=False)

In [22]:
submission_df

Unnamed: 0,pmid,label
0,32355107,4
1,32606823,6
2,32931811,4
3,32311451,6
4,32395672,4
...,...,...
6857,34226887,4
6858,34216472,6
6859,32826763,4
6860,32793973,4
